In [1]:
import torch

# Record which PyTorch build produced the outputs shown in this notebook.
print(torch.__version__)

2.7.0+cpu


<p style="font-family:ComicSansMS; font-size: 30px; color: magenta"> 2.2. Data Preprocessing</p>

<p style="font-family:ComicSansMS; font-size: 24px; color: orange"> 2.2.1. Reading the Dataset</p>

In [2]:
import os

# Create ../data/house_tiny.csv, a four-row toy dataset in which the literal
# text "NA" marks a missing entry.
data_file = os.path.join('..', 'data', 'house_tiny.csv')
os.makedirs(os.path.dirname(data_file), exist_ok=True)
with open(data_file, 'w') as csv_out:
    csv_out.write('''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000''')

In [3]:
import pandas as pd

# Parse the CSV written above; pandas reads the literal "NA" fields as NaN.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0      NaN  106000
2       4.0    Slate  178100
3       NaN      NaN  140000


<p style="font-family:ComicSansMS; font-size: 24px; color: orange"> 2.2.2. Data Preparation</p>

In [4]:
# The third column (Price) is the prediction target.
targets = data.iloc[:, 2]
# The first two columns are the features. One-hot encode RoofType;
# dummy_na=True adds a RoofType_nan indicator column so that a missing
# roof type is kept as a category of its own.
inputs = pd.get_dummies(data.iloc[:, :2], dummy_na=True)
print(inputs)

   NumRooms  RoofType_Slate  RoofType_nan
0       NaN           False          True
1       2.0           False          True
2       4.0            True         False
3       NaN           False          True


In [5]:
# A common heuristic for missing numerical values: substitute the mean of the
# column they belong to. Only NumRooms holds NaN here, so the indicator
# columns come through unchanged.
inputs = inputs.fillna(value=inputs.mean())
print(inputs)

   NumRooms  RoofType_Slate  RoofType_nan
0       3.0           False          True
1       2.0           False          True
2       4.0            True         False
3       3.0           False          True


<p style="font-family:ComicSansMS; font-size: 24px; color: orange"> 2.2.3. Conversion to the Tensor Format</p>

In [7]:
# Every entry of inputs and targets is numerical at this point, so each one is
# exported as a float NumPy array and wrapped in a tensor.
X, y = (torch.tensor(frame.to_numpy(dtype=float)) for frame in (inputs, targets))
X, y

(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))

<p style="font-family:ComicSansMS; font-size: 24px; color: orange"> 2.2.5. Exercises</p>

In [19]:
# 1. Try loading datasets, e.g., Abalone from the UCI Machine Learning Repository
# and inspect their properties.
#
# The file loaded here is the Bike Sharing "day" table rather than Abalone.
# '?' and '--' are treated as additional missing-value markers while parsing.
# The path is assembled with os.path.join, like the house_tiny.csv path above,
# so it does not hard-code a path separator.
bike_file = os.path.join('..', 'data', 'Bike Sharing day.csv')
df = pd.read_csv(bike_file, na_values=['?', '--'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


In [20]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [22]:
# Number of non-null entries per column (axis=0 aggregates down the rows).
df.count(axis=0)

instant       731
dteday        731
season        731
yr            731
mnth          731
holiday       731
weekday       731
workingday    731
weathersit    731
temp          731
atemp         731
hum           731
windspeed     731
casual        731
registered    731
cnt           731
dtype: int64

In [23]:
# 2. Try indexing and selecting data columns by name rather than by column number. 
# The pandas documentation on indexing has further details on how to do this.

In [24]:
# Build a small table of people to practise label-based selection on.
# The records get their own name instead of reusing `data`, which an earlier
# cell bound to the house-price DataFrame that its preprocessing steps read.
people = {'Name': ['John', 'Anna', 'Peter', 'Linda'],
          'Age': [28, 34, 29, 32],
          'City': ['New York', 'Paris', 'Berlin', 'London']}
df = pd.DataFrame(people)

In [None]:
# Example 1: Basic Selection
# A single row label passed to .loc returns that row as a Series whose index
# holds the column names.
print(df.loc[0])

Name        John
Age           28
City    New York
Name: 0, dtype: object


In [26]:
# Example 2: Select Multiple Rows
# A list of row labels returns a DataFrame containing just those rows.
print(df.loc[[0, 2]])

    Name  Age      City
0   John   28  New York
2  Peter   29    Berlin


In [27]:
# Example 3: Slicing Rows
# A .loc slice is label-based and includes the end label, so 1:3 yields
# rows 1, 2 and 3.
print(df.loc[1:3])

    Name  Age    City
1   Anna   34   Paris
2  Peter   29  Berlin
3  Linda   32  London


In [28]:
# Example 4: Selecting Rows and Columns
# A scalar row label plus a scalar column label returns the single cell value.
print(df.loc[0, 'Name'])
# Lists of row and column labels return the matching sub-DataFrame.
print(df.loc[[1, 3], ['Name', 'City']])

John
    Name    City
1   Anna   Paris
3  Linda  London


In [29]:
# Example 5: Conditional Selection
# The comparison produces a boolean Series; .loc keeps the rows where it is True.
print(df.loc[df['Age'] > 30])

    Name  Age    City
1   Anna   34   Paris
3  Linda   32  London


In [30]:
# Example 6: Setting Values
# Assigning through .loc[row, column] modifies df in place: every cell executed
# after this one sees John's Age as 29 instead of 28.
df.loc[0, 'Age'] = 29
print(df)

    Name  Age      City
0   John   29  New York
1   Anna   34     Paris
2  Peter   29    Berlin
3  Linda   32    London


In [None]:
# 3. How large a dataset do you think you could load this way? What might be the limitations? 
# Hint: consider the time to read the data, representation, processing, and memory footprint. 
# Try this out on your laptop. What happens if you try it out on a server?



In [31]:

# 4. How would you deal with data that has a very large number of categories?
# What if the category labels are all unique? Should you include the latter?
#
# NOTE(review): the examples below inspect row labels (the DataFrame index), not
# categorical feature columns -- confirm this matches the intent of the exercise.
#
# Example 1: A Small Frame with Default Row Labels
df = pd.DataFrame(
    [('Alice', 25), ('Bob', 30), ('Charlie', 35)],
    columns=['Name', 'Age'],
)
print(df)

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [32]:
# Example 2: Displaying the Row Labels
# No index was supplied when df was built, so pandas reports a default RangeIndex.
print(df.index)

RangeIndex(start=0, stop=3, step=1)


In [33]:
# Example 3: Listing Unique Row Labels
# The labels 0, 1, 2 are already distinct, so unique() returns the same RangeIndex.
print(df.index.unique())

RangeIndex(start=0, stop=3, step=1)


In [34]:
# Example 4: Using a Conditional to Filter Row Labels
# Build the Gender-indexed frame with one non-mutating chain instead of
# inplace=True calls. drop=True discards the old row labels rather than
# copying them into a stray 'index' column, which nothing here reads.
df = (
    df.reset_index(drop=True)
      .assign(Gender=['F', 'M', 'M'])
      .set_index('Gender')
)

# Comparing the index with 'M' gives a boolean mask; indexing the Index with
# that mask keeps only the matching labels.
print(df.index[df.index == 'M'])

Index(['M', 'M'], dtype='object', name='Gender')


In [35]:
# Rebuild the small people table and list its column labels.
# The records get their own name instead of reusing `data`, which an earlier
# cell bound to the house-price DataFrame that its preprocessing steps read.
people = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 34, 29, 32],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
}
df = pd.DataFrame(people)
print(df.columns)

Index(['Name', 'Age', 'City'], dtype='object')


In [36]:
# Iterating Over Columns
# Looping over df.columns yields each column label in order.
for column_name in df.columns:
    print(column_name)

Name
Age
City


In [None]:
# 5. What alternatives to pandas can you think of? How about loading NumPy tensors from a file? 
# Check out Pillow, the Python Imaging Library.