### Reading the Dataset

In [15]:
import os

# Location of the toy housing dataset used throughout this section.
data_file = "data/house_tiny.csv"
os.makedirs("data", exist_ok=True)

# One CSV record per entry. The records are joined with bare newlines and
# carry no leading whitespace: if each row were indented (e.g. with a tab),
# the "NA" markers would no longer be read as missing values.
csv_rows = [
    "NumRooms,RoofType,Price",
    "NA,NA,127500",
    "2,NA,106000",
    "4,Slate,178100",
    "NA,NA,140000",
]

with open(data_file, "w") as f:
    f.write("\n".join(csv_rows))

In [16]:
import pandas

# Parse the CSV written earlier. pandas' default missing-value markers are
# kept enabled, so the "NA" cells are loaded as NaN.
data = pandas.read_csv(
    filepath_or_buffer=data_file,
    keep_default_na=True,
)
data

Unnamed: 0,NumRooms,RoofType,Price
0,,,127500
1,2.0,,106000
2,4.0,Slate,178100
3,,,140000


### Data preparation

Conversion of RoofType into the indicator columns RoofType_Slate and RoofType_nan (dealing with missing values)

In [17]:
# Positional split with iloc[rows, columns]:
#   rows    ":"  -> every row
#   columns ":2" -> positions 0 and 1 (the stop position is NOT included)
#   column  "2"  -> the single remaining column
# A selection of more than one column also shows the column names when printed.
inputs = data.iloc[:, :2]
targets = data.iloc[:, 2]

# Same text as three separate prints: inputs, two blank lines' worth of
# newlines, then targets.
print(inputs, "\n", targets, sep="\n")

   NumRooms RoofType
0       NaN      NaN
1       2.0      NaN
2       4.0    Slate
3       NaN      NaN


0    127500
1    106000
2    178100
3    140000
Name: Price, dtype: int64


For categorical input fields, we can treat NaN as a category.

Since the RoofType column takes values Slate and NaN, pandas can convert this column into two columns RoofType_Slate and RoofType_nan.

In [18]:
# get_dummies turns categorical columns into indicator (dummy) columns.
# dummy_na=True adds an extra indicator column for missing values, so NaN is
# treated as a category of its own.
inputs = pandas.get_dummies(data=inputs, dummy_na=True)
inputs

Unnamed: 0,NumRooms,RoofType_Slate,RoofType_nan
0,,False,True
1,2.0,False,True
2,4.0,True,False
3,,False,True


For missing numerical values, one common heuristic is to replace the NaN entries with the mean value of the corresponding column.

In [19]:
# Replace the remaining NaN entries with the mean of their own column.
column_means = inputs.mean()
inputs = inputs.fillna(value=column_means)
inputs

Unnamed: 0,NumRooms,RoofType_Slate,RoofType_nan
0,3.0,False,True
1,2.0,False,True
2,4.0,True,False
3,3.0,False,True


### Conversion to tensor

Now that all the entries in inputs and targets are numerical (boolean values can be represented as 1 and 0), we can load them into a tensor.

In [20]:
import torch

# Go through NumPy first: dtype=float casts every column (including the
# boolean indicator columns) to floating point before the tensors are built.
inputs_array = inputs.to_numpy(dtype=float)
targets_array = targets.to_numpy(dtype=float)

X, Y = torch.tensor(inputs_array), torch.tensor(targets_array)
X, Y

(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))

### Exercises

In [33]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id=1 from the UCI Machine Learning Repository (needs network
# access).
abalone = fetch_ucirepo(id=1)

# `abalone.data` is a container of several frames (features, targets, ...),
# not a DataFrame itself, so it is deliberately left without a DataFrame
# annotation.
data = abalone.data
X: pandas.DataFrame = data.features
Y = data.targets
metadata = abalone.metadata
# The per-variable description table is exposed by the fetched result under
# `variables`; there is no top-level `features` entry, so `abalone.features`
# does not yield that table. The name `features` is kept for later cells.
features = abalone.variables

print(X)
print(Y)
print(metadata)
print(features)

     Sex  Length  Diameter  Height  Whole_weight  Shucked_weight  \
0      M   0.455     0.365   0.095        0.5140          0.2245   
1      M   0.350     0.265   0.090        0.2255          0.0995   
2      F   0.530     0.420   0.135        0.6770          0.2565   
3      M   0.440     0.365   0.125        0.5160          0.2155   
4      I   0.330     0.255   0.080        0.2050          0.0895   
...   ..     ...       ...     ...           ...             ...   
4172   F   0.565     0.450   0.165        0.8870          0.3700   
4173   M   0.590     0.440   0.135        0.9660          0.4390   
4174   M   0.600     0.475   0.205        1.1760          0.5255   
4175   F   0.625     0.485   0.150        1.0945          0.5310   
4176   M   0.710     0.555   0.195        1.9485          0.9455   

      Viscera_weight  Shell_weight  
0             0.1010        0.1500  
1             0.0485        0.0700  
2             0.1415        0.2100  
3             0.1140        0.1550 

Check for null values

In [34]:
# Element-wise mask: True wherever a feature value is missing.
pandas.isnull(X)

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
4172,False,False,False,False,False,False,False,False
4173,False,False,False,False,False,False,False,False
4174,False,False,False,False,False,False,False,False
4175,False,False,False,False,False,False,False,False


In [35]:
# Same missing-value mask, as a plain NumPy boolean array.
X.isnull().to_numpy()

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [36]:
# Collapse the mask to a single flag: is any feature value missing at all?
X.isnull().to_numpy().any()

False

Check for N/A values (in pandas, `isna()` is an alias of `isnull()`, so the results below match the null check above)

In [37]:
# Element-wise mask of NA entries in the feature frame.
pandas.isna(X)

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
4172,False,False,False,False,False,False,False,False
4173,False,False,False,False,False,False,False,False
4174,False,False,False,False,False,False,False,False
4175,False,False,False,False,False,False,False,False


In [38]:
# The NA mask as a plain NumPy boolean array.
X.isna().to_numpy()

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [39]:
# Single flag: True if at least one feature value is NA.
X.isna().to_numpy().any()

False