In [1]:
!python3 -m pip install --user scikit-learn
!pip install torch



In [2]:
# You can use any Python source file as a module by executing an import statement in some other Python source file
# The import statement combines two operations; it searches for the named module, then it binds the
# results of that search to a name in the local scope.
import os
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Record the installed torch version alongside the results of this run.
print("torch version: ",torch.__version__)

torch version:  2.1.0+cu118


In [41]:
# Colab-only: mount Google Drive so the project folder is reachable from the VM.
from google.colab import drive

drive.mount('/content/gdrive')
# Work from the project folder; the relative CSV path used when loading the
# data resolves against this directory.
os.chdir("/content/gdrive/MyDrive/Colab Notebooks/tf-torch")
# Print the working directory to confirm the chdir took effect.
!pwd

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Colab Notebooks/tf-torch


In [42]:
# Load the pre-processed toy housing table (path is relative to the current
# working directory). Rows that cannot be parsed are skipped rather than
# raising an error.
housing_df = pd.read_csv(
    'data/housing_pre-proc_toy.csv',
    on_bad_lines='skip',
)
# Optional quick looks at the loaded frame (uncomment as needed):
# housing_df.head()
# housing_df.info()
# housing_df.describe().transpose()

In [40]:
# Split the dataset into train, validation, and test sets.
# A fixed seed makes the shuffling deterministic, so a fresh
# "Restart & Run All" yields the same three splits every time.
SPLIT_SEED = 42

# Hold out 20% of all rows as the test set.
train_val, test = train_test_split(housing_df, test_size=0.2, random_state=SPLIT_SEED)
# Take 20% of the remaining rows (16% of the full dataset) for validation.
train, val = train_test_split(train_val, test_size=0.2, random_state=SPLIT_SEED)

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

1600 train examples
400 validation examples
500 test examples


In [54]:
class DFDataset(Dataset):
    '''Map-style dataset that serves the rows of a pandas DataFrame as tensors.

    Each sample is a dict mapping column name -> scalar tensor, optionally
    paired with a float32 label taken from ``label_col``.

    Numeric columns are min-max scaled to [0, 1] and object columns are
    integer-encoded. NOTE: the scaling statistics and the label encodings are
    computed from the dataframe passed in, so separately constructed datasets
    (e.g. train and validation splits) are each scaled independently.
    '''

    def __init__(self, dataframe, label_col=None):
        '''Initialize the dataset with a dataframe and optional label column.

        Args:
            dataframe: source ``pandas.DataFrame``. It is copied, so the
                caller's frame is never modified.
            label_col: name of the target column, or ``None`` for an
                unlabeled dataset.
        '''
        self.dataframe = dataframe.copy()
        self.label_col = label_col
        # Always define `labels` so the attribute exists on unlabeled datasets.
        self.labels = None
        # Compare against None instead of relying on truthiness, so a falsy
        # column name (e.g. the integer 0) is still honored as the label.
        if label_col is not None:
            self.labels = torch.tensor(self.dataframe.pop(label_col).values, dtype=torch.float32)
        # Feature engineering (scaling / encoding) happens in place on the copy.
        self._feature_engineering()
        # One tensor per remaining column, keyed by column name.
        self.features = {key: torch.tensor(value.values) for key, value in self.dataframe.items()}

    def _feature_engineering(self):
        '''Scale numeric columns to [0, 1] and integer-encode object columns.'''
        # Snapshot the column names because columns are reassigned in the loop.
        for col in list(self.dataframe.columns):
            series = self.dataframe[col]
            kind = series.dtype.kind
            if kind in 'biufc':  # Check if the column is numeric
                if kind == 'b':
                    # Boolean subtraction is rejected by numpy/pandas, so work
                    # on a float view of the flags instead.
                    series = series.astype('float64')
                mini = series.min()
                maxi = series.max()
                span = maxi - mini
                # A constant column has zero range; dividing by it would fill
                # the feature with NaN. Divide by 1 instead so it maps to 0.
                scale = span if span != 0 else 1
                self.dataframe[col] = (series - mini) / scale
            # Convert categorical columns (object type) to integer labels
            elif series.dtype == object:
                le = LabelEncoder()
                self.dataframe[col] = le.fit_transform(series)

    def __len__(self):
        '''Return the total number of samples in the dataset.'''
        # Use the row count of the frame rather than the first feature tensor,
        # which does not exist when the frame has no feature columns.
        return len(self.dataframe)

    def __getitem__(self, idx):
        '''Return the features and optional label for a given index.

        Returns:
            ``(features, label)`` when a label column was given, otherwise
            just ``features``; ``features`` maps column name -> value at idx.
        '''
        sample = {key: val[idx] for key, val in self.features.items()}
        if self.labels is not None:
            return sample, self.labels[idx]
        return sample

In [55]:
def df_to_dataset(dataframe, label_col='median_house_value', shuffle=True, batch_size=32):
    '''Build a ``DataLoader`` that batches the rows of ``dataframe``.

    Args:
        dataframe: pandas DataFrame to wrap in a ``DFDataset``.
        label_col: column handed to ``DFDataset`` as the label column.
        shuffle: forwarded to the ``DataLoader``.
        batch_size: forwarded to the ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``DFDataset`` built from ``dataframe``.
    '''
    return DataLoader(
        DFDataset(dataframe, label_col=label_col),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [56]:
batch_size = 32
# Pass batch_size explicitly: df_to_dataset has its own default, so without
# this argument changing the value above would have no effect on the loaders.
train_ds = df_to_dataset(train, batch_size=batch_size)
# Validation data is not shuffled.
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)

In [57]:
# TODO 1b
# Pull a single batch from the training loader and show what it contains:
# the feature names, two example feature tensors, and the targets.
first_batch = next(iter(train_ds))
feature_batch, label_batch = first_batch

print('Every feature:', list(feature_batch))
print('A batch of households:', feature_batch['households'])
print('A batch of ocean_proximity:', feature_batch['ocean_proximity'])
print('A batch of targets:', label_batch)

Every feature: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']
A batch of households: tensor([0.1176, 0.0616, 0.0665, 0.0933, 0.3874, 0.0768, 0.3601, 0.0887, 0.1260,
        0.1584, 0.2409, 0.0814, 0.2044, 0.1054, 0.0649, 0.3082, 0.0711, 0.4831,
        0.1300, 0.1125, 0.0898, 0.1554, 0.1765, 0.0784, 0.0952, 0.0849, 0.1395,
        0.2003, 0.0692, 0.0597, 0.0238, 0.0887], dtype=torch.float64)
A batch of ocean_proximity: tensor([0.3333, 0.3333, 0.6667, 0.6667, 0.3333, 0.6667, 0.6667, 0.3333, 0.6667,
        0.6667, 0.6667, 0.3333, 0.3333, 0.3333, 0.3333, 0.3333, 0.3333, 0.6667,
        0.3333, 0.3333, 0.3333, 0.3333, 0.3333, 0.3333, 0.3333, 0.6667, 0.3333,
        0.3333, 0.3333, 0.3333, 0.3333, 0.6667], dtype=torch.float64)
A batch of targets: tensor([133800.,  52600., 202700., 128600.,  64700.,  81700., 166400., 107800.,
        256300., 175900., 235700.,  89500.,  71400., 500001.,  74300.