# Module 2.4 Setting up the validation framework

## Setup

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Load cleaned data

In [2]:
# Load the cleaned data set produced earlier in this module.
# NOTE: the path is relative, so the notebook must be run from the repository root.
df = pd.read_csv('02-regression/notebooks/data_cleaned.csv')

# Preview a few random rows. A fixed random_state keeps the preview identical
# across "Restart Kernel -> Run All" and leaves the global NumPy RNG untouched.
df.sample(5, random_state=42)

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
2419,volkswagen,cc,2017,premium_unleaded_(recommended),200.0,4.0,automated_manual,front_wheel_drive,4.0,performance,midsize,sedan,31,22,873,37820
1196,honda,accord_crosstour,2010,regular_unleaded,271.0,6.0,automatic,front_wheel_drive,4.0,"crossover,hatchback",midsize,4dr_hatchback,27,18,2202,32570
3747,ford,e-series_wagon,2013,flex-fuel_(unleaded/e85),255.0,8.0,automatic,rear_wheel_drive,3.0,flex_fuel,midsize,passenger_van,15,11,5657,35085
8651,acura,rsx,2005,regular_unleaded,160.0,4.0,automatic,front_wheel_drive,2.0,"hatchback,luxury,performance",compact,2dr_hatchback,31,22,204,21175
1163,gmc,acadia,2017,regular_unleaded,194.0,4.0,automatic,front_wheel_drive,4.0,crossover,midsize,4dr_suv,26,21,549,32450


## Splitting data
We split the data into three separate sets for training, validation, and testing (e.g. a 60/20/20 split).

In [3]:
# Total number of rows available for splitting
n = len(df)
print(f"Number of records: {n}")

# Exact (fractional) 20% share, printed for reference only
print(f"20% of total records: {n * 0.2}\n")

# Validation and test each take 20% of the rows, truncated to an integer;
# training receives whatever is left, so the three sizes always add up to n.
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)
print(
    f"Training set size: {n_train}",
    f"Validation set size: {n_val}",
    f"Test set size: {n_test}",
    sep="\n",
)
print(f"\nTotal: {n_train + n_val + n_test}")

Number of records: 11914
20% of total records: 2382.8

Training set size: 7150
Validation set size: 2382
Test set size: 2382

Total: 11914


In [4]:
# Rows are assigned to the splits at random: shuffle the row positions
# 0 .. n-1 and later slice the shuffled array.
np.random.seed(42)      # fixed seed so the same split is produced on every run
idx = np.arange(n)      # row positions in their original order
np.random.shuffle(idx)  # in-place shuffle using the seeded global RNG

# Peek at the first few shuffled positions
idx[:5]

array([3995, 7474, 7300, 3148,  747])

In [5]:
# Cut the shuffled positions at the train/val and val/test boundaries, then
# select the matching rows (by position) for each subset.
df_train, df_val, df_test = (
    df.iloc[part] for part in np.split(idx, [n_train, n_train + n_val])
)

In [6]:
# First eight training rows; the index still shows the original row labels
df_train.iloc[:8]

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
3995,gmc,envoy_xl,2005,regular_unleaded,275.0,6.0,automatic,rear_wheel_drive,4.0,,large,4dr_suv,18,13,549,29695
7474,volkswagen,passat,2016,regular_unleaded,170.0,4.0,automatic,front_wheel_drive,4.0,,midsize,sedan,38,25,873,30495
7300,honda,odyssey,2016,regular_unleaded,248.0,6.0,automatic,front_wheel_drive,4.0,,large,passenger_minivan,28,19,2202,37650
3148,chevrolet,cruze,2015,regular_unleaded,138.0,4.0,manual,front_wheel_drive,4.0,,midsize,sedan,36,25,1385,16170
747,volvo,740,1991,regular_unleaded,162.0,4.0,automatic,rear_wheel_drive,4.0,"luxury,performance",midsize,sedan,20,17,870,2000
4048,suzuki,equator,2012,regular_unleaded,152.0,4.0,manual,rear_wheel_drive,4.0,,compact,extended_cab_pickup,23,19,481,19299
4759,ford,flex,2016,premium_unleaded_(recommended),365.0,6.0,automatic,all_wheel_drive,4.0,crossover,large,wagon,21,15,5657,42600
6423,acura,legend,1994,regular_unleaded,230.0,6.0,manual,front_wheel_drive,2.0,"luxury,performance",midsize,coupe,24,16,204,2384


In [7]:
# Size of each subset followed by their combined size
(
    len(df_train),
    len(df_val),
    len(df_test),
    sum(map(len, (df_train, df_val, df_test))),
)

(7150, 2382, 2382, 11914)

In [8]:
# The shuffled original row labels are no longer needed: give every subset a
# fresh 0..len-1 index and discard the old one (drop=True).
df_train, df_val, df_test = (
    subset.reset_index(drop=True) for subset in (df_train, df_val, df_test)
)

In [9]:
# Same eight rows as before, now labelled 0..7 after the index reset
df_train.iloc[:8]

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,gmc,envoy_xl,2005,regular_unleaded,275.0,6.0,automatic,rear_wheel_drive,4.0,,large,4dr_suv,18,13,549,29695
1,volkswagen,passat,2016,regular_unleaded,170.0,4.0,automatic,front_wheel_drive,4.0,,midsize,sedan,38,25,873,30495
2,honda,odyssey,2016,regular_unleaded,248.0,6.0,automatic,front_wheel_drive,4.0,,large,passenger_minivan,28,19,2202,37650
3,chevrolet,cruze,2015,regular_unleaded,138.0,4.0,manual,front_wheel_drive,4.0,,midsize,sedan,36,25,1385,16170
4,volvo,740,1991,regular_unleaded,162.0,4.0,automatic,rear_wheel_drive,4.0,"luxury,performance",midsize,sedan,20,17,870,2000
5,suzuki,equator,2012,regular_unleaded,152.0,4.0,manual,rear_wheel_drive,4.0,,compact,extended_cab_pickup,23,19,481,19299
6,ford,flex,2016,premium_unleaded_(recommended),365.0,6.0,automatic,all_wheel_drive,4.0,crossover,large,wagon,21,15,5657,42600
7,acura,legend,1994,regular_unleaded,230.0,6.0,manual,front_wheel_drive,2.0,"luxury,performance",midsize,coupe,24,16,204,2384


In [10]:
# Log-transform the target: log1p(msrp) = log(1 + msrp)
y_train = np.log1p(df_train["msrp"].to_numpy())
y_val = np.log1p(df_val["msrp"].to_numpy())
y_test = np.log1p(df_test["msrp"].to_numpy())

# Show the first five transformed values of every split
print("y_train:", y_train[:5])
print("y_val:", y_val[:5])
print("y_test:", y_test[:5])

y_train: [10.29876764 10.32535081 10.53611479  9.69097479  7.60140233]
y_val: [10.35840927 10.75344616 10.09352923  9.99016971 10.42141788]
y_test: [10.28622977 10.14155912 10.50687413 10.12246243 10.8277663 ]


In [11]:
# Remove the raw target column so the frames contain features only
# (the log-transformed target is kept separately in y_train / y_val / y_test).
df_train = df_train.drop(columns='msrp')
df_val = df_val.drop(columns='msrp')
df_test = df_test.drop(columns='msrp')

In [12]:
# Sanity check: how many training columns are still named 'msrp' (expected: 0)
print(df_train.columns.tolist().count('msrp'))

0


## Export data sets

In [13]:
# Write each feature frame to its own CSV, without the row index.
# NOTE(review): y_train / y_val / y_test are not written by this cell and the
# 'msrp' column has already been dropped — confirm that downstream notebooks
# obtain the targets elsewhere.
for _out_path, _frame in {
    '02-regression/notebooks/data_train.csv': df_train,
    '02-regression/notebooks/data_val.csv': df_val,
    '02-regression/notebooks/data_test.csv': df_test,
}.items():
    _frame.to_csv(_out_path, index=False)