# Machine Learning on the Shoes Dataset

In [205]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("Shoe prices.csv")
df

Unnamed: 0,Brand,Model,Type,Gender,Size,Color,Material,Price (USD)
0,Nike,Air Jordan 1,Basketball,Men,US 10,Red/Black,Leather,$170.00
1,Adidas,Ultra Boost 21,Running,Men,US 9.5,Black,Primeknit,$180.00
2,Reebok,Classic Leather,Casual,Men,US 11,White,Leather,$75.00
3,Converse,Chuck Taylor,Casual,Women,US 8,Navy,Canvas,$55.00
4,Puma,Future Rider,Lifestyle,Women,US 7.5,Pink,Mesh,$80.00
...,...,...,...,...,...,...,...,...
1001,New Balance,Fresh Foam 880v11,Running,Women,US 10,Grey,Mesh,$130.00
1002,Asics,Gel-Kayano Lite,Running,Men,US 9.5,Black,Mesh,$160.00
1003,Fila,Venom 94,Fashion,Women,US 7.5,White,Leather,$70.00
1004,Skechers,Summits,Training,Men,US 8,Grey,Mesh,$55.00


## Data Cleansing

Check for **Null** Values

In [6]:
df.isnull().sum()

Brand          0
Model          0
Type           0
Gender         0
Size           0
Color          0
Material       0
Price (USD)    0
dtype: int64

Clean up the **Price** and **Size** Columns

In [7]:
df['Size'] = df['Size'].str.replace('US ', '').astype(float)

In [8]:
df['Price (USD)'] = df['Price (USD)'].str.replace('$', '').astype(float)

  df['Price (USD)'] = df['Price (USD)'].str.replace('$', '').astype(float)


In [9]:
df[['Price (USD)', 'Size']].dtypes

Price (USD)    float64
Size           float64
dtype: object

Dropping the **'Model'** Column

This column adds unneccessary noise to the model. It has over 200 unique values that would be too computationally expensive to categorically encode. Also, info in the 'Model' column is largely captured by the 'Brand' column

In [10]:
df = df.drop(columns=('Model'))
df

Unnamed: 0,Brand,Type,Gender,Size,Color,Material,Price (USD)
0,Nike,Basketball,Men,10.0,Red/Black,Leather,170.0
1,Adidas,Running,Men,9.5,Black,Primeknit,180.0
2,Reebok,Casual,Men,11.0,White,Leather,75.0
3,Converse,Casual,Women,8.0,Navy,Canvas,55.0
4,Puma,Lifestyle,Women,7.5,Pink,Mesh,80.0
...,...,...,...,...,...,...,...
1001,New Balance,Running,Women,10.0,Grey,Mesh,130.0
1002,Asics,Running,Men,9.5,Black,Mesh,160.0
1003,Fila,Fashion,Women,7.5,White,Leather,70.0
1004,Skechers,Training,Men,8.0,Grey,Mesh,55.0


Splitting the **Color** column

In [11]:
# Step 1: Splitting the Color column
df[['Color1', 'Color2', 'Color3']] = df['Color'].str.split('/', n=2, expand=True)
df['Color2'] = df['Color2'].fillna('No Secondary Color')
df['Color3'] = df['Color3'].fillna('No Tertiary Color')

One-Hot Encoding the **3 Color** Columns

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Specify the columns to include in the transformation
columns_to_encode = ['Color1', 'Color2', 'Color3']

# Step 2: One-hot encoding
column_transformer = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

transformed_data = column_transformer.fit_transform(df[columns_to_encode])

# Retrieve the feature names from the OneHotEncoder
feature_names = column_transformer.named_transformers_['one_hot_encoder'].get_feature_names_out(columns_to_encode)

# Convert the transformed data back to a DataFrame
transformed_df = pd.DataFrame(transformed_data.toarray(), columns=feature_names)

# Concatenate the transformed DataFrame with the remaining columns
df = pd.concat([transformed_df, df.drop(columns_to_encode, axis=1)], axis=1)

# Display the encoded DataFrame with labeled columns
df


Unnamed: 0,Color1_Beige,Color1_Black,Color1_Blue,Color1_Brown,Color1_Burgundy,Color1_Charcoal,Color1_Checkerboard,Color1_Checkerboard Black,Color1_Checkered,Color1_Cinder,...,Color3_Navy,Color3_No Tertiary Color,Color3_Red,Brand,Type,Gender,Size,Color,Material,Price (USD)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Nike,Basketball,Men,10.0,Red/Black,Leather,170.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Adidas,Running,Men,9.5,Black,Primeknit,180.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Reebok,Casual,Men,11.0,White,Leather,75.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Converse,Casual,Women,8.0,Navy,Canvas,55.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Puma,Lifestyle,Women,7.5,Pink,Mesh,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,New Balance,Running,Women,10.0,Grey,Mesh,130.0
1002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Asics,Running,Men,9.5,Black,Mesh,160.0
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Fila,Fashion,Women,7.5,White,Leather,70.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,Skechers,Training,Men,8.0,Grey,Mesh,55.0


Some of the models couldn't be ran or had significantly lower accuracies when the **Color** columns were included. These next 3 cells are subject to be used or commented out depending on the model. Each model that removes the columns will have a **clear disclaimer**.

In [13]:
# df =df.drop(columns=(['Color3_Blue', 'Color3_Navy', 'Color3_No Tertiary Color', 'Color3_Red']), axis=1)

In [14]:
# df = df.drop(columns=(['Color2_Black', 'Color2_Blue', 'Color2_Gold', 'Color2_Green',
#        'Color2_Grey', 'Color2_Gum', 'Color2_Navy', 'Color2_No Secondary Color',
#        'Color2_Orange', 'Color2_Pink', 'Color2_Purple', 'Color2_Red',
#        'Color2_White', 'Color2_White Checkerboard', 'Color2_Yellow']))

In [15]:
# df = df.drop(columns=(['Color1_Black', 'Color1_Blue',
#        'Color1_Brown', 'Color1_Burgundy', 'Color1_Charcoal',
#        'Color1_Checkerboard', 'Color1_Checkerboard Black', 'Color1_Checkered',
#        'Color1_Cinder', 'Color1_Clay Brown', 'Color1_Cloud White',
#        'Color1_Collegiate Navy', 'Color1_Cream', 'Color1_Cream White',
#        'Color1_Egret', 'Color1_Green', 'Color1_Grey', 'Color1_Ivory',
#        'Color1_Khaki', 'Color1_Multi-color', 'Color1_Natural',
#        'Color1_Natural Ivory', 'Color1_Navy', 'Color1_Orange', 'Color1_Pink',
#        'Color1_Purple', 'Color1_Red', 'Color1_Silver', 'Color1_Sunflower',
#        'Color1_True White', 'Color1_White', 'Color1_Yellow', 'Color1_Zebra', 'Color1_Beige']))

In [16]:
df.columns

Index(['Color1_Beige', 'Color1_Black', 'Color1_Blue', 'Color1_Brown',
       'Color1_Burgundy', 'Color1_Charcoal', 'Color1_Checkerboard',
       'Color1_Checkerboard Black', 'Color1_Checkered', 'Color1_Cinder',
       'Color1_Clay Brown', 'Color1_Cloud White', 'Color1_Collegiate Navy',
       'Color1_Cream', 'Color1_Cream White', 'Color1_Egret', 'Color1_Green',
       'Color1_Grey', 'Color1_Ivory', 'Color1_Khaki', 'Color1_Multi-color',
       'Color1_Natural', 'Color1_Natural Ivory', 'Color1_Navy',
       'Color1_Orange', 'Color1_Pink', 'Color1_Purple', 'Color1_Red',
       'Color1_Silver', 'Color1_Sunflower', 'Color1_True White',
       'Color1_White', 'Color1_Yellow', 'Color1_Zebra', 'Color2_Black',
       'Color2_Blue', 'Color2_Gold', 'Color2_Green', 'Color2_Grey',
       'Color2_Gum', 'Color2_Navy', 'Color2_No Secondary Color',
       'Color2_Orange', 'Color2_Pink', 'Color2_Purple', 'Color2_Red',
       'Color2_White', 'Color2_White Checkerboard', 'Color2_Yellow',
       'Color3_Bl

Dropping the Original **Color** column

In [17]:
df = df.drop(columns=('Color'))
df

Unnamed: 0,Color1_Beige,Color1_Black,Color1_Blue,Color1_Brown,Color1_Burgundy,Color1_Charcoal,Color1_Checkerboard,Color1_Checkerboard Black,Color1_Checkered,Color1_Cinder,...,Color3_Blue,Color3_Navy,Color3_No Tertiary Color,Color3_Red,Brand,Type,Gender,Size,Material,Price (USD)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Nike,Basketball,Men,10.0,Leather,170.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Adidas,Running,Men,9.5,Primeknit,180.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Reebok,Casual,Men,11.0,Leather,75.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Converse,Casual,Women,8.0,Canvas,55.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Puma,Lifestyle,Women,7.5,Mesh,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,New Balance,Running,Women,10.0,Mesh,130.0
1002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Asics,Running,Men,9.5,Mesh,160.0
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Fila,Fashion,Women,7.5,Leather,70.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,Skechers,Training,Men,8.0,Mesh,55.0


One-Hot Encoding the **Brand** Column

In [18]:
# Specify the columns to include in the transformation
columns_to_encode = ['Brand']

# Step 2: One-hot encoding
column_transformer = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

transformed_data = column_transformer.fit_transform(df[columns_to_encode])

# Retrieve the feature names from the OneHotEncoder
feature_names = column_transformer.named_transformers_['one_hot_encoder'].get_feature_names_out(columns_to_encode)

# Convert the transformed data back to a DataFrame
transformed_df = pd.DataFrame(transformed_data.toarray(), columns=feature_names)

# Concatenate the transformed DataFrame with the remaining columns
df = pd.concat([transformed_df, df.drop(columns_to_encode, axis=1)], axis=1)

# Display the encoded DataFrame with labeled columns
df


Unnamed: 0,Brand_Adidas,Brand_Asics,Brand_Converse,Brand_Fila,Brand_New Balance,Brand_Nike,Brand_Puma,Brand_Reebok,Brand_Skechers,Brand_Vans,...,Color2_Yellow,Color3_Blue,Color3_Navy,Color3_No Tertiary Color,Color3_Red,Type,Gender,Size,Material,Price (USD)
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Basketball,Men,10.0,Leather,170.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Running,Men,9.5,Primeknit,180.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Casual,Men,11.0,Leather,75.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Casual,Women,8.0,Canvas,55.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Lifestyle,Women,7.5,Mesh,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Running,Women,10.0,Mesh,130.0
1002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Running,Men,9.5,Mesh,160.0
1003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Fashion,Women,7.5,Leather,70.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,Training,Men,8.0,Mesh,55.0


In [19]:
df.columns

Index(['Brand_Adidas', 'Brand_Asics', 'Brand_Converse', 'Brand_Fila',
       'Brand_New Balance', 'Brand_Nike', 'Brand_Puma', 'Brand_Reebok',
       'Brand_Skechers', 'Brand_Vans', 'Color1_Beige', 'Color1_Black',
       'Color1_Blue', 'Color1_Brown', 'Color1_Burgundy', 'Color1_Charcoal',
       'Color1_Checkerboard', 'Color1_Checkerboard Black', 'Color1_Checkered',
       'Color1_Cinder', 'Color1_Clay Brown', 'Color1_Cloud White',
       'Color1_Collegiate Navy', 'Color1_Cream', 'Color1_Cream White',
       'Color1_Egret', 'Color1_Green', 'Color1_Grey', 'Color1_Ivory',
       'Color1_Khaki', 'Color1_Multi-color', 'Color1_Natural',
       'Color1_Natural Ivory', 'Color1_Navy', 'Color1_Orange', 'Color1_Pink',
       'Color1_Purple', 'Color1_Red', 'Color1_Silver', 'Color1_Sunflower',
       'Color1_True White', 'Color1_White', 'Color1_Yellow', 'Color1_Zebra',
       'Color2_Black', 'Color2_Blue', 'Color2_Gold', 'Color2_Green',
       'Color2_Grey', 'Color2_Gum', 'Color2_Navy', 'Color2_No 

One-Hot Encoding the **Type** Column

In [20]:
# Specify the columns to include in the transformation
columns_to_encode = ['Type']

# Step 2: One-hot encoding
column_transformer = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

transformed_data = column_transformer.fit_transform(df[columns_to_encode])

# Retrieve the feature names from the OneHotEncoder
feature_names = column_transformer.named_transformers_['one_hot_encoder'].get_feature_names_out(columns_to_encode)

# Convert the transformed data back to a DataFrame
transformed_df = pd.DataFrame(transformed_data.toarray(), columns=feature_names)

# Concatenate the transformed DataFrame with the remaining columns
df = pd.concat([transformed_df, df.drop(columns_to_encode, axis=1)], axis=1)

# Display the encoded DataFrame with labeled columns
df

Unnamed: 0,Type_Basketball,Type_Casual,Type_Cross-training,Type_CrossFit,Type_Crossfit,Type_Fashion,Type_Hiking,Type_Lifestyle,Type_Racing,Type_Retro,...,Color2_White Checkerboard,Color2_Yellow,Color3_Blue,Color3_Navy,Color3_No Tertiary Color,Color3_Red,Gender,Size,Material,Price (USD)
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Men,10.0,Leather,170.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Men,9.5,Primeknit,180.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Men,11.0,Leather,75.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Women,8.0,Canvas,55.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Women,7.5,Mesh,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Women,10.0,Mesh,130.0
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Men,9.5,Mesh,160.0
1003,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Women,7.5,Leather,70.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Men,8.0,Mesh,55.0


In [21]:
df.columns

Index(['Type_Basketball', 'Type_Casual', 'Type_Cross-training',
       'Type_CrossFit', 'Type_Crossfit', 'Type_Fashion', 'Type_Hiking',
       'Type_Lifestyle', 'Type_Racing', 'Type_Retro', 'Type_Running',
       'Type_Skate', 'Type_Slides', 'Type_Trail', 'Type_Trail Running',
       'Type_Training', 'Type_Walking', 'Type_Weightlifting', 'Brand_Adidas',
       'Brand_Asics', 'Brand_Converse', 'Brand_Fila', 'Brand_New Balance',
       'Brand_Nike', 'Brand_Puma', 'Brand_Reebok', 'Brand_Skechers',
       'Brand_Vans', 'Color1_Beige', 'Color1_Black', 'Color1_Blue',
       'Color1_Brown', 'Color1_Burgundy', 'Color1_Charcoal',
       'Color1_Checkerboard', 'Color1_Checkerboard Black', 'Color1_Checkered',
       'Color1_Cinder', 'Color1_Clay Brown', 'Color1_Cloud White',
       'Color1_Collegiate Navy', 'Color1_Cream', 'Color1_Cream White',
       'Color1_Egret', 'Color1_Green', 'Color1_Grey', 'Color1_Ivory',
       'Color1_Khaki', 'Color1_Multi-color', 'Color1_Natural',
       'Color1_Natura

One-Hot Encoding the **Gender** Column

In [22]:
# Specify the columns to include in the transformation
columns_to_encode = ['Gender']

# Step 2: One-hot encoding
column_transformer = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

transformed_data = column_transformer.fit_transform(df[columns_to_encode])

# Retrieve the feature names from the OneHotEncoder
feature_names = column_transformer.named_transformers_['one_hot_encoder'].get_feature_names_out(columns_to_encode)

# Convert the transformed data back to a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

# Concatenate the transformed DataFrame with the remaining columns
df = pd.concat([transformed_df, df.drop(columns_to_encode, axis=1)], axis=1)

# Display the encoded DataFrame with labeled columns
df


Unnamed: 0,Gender_Men,Gender_Women,Type_Basketball,Type_Casual,Type_Cross-training,Type_CrossFit,Type_Crossfit,Type_Fashion,Type_Hiking,Type_Lifestyle,...,Color2_White,Color2_White Checkerboard,Color2_Yellow,Color3_Blue,Color3_Navy,Color3_No Tertiary Color,Color3_Red,Size,Material,Price (USD)
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,Leather,170.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.5,Primeknit,180.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,Leather,75.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,Canvas,55.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.5,Mesh,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,Mesh,130.0
1002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.5,Mesh,160.0
1003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.5,Leather,70.0
1004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,Mesh,55.0


In [23]:
df.columns

Index(['Gender_Men', 'Gender_Women', 'Type_Basketball', 'Type_Casual',
       'Type_Cross-training', 'Type_CrossFit', 'Type_Crossfit', 'Type_Fashion',
       'Type_Hiking', 'Type_Lifestyle', 'Type_Racing', 'Type_Retro',
       'Type_Running', 'Type_Skate', 'Type_Slides', 'Type_Trail',
       'Type_Trail Running', 'Type_Training', 'Type_Walking',
       'Type_Weightlifting', 'Brand_Adidas', 'Brand_Asics', 'Brand_Converse',
       'Brand_Fila', 'Brand_New Balance', 'Brand_Nike', 'Brand_Puma',
       'Brand_Reebok', 'Brand_Skechers', 'Brand_Vans', 'Color1_Beige',
       'Color1_Black', 'Color1_Blue', 'Color1_Brown', 'Color1_Burgundy',
       'Color1_Charcoal', 'Color1_Checkerboard', 'Color1_Checkerboard Black',
       'Color1_Checkered', 'Color1_Cinder', 'Color1_Clay Brown',
       'Color1_Cloud White', 'Color1_Collegiate Navy', 'Color1_Cream',
       'Color1_Cream White', 'Color1_Egret', 'Color1_Green', 'Color1_Grey',
       'Color1_Ivory', 'Color1_Khaki', 'Color1_Multi-color', 'Color1_N

One-Hot Encoding **Material** Column

In [24]:
# Specify the columns to include in the transformation
columns_to_encode = ['Material']

# Step 2: One-hot encoding
column_transformer = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

transformed_data = column_transformer.fit_transform(df[columns_to_encode])

# Retrieve the feature names from the OneHotEncoder
feature_names = column_transformer.named_transformers_['one_hot_encoder'].get_feature_names_out(columns_to_encode)

# Convert the transformed data back to a DataFrame
transformed_df = pd.DataFrame(transformed_data.toarray(), columns=feature_names)

# Concatenate the transformed DataFrame with the remaining columns
df= pd.concat([transformed_df, df.drop(columns_to_encode, axis=1)], axis=1)

# Display the encoded DataFrame with labeled columns
df


Unnamed: 0,Material_Canvas,Material_Canvas/Leather,Material_Canvas/Suede,Material_Flexweave,Material_Flexweave/Cushioning,Material_Flexweave/Knit,Material_Flexweave/Synthetic,Material_Flyknit,Material_Knit,Material_Knit/Synthetic,...,Color2_Red,Color2_White,Color2_White Checkerboard,Color2_Yellow,Color3_Blue,Color3_Navy,Color3_No Tertiary Color,Color3_Red,Size,Price (USD)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,170.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.5,180.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,75.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,55.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.5,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,130.0
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.5,160.0
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.5,70.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,55.0


In [25]:
df.columns

Index(['Material_Canvas', 'Material_Canvas/Leather', 'Material_Canvas/Suede',
       'Material_Flexweave', 'Material_Flexweave/Cushioning',
       'Material_Flexweave/Knit', 'Material_Flexweave/Synthetic',
       'Material_Flyknit', 'Material_Knit', 'Material_Knit/Synthetic',
       ...
       'Color2_Red', 'Color2_White', 'Color2_White Checkerboard',
       'Color2_Yellow', 'Color3_Blue', 'Color3_Navy',
       'Color3_No Tertiary Color', 'Color3_Red', 'Size', 'Price (USD)'],
      dtype='object', length=119)

Since the model was already struggling with memory issues and too much noise, I elected to not break up the **Material** column into 2 different columns. If I were to run this on a cloud-based platform, I'd split this column similar to **Color**.

Split the df into **Features** (X) and **Target Variable** (y)

In [26]:
X = df.iloc[:, :-1].values
X

array([[ 0. ,  0. ,  0. , ...,  1. ,  0. , 10. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  9.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. , 11. ],
       ...,
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  8. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5]])

In [27]:
y = df.iloc[:, -1].values
y

array([170., 180.,  75., ...,  70.,  55., 170.])

Splitting the Dataset Into **Training** and **Test** Sets

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Multiple Linear Regression

**Training** the Model

In [29]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

**Predicting** the **Test** Set Results

In [30]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)

output = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1)
formatted_output = [[f'{pred:.2f}' for pred in row] for row in output]

for row in formatted_output:
    print(row)


['107.98', '90.00']
['86.09', '80.00']
['81.01', '90.00']
['81.65', '90.00']
['153.80', '170.00']
['58.62', '70.00']
['89.27', '80.00']
['128.80', '250.00']
['105.24', '130.00']
['65.16', '60.00']
['93.72', '90.00']
['93.72', '90.00']
['131.66', '75.00']
['124.30', '120.00']
['132.11', '160.00']
['172.06', '180.00']
['114.71', '120.00']
['119.89', '175.00']
['126.91', '120.00']
['76.04', '75.00']
['126.16', '120.00']
['132.35', '150.00']
['66.59', '65.00']
['151.25', '120.00']
['66.67', '65.00']
['131.66', '120.00']
['121.54', '175.00']
['135.03', '170.00']
['111.55', '120.00']
['69.22', '65.00']
['72.41', '65.00']
['5919964424751.88', '70.00']
['67.36', '65.00']
['116.69', '130.00']
['90.21', '85.00']
['131.19', '160.00']
['52.12', '60.00']
['64.54', '75.00']
['80.31', '60.00']
['81.01', '60.00']
['21507691891130.86', '55.00']
['111.55', '140.00']
['80.54', '75.00']
['125.84', '200.00']
['79.34', '85.00']
['148.18', '150.00']
['131.66', '160.00']
['137.64', '140.00']
['112.41', '130.0

Calculating Different **Accuracy** Measures

In [31]:
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error

# Assuming y_test contains the actual values and y_pred contains the predicted values
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # squared=False returns RMSE
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("The r-squared score (R^2):", r2)

Mean Absolute Error (MAE): 1075473932388.3558
Mean Squared Error (MSE): 1.2689978474429717e+26
Root Mean Squared Error (RMSE): 11264980459117.414
The r-squared score (R^2): -8.259361539486923e+22


# Polynomial Regression

### The **Color** Columns were removed from this model

The model did not work for me at degree >= 4 due to memory issues. Make sure to remove all the **Color** columns before running the model

Splitting the Dataset into a **Training** and **Test** set

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

**Training** the Model

In [33]:
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

**Predicting** the **Test** Set Results

In [34]:
# Assuming you have trained and fitted the SVR model and have X_poly and X_test ready
y_pred = regressor.predict(poly_reg.transform(X_test))

# Set precision for printing
np.set_printoptions(precision=2)

# Concatenate the predicted and actual values
output = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), axis=1)

# Format and print the output
formatted_output = [[f'{pred:.2f}' for pred in row] for row in output]

for row in formatted_output:
    print(row)


Calculating Different **Accuracy** Measures

In [35]:
# Assuming y_test contains the actual values and y_pred contains the predicted values
mae = mean_absolute_error(y_test, y_pred[:len(y_test)])
mse = mean_squared_error(y_test, y_pred[:len(y_test)])
rmse = mean_squared_error(y_test, y_pred[:len(y_test)], squared=False)
r2 = r2_score(y_test, y_pred[:len(y_test)])

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("The r-squared score (R^2):", r2)


degree = 2 :

Mean Absolute Error (MAE): **13.503102616211248**

Mean Squared Error (MSE): **444.46300908730535**

Root Mean Squared Error (RMSE): **21.08229136235683**

The r-squared score (R^2): **0.7107181316046094**


degree = 3:

Mean Absolute Error (MAE): 16141112191.574871

Mean Squared Error (MSE): 7.515508528871861e+21

Root Mean Squared Error (RMSE): 86692032672.3965

The r-squared score (R^2): -4.891521464605143e+18


degree=4: 

Mean Absolute Error (MAE): 43.748324252591274

Mean Squared Error (MSE): 2943.3954588674023

Root Mean Squared Error (RMSE): 54.253068658532136

The r-squared score (R^2): -0.9157295890970683

## Support Vector Regression

In [36]:
X

array([[ 0. ,  0. ,  0. , ...,  1. ,  0. , 10. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  9.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. , 11. ],
       ...,
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  8. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5]])

In [37]:
y

array([170., 180.,  75., ...,  70.,  55., 170.])

Reshaping the **Target Variable** column

In [38]:
y = y.reshape(len(y), 1)
y

array([[170.],
       [180.],
       [ 75.],
       ...,
       [ 70.],
       [ 55.],
       [170.]])

Splitting the Dataset into a **Training** and **Test** Set

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

**Scaling** X_train and y_train

In [40]:
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

**Training** the Model

In [41]:
from sklearn.svm import SVR

regressor = SVR(kernel='rbf')
regressor.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR()

**Predicting** Test Set Results

In [42]:
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test)).reshape(-1,1))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))

[[103.8  100.  ]
 [150.2  220.  ]
 [ 85.22  80.  ]
 [156.73 160.  ]
 [ 99.01 110.  ]
 [ 91.45  90.  ]
 [157.07 120.  ]
 [ 60.89  75.  ]
 [131.02 130.  ]
 [134.43 140.  ]
 [127.19 160.  ]
 [ 98.    90.  ]
 [ 70.2   70.  ]
 [124.82 120.  ]
 [ 79.43  60.  ]
 [ 52.75  55.  ]
 [ 86.76  80.  ]
 [ 73.31  60.  ]
 [ 92.85 110.  ]
 [126.44 120.  ]
 [133.87 170.  ]
 [ 62.7   65.  ]
 [ 69.6   70.  ]
 [101.59  90.  ]
 [126.25 160.  ]
 [ 71.13  75.  ]
 [ 67.86  70.  ]
 [ 90.62  80.  ]
 [ 89.03  85.  ]
 [175.23 180.  ]
 [157.51 160.  ]
 [155.21 120.  ]
 [ 97.12 100.  ]
 [146.29 130.  ]
 [118.64 130.  ]
 [ 68.99  65.  ]
 [125.98 160.  ]
 [175.91 180.  ]
 [125.98 150.  ]
 [103.8   90.  ]
 [ 65.44  70.  ]
 [125.48 160.  ]
 [114.88 120.  ]
 [122.42 130.  ]
 [ 69.16  65.  ]
 [134.43 130.  ]
 [ 71.02  65.  ]
 [ 86.76  55.  ]
 [ 84.6   85.  ]
 [ 72.37  70.  ]
 [120.63 175.  ]
 [126.96 120.  ]
 [ 79.48  60.  ]
 [129.44 170.  ]
 [ 86.07  90.  ]
 [ 98.53  90.  ]
 [ 84.2   85.  ]
 [126.23  85.  ]
 [ 99.36  85. 

Calculating Different **Accuracy** Measures

In [43]:
# Assuming y_test contains the actual values and y_pred contains the predicted values
mae = mean_absolute_error(y_test, y_pred[:len(y_test)])
mse = mean_squared_error(y_test, y_pred[:len(y_test)])
rmse = mean_squared_error(y_test, y_pred[:len(y_test)], squared=False)
r2 = r2_score(y_test, y_pred[:len(y_test)])

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("The r-squared score (R^2):", r2)


Mean Absolute Error (MAE): 12.126942277443707
Mean Squared Error (MSE): 337.0832364045526
Root Mean Squared Error (RMSE): 18.359826698652487
The r-squared score (R^2): 0.7759174402010507


**Color** not included:

Mean Absolute Error (MAE): 12.576106919460328

Mean Squared Error (MSE): 393.2635605348159

Root Mean Squared Error (RMSE): 19.83087392261914

The r-squared score (R^2): 0.7385704900064249

**Color** included:

Mean Absolute Error (MAE): **12.126942277443707**

Mean Squared Error (MSE): **337.0832364045526**

Root Mean Squared Error (RMSE): **18.359826698652487**

The r-squared score (R^2): **0.7759174402010507**

## Decision Tree 

In [44]:
X

array([[ 0. ,  0. ,  0. , ...,  1. ,  0. , 10. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  9.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. , 11. ],
       ...,
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  8. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5]])

In [45]:
y

array([[170.],
       [180.],
       [ 75.],
       ...,
       [ 70.],
       [ 55.],
       [170.]])

Splitting the dataset into a **Training** and **Test** Set

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

**Training** the Model

In [47]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

**Predicting** Test Set Results

In [48]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))


[[120.   100.  ]
 [220.   220.  ]
 [ 80.    80.  ]
 [160.   160.  ]
 [ 80.   110.  ]
 [ 90.    90.  ]
 [150.   120.  ]
 [ 50.    75.  ]
 [130.   130.  ]
 [136.67 140.  ]
 [120.   160.  ]
 [100.    90.  ]
 [ 70.    70.  ]
 [160.   120.  ]
 [ 90.    60.  ]
 [ 50.    55.  ]
 [ 80.    80.  ]
 [ 62.5   60.  ]
 [110.   110.  ]
 [180.   120.  ]
 [120.   170.  ]
 [ 59.    65.  ]
 [ 70.    70.  ]
 [100.    90.  ]
 [120.   160.  ]
 [ 75.    75.  ]
 [ 70.    70.  ]
 [ 85.    80.  ]
 [ 85.    85.  ]
 [180.   180.  ]
 [160.   160.  ]
 [160.   120.  ]
 [ 80.   100.  ]
 [160.   130.  ]
 [130.   130.  ]
 [ 70.    65.  ]
 [155.   160.  ]
 [180.   180.  ]
 [155.   150.  ]
 [120.    90.  ]
 [ 70.    70.  ]
 [120.   160.  ]
 [ 90.   120.  ]
 [130.   130.  ]
 [ 70.    65.  ]
 [136.67 130.  ]
 [ 65.    65.  ]
 [ 95.    55.  ]
 [ 82.5   85.  ]
 [ 75.    70.  ]
 [175.   175.  ]
 [ 80.   120.  ]
 [ 60.    60.  ]
 [150.   170.  ]
 [ 90.    90.  ]
 [ 90.    90.  ]
 [ 82.5   85.  ]
 [145.    85.  ]
 [ 65.    85. 

Calculating Different **Accuracy** Measures

In [49]:
# Assuming y_test contains the actual values and y_pred contains the predicted values
mae = mean_absolute_error(y_test, y_pred[:len(y_test)])
mse = mean_squared_error(y_test, y_pred[:len(y_test)])
rmse = mean_squared_error(y_test, y_pred[:len(y_test)], squared=False)
r2 = r2_score(y_test, y_pred[:len(y_test)])

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("The r-squared score (R^2):", r2)


Mean Absolute Error (MAE): 12.952263083451204
Mean Squared Error (MSE): 555.1038060438698
Root Mean Squared Error (RMSE): 23.560641036352763
The r-squared score (R^2): 0.6309840763983783


**Color Not Included**:

Mean Absolute Error (MAE): **12.114997642621406**

Mean Squared Error (MSE): **448.09967549448817**

Root Mean Squared Error (RMSE): **21.16836496979604**

The r-squared score (R^2): **0.7021171286922909**


**Color Included**:

Mean Absolute Error (MAE): 12.952263083451204

Mean Squared Error (MSE): 555.1038060438698

Root Mean Squared Error (RMSE): 23.560641036352763

The r-squared score (R^2): 0.6309840763983783

## Random Forest

In [211]:
X

array([[ 0. ,  0. ,  0. , ...,  1. ,  0. , 10. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  9.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. , 11. ],
       ...,
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  8. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5]])

In [212]:
y

array([[170.],
       [180.],
       [ 75.],
       ...,
       [ 70.],
       [ 55.],
       [170.]])

Splitting the dataset into a **Training** and **Test** Set

In [219]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

**Training** the Model

In [220]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X_train, y_train)

  regressor.fit(X_train, y_train)


RandomForestRegressor(n_estimators=10, random_state=0)

**Predicting** the Test Set Values

In [221]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))


[[116.33 100.  ]
 [171.25 220.  ]
 [ 72.5   80.  ]
 [159.   160.  ]
 [ 91.   110.  ]
 [ 93.    90.  ]
 [152.   120.  ]
 [ 60.5   75.  ]
 [132.5  130.  ]
 [134.5  140.  ]
 [126.5  160.  ]
 [101.    90.  ]
 [ 69.5   70.  ]
 [152.   120.  ]
 [ 82.08  60.  ]
 [ 50.    55.  ]
 [ 82.17  80.  ]
 [ 64.9   60.  ]
 [ 90.   110.  ]
 [167.25 120.  ]
 [122.   170.  ]
 [ 59.1   65.  ]
 [ 70.    70.  ]
 [ 98.    90.  ]
 [109.33 160.  ]
 [ 70.    75.  ]
 [ 69.05  70.  ]
 [ 84.    80.  ]
 [ 79.    85.  ]
 [170.   180.  ]
 [160.   160.  ]
 [146.   120.  ]
 [ 85.   100.  ]
 [147.   130.  ]
 [133.   130.  ]
 [ 71.61  65.  ]
 [153.75 160.  ]
 [180.   180.  ]
 [153.75 150.  ]
 [116.33  90.  ]
 [ 68.67  70.  ]
 [128.   160.  ]
 [111.   120.  ]
 [130.   130.  ]
 [ 67.    65.  ]
 [134.5  130.  ]
 [ 67.    65.  ]
 [ 81.5   55.  ]
 [ 80.92  85.  ]
 [ 71.    70.  ]
 [172.5  175.  ]
 [110.   120.  ]
 [ 66.    60.  ]
 [157.   170.  ]
 [ 90.    90.  ]
 [ 94.    90.  ]
 [ 59.67  85.  ]
 [144.5   85.  ]
 [ 70.    85. 

Calculating Different **Accuracy** Measures

In [222]:

# Assuming y_test contains the actual values and y_pred contains the predicted values
mae = mean_absolute_error(y_test, y_pred[:len(y_test)])
mse = mean_squared_error(y_test, y_pred[:len(y_test)])
rmse = mean_squared_error(y_test, y_pred[:len(y_test)], squared=False)
r2 = r2_score(y_test, y_pred[:len(y_test)])



print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("The r-squared score (R^2):", r2)


Mean Absolute Error (MAE): 11.47055076221908
Mean Squared Error (MSE): 363.41243650037916
Root Mean Squared Error (RMSE): 19.06337946168987
The r-squared score (R^2): 0.7584145984167422


**Color Included**:

Mean Absolute Error (MAE): **11.47055076221908**

Mean Squared Error (MSE): **363.41243650037916**

Root Mean Squared Error (RMSE): **19.06337946168987**

The r-squared score (R^2): **0.7584145984167422**


**Color not Included**:

Mean Absolute Error (MAE): 11.883598423829131

Mean Squared Error (MSE): 377.250178795279

Root Mean Squared Error (RMSE): 19.422929202241328

The r-squared score (R^2): 0.7492156932787901

## Random Forest Model with Tuned Hyperparamters

In [216]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=140, random_state=0, max_depth=14, min_samples_split=11)
regressor.fit(X_train, y_train)

  regressor.fit(X_train, y_train)


RandomForestRegressor(max_depth=14, min_samples_split=11, n_estimators=140,
                      random_state=0)

In [217]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))


[[105.34 100.  ]
 [191.87 220.  ]
 [ 73.49  80.  ]
 [157.99 160.  ]
 [ 90.06 110.  ]
 [ 92.5   90.  ]
 [157.24 120.  ]
 [ 64.12  75.  ]
 [133.12 130.  ]
 [135.01 140.  ]
 [137.02 160.  ]
 [ 96.55  90.  ]
 [ 68.83  70.  ]
 [126.63 120.  ]
 [ 80.64  60.  ]
 [ 50.61  55.  ]
 [ 83.56  80.  ]
 [ 66.84  60.  ]
 [ 93.41 110.  ]
 [147.9  120.  ]
 [126.96 170.  ]
 [ 64.82  65.  ]
 [ 69.27  70.  ]
 [ 95.79  90.  ]
 [122.54 160.  ]
 [ 70.39  75.  ]
 [ 68.73  70.  ]
 [ 84.39  80.  ]
 [ 75.05  85.  ]
 [176.03 180.  ]
 [155.48 160.  ]
 [159.01 120.  ]
 [ 85.94 100.  ]
 [144.11 130.  ]
 [130.43 130.  ]
 [ 74.05  65.  ]
 [131.98 160.  ]
 [178.43 180.  ]
 [131.98 150.  ]
 [105.34  90.  ]
 [ 69.57  70.  ]
 [131.3  160.  ]
 [107.08 120.  ]
 [129.61 130.  ]
 [ 69.43  65.  ]
 [135.01 130.  ]
 [ 70.13  65.  ]
 [ 80.58  55.  ]
 [ 83.54  85.  ]
 [ 66.89  70.  ]
 [169.26 175.  ]
 [111.86 120.  ]
 [ 78.76  60.  ]
 [160.22 170.  ]
 [ 90.51  90.  ]
 [ 90.78  90.  ]
 [ 77.83  85.  ]
 [139.73  85.  ]
 [ 69.86  85. 

In [218]:

# Assuming y_test contains the actual values and y_pred contains the predicted values
mae = mean_absolute_error(y_test, y_pred[:len(y_test)])
mse = mean_squared_error(y_test, y_pred[:len(y_test)])
rmse = mean_squared_error(y_test, y_pred[:len(y_test)], squared=False)
r2 = r2_score(y_test, y_pred[:len(y_test)])



print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("The r-squared score (R^2):", r2)


Mean Absolute Error (MAE): 10.74473529082464
Mean Squared Error (MSE): 275.18088474111363
Root Mean Squared Error (RMSE): 16.588576935382783
The r-squared score (R^2): 0.8170682181699394


**Cross-Validation** to find the optimal values for each hyperparameter for **Random Forest**

In [206]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their respective values to be tested
hyperparameters = {
    'n_estimators': np.arange(10,150,10),
    'max_depth': np.arange(1, 25, 1),
    'min_samples_split': np.arange(1, 15, 1),
    'min_samples_leaf': np.arange(25,300,25)
}

# Create the SVR model
regressor = RandomForestRegressor()

# Perform grid search with cross-validation
grid_search = GridSearchCV(regressor, hyperparameters, cv=50)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_regressor = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Evaluate the model on the test set
y_pred = best_regressor.predict(X_test)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Best Hyperparameters: {'min_samples_split': 11}


## Support Vector Regression with Tuned Hyperparameters

**Grid Search Cross-Validation** to find the optimal **hyperparameter** values

In [145]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their respective values to be tested
hyperparameters = {
    'C': np.arange(1,15,1),
    'epsilon': np.arange(0.15, 0.25,0.01),
    'tol': np.arange(0.001, 0.025, 0.001),
    'max_iter': np.arange(25,300,25)
}

# Create the SVR model
regressor = SVR(kernel='rbf')

# Perform grid search with cross-validation
grid_search = GridSearchCV(regressor, hyperparameters, cv=50)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_regressor = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Evaluate the model on the test set
y_pred = best_regressor.predict(X_test)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Hyperparameters: {'tol': 0.022000000000000002}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [197]:
X

array([[ 0. ,  0. ,  0. , ...,  1. ,  0. , 10. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  9.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. , 11. ],
       ...,
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  8. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  7.5]])

In [198]:
y

array([[170.],
       [180.],
       [ 75.],
       ...,
       [ 70.],
       [ 55.],
       [170.]])

If the above cell prints out the **y** values **horizontally**, run the cell below

In [59]:
y = y.reshape(len(y), 1)
y

array([[170.],
       [180.],
       [ 75.],
       ...,
       [ 70.],
       [ 55.],
       [170.]])

Splitting the dataset into **Training** and **Test** Sets

In [199]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

**Scaling** X_train and y_test

In [200]:
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

**Training** the Model

In [201]:
from sklearn.svm import SVR

regressor = SVR(kernel='rbf', C=4, epsilon=0.17000000000000004, tol=0.024)
regressor.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=4, epsilon=0.17000000000000004, tol=0.024)

**Predicting** y_test values

In [202]:
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test)).reshape(-1,1))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))

[[106.43 100.  ]
 [153.9  220.  ]
 [ 89.19  80.  ]
 [154.17 160.  ]
 [106.51 110.  ]
 [ 96.78  90.  ]
 [161.11 120.  ]
 [ 62.63  75.  ]
 [131.9  130.  ]
 [133.44 140.  ]
 [129.36 160.  ]
 [ 99.67  90.  ]
 [ 73.01  70.  ]
 [127.78 120.  ]
 [ 76.72  60.  ]
 [ 56.47  55.  ]
 [ 88.24  80.  ]
 [ 75.1   60.  ]
 [ 97.83 110.  ]
 [128.31 120.  ]
 [136.   170.  ]
 [ 65.59  65.  ]
 [ 70.56  70.  ]
 [100.45  90.  ]
 [128.52 160.  ]
 [ 70.54  75.  ]
 [ 68.29  70.  ]
 [ 91.84  80.  ]
 [ 91.71  85.  ]
 [172.52 180.  ]
 [155.17 160.  ]
 [153.7  120.  ]
 [102.91 100.  ]
 [143.57 130.  ]
 [112.36 130.  ]
 [ 71.68  65.  ]
 [127.73 160.  ]
 [173.85 180.  ]
 [127.73 150.  ]
 [106.43  90.  ]
 [ 65.1   70.  ]
 [127.11 160.  ]
 [ 88.17 120.  ]
 [124.   130.  ]
 [ 68.23  65.  ]
 [133.44 130.  ]
 [ 70.22  65.  ]
 [ 88.38  55.  ]
 [ 85.09  85.  ]
 [ 69.59  70.  ]
 [151.95 175.  ]
 [127.25 120.  ]
 [ 76.13  60.  ]
 [140.01 170.  ]
 [ 92.9   90.  ]
 [105.1   90.  ]
 [ 80.97  85.  ]
 [129.79  85.  ]
 [106.95  85. 

In [203]:
# Assuming y_test contains the actual values and y_pred contains the predicted values
mae = mean_absolute_error(y_test, y_pred[:len(y_test)])
mse = mean_squared_error(y_test, y_pred[:len(y_test)])
rmse = mean_squared_error(y_test, y_pred[:len(y_test)], squared=False)
r2 = r2_score(y_test, y_pred[:len(y_test)])
mpe = np.mean((y_test-y_pred) / y_test) * 100


print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("The r-squared score (R^2):", r2)
print(f"Mean Percentage Error (MPE): {mpe:.2f}%")


Mean Absolute Error (MAE): 11.86246229300697
Mean Squared Error (MSE): 281.53772047004645
Root Mean Squared Error (RMSE): 16.7790858055511
The r-squared score (R^2): 0.8128423894471751
Mean Percentage Error (MPE): -2.52%
