### Shoe Size Prediction
### This notebook is design to use machine learning to predict the shoe size of individual using their Height and Weight 

### Prepare the tools

In [40]:
# Regular EDA (exploratory data analysis) and plotting libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

# Models for Scikit-learn 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Model Evaluation 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import RocCurveDisplay

In [13]:
# inport the training and validation sets 
df = pd.read_csv("data/shoe size.csv")
df.head()

Unnamed: 0,Hight,Weight,Shoe size,Gender
0,180,79.0,42,M
1,165,65.0,41,M
2,178,72.0,42,M
3,160,53.0,43,M
4,182,78.0,36,M


In [14]:
df.shape

(109, 4)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Hight      109 non-null    int64  
 1   Weight     109 non-null    float64
 2   Shoe size  109 non-null    int64  
 3   Gender     109 non-null    object 
dtypes: float64(1), int64(2), object(1)
memory usage: 3.5+ KB


In [16]:
df.describe()

Unnamed: 0,Hight,Weight,Shoe size
count,109.0,109.0,109.0
mean,173.40367,95.887156,40.743119
std,13.581114,30.423234,1.771187
min,140.0,49.0,36.0
25%,164.0,68.0,39.0
50%,174.0,95.0,41.0
75%,185.0,118.0,42.0
max,197.0,159.0,44.0


In [17]:
df.isna().sum()

Hight        0
Weight       0
Shoe size    0
Gender       0
dtype: int64

In [18]:
df.Gender.value_counts()

Gender
M    60
F    49
Name: count, dtype: int64

In [20]:
X = df.drop("Shoe size", axis=1)
y = df["Shoe size"]

In [21]:
y.head()

0    42
1    41
2    42
3    43
4    36
Name: Shoe size, dtype: int64

In [22]:
X.head()

Unnamed: 0,Hight,Weight,Gender
0,180,79.0,M
1,165,65.0,M
2,178,72.0,M
3,160,53.0,M
4,182,78.0,M


In [30]:
X.shape

(109, 3)

In [24]:
# split data into train and test set 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

In [31]:
X_train.shape

(87, 3)

In [32]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

categorical_feature = ["Gender"] 
one_hot = OneHotEncoder() 
transformer = ColumnTransformer([("one_hot", 
                                  one_hot, 
                                 categorical_feature)], 
                               remainder="passthrough") 
X_train_trans = transformer.fit_transform(X_train)
X_test_trans = transformer.transform(X_test)

In [34]:
X_test_trans

array([[  0.,   1., 172.,  92.],
       [  0.,   1., 186., 118.],
       [  0.,   1., 163., 159.],
       [  0.,   1., 169., 103.],
       [  0.,   1., 165.,  65.],
       [  1.,   0., 175., 120.],
       [  0.,   1., 190.,  95.],
       [  0.,   1., 184.,  66.],
       [  1.,   0., 168.,  68.],
       [  0.,   1., 190.,  83.],
       [  1.,   0., 165.,  59.],
       [  1.,   0., 195., 104.],
       [  0.,   1., 193.,  54.],
       [  1.,   0., 159.,  80.],
       [  0.,   1., 172.,  55.],
       [  1.,   0., 188., 122.],
       [  0.,   1., 178.,  52.],
       [  0.,   1., 187.,  62.],
       [  0.,   1., 164.,  75.],
       [  0.,   1., 181., 111.],
       [  1.,   0., 149., 108.],
       [  0.,   1., 172., 139.]])

In [37]:
# Put model into dictionary 
np.random.seed(42)
models = {"RandomForest": RandomForestRegressor(),
         "LinearRegression": LinearRegression(), 
         "KNN": KNeighborsRegressor(),
         "GBR": GradientBoostingRegressor(), 
         "SVR": SVR()}

In [41]:
# Create a function to fit and score model 
def fit_and_score(models, X_train, y_train, X_test, y_test):
    # Create an empty dictionary
    model_scores = {}

    # create a random seed 
    np.random.seed(42) 

    # fit and evaluate the models 
    for model_name, model in models.items(): 
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Create an accuracy 
        r2 = model.score(X_test, y_test)
        model_scores[model_name] = r2 
    
        print(f"Model: {model_name}")
        print(f"Cofficient of Determination: {r2:.2f}")
        print("Mean squared Error:")
        print(mean_squared_error(y_test, y_pred))
        print("Root Mean Squared Error:")
        print(root_mean_squared_error(y_test, y_pred))
        print("\n")

    return model_scores

model_scores = fit_and_score(models = models, 
                            X_train = X_train_trans, 
                            y_train = y_train, 
                            X_test = X_test_trans, 
                            y_test = y_test)

Model: RandomForest
Cofficient of Determination: 0.29
Mean squared Error:
1.392881238636364
Root Mean Squared Error:
1.180203897060319


Model: LinearRegression
Cofficient of Determination: 0.31
Mean squared Error:
1.3534638716854637
Root Mean Squared Error:
1.1633846619607222


Model: KNN
Cofficient of Determination: 0.25
Mean squared Error:
1.4745454545454557
Root Mean Squared Error:
1.2143086323276533


Model: GBR
Cofficient of Determination: -0.10
Mean squared Error:
2.149117367951472
Root Mean Squared Error:
1.4659868239351512


Model: SVR
Cofficient of Determination: 0.23
Mean squared Error:
1.499628851153851
Root Mean Squared Error:
1.2245933411356813




### Cross Validation the dataset

In [45]:
# Cross-validation settings
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation

# Metrics
scoring = {
    "R2": make_scorer(r2_score),
    "MSE": make_scorer(mean_squared_error)
}

# Perform cross-validation
results = {}
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    cv_r2 = cross_val_score(model, X_train_trans, y_train, cv=kf, scoring="r2")
    cv_mse = -cross_val_score(model, X_train_trans, y_train, cv=kf, scoring="neg_mean_squared_error")
    results[model_name] = {
        "R2 Mean": np.mean(cv_r2),
        "MSE Mean": np.mean(cv_mse),
        "R2 Std": np.std(cv_r2),
        "MSE Std": np.std(cv_mse)
    }

# Display results
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    print(f"R2 Mean: {metrics['R2 Mean']:.2f} ± {metrics['R2 Std']:.2f}")
    print(f"MSE Mean: {metrics['MSE Mean']:.2f} ± {metrics['MSE Std']:.2f}")


Evaluating RandomForest...
Evaluating LinearRegression...
Evaluating KNN...
Evaluating GBR...
Evaluating SVR...

Model: RandomForest
R2 Mean: 0.17 ± 0.23
MSE Mean: 2.71 ± 0.82

Model: LinearRegression
R2 Mean: 0.27 ± 0.16
MSE Mean: 2.35 ± 0.72

Model: KNN
R2 Mean: -0.01 ± 0.20
MSE Mean: 3.18 ± 0.58

Model: GBR
R2 Mean: 0.05 ± 0.22
MSE Mean: 3.01 ± 0.59

Model: SVR
R2 Mean: 0.17 ± 0.09
MSE Mean: 2.78 ± 1.11
