# Feature Engineering

### Importing libraries

In [25]:
import numpy as np
import cv2
import matplotlib
from matplotlib import pyplot as plt
import os
import pywt
import pandas as pd
%matplotlib inline


# importing libraries required for model building and 

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# importing a library for visualization

import seaborn as sn

# importing libraries to save the ML model and create JSON files

import joblib 
import json


### Creating a list of all folder names

In [2]:
def list_folders(directory):
    folders = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if os.path.isdir(full_path):
            folders.append(entry)
    return folders

# Specify the directory you want to list folders from
directory_path = 'cropped_images/'

# Get the list of folder names
folder_names = list_folders(directory_path)

# folder_names

### Creating a dictionary of player name with all the image paths

In [3]:

# Creating a list of folder names of cropped images

cropped_folder_list = folder_names

folder_path_dict = {'Name':[],
                    'Paths':[]}



for folder_name in cropped_folder_list:
    
    list_1 = []

    folder_path = 'cropped_images'+'/{}'.format(folder_name)

    # Get a list of all files and directories in the specified folder
    files_and_directories = os.listdir(folder_path)

    # Filter only files (not directories) if needed
    files = [f for f in files_and_directories if os.path.isfile(os.path.join(folder_path, f))]

    # Now 'files' contains a list of file names in the folder
    # You can also get the full paths by using os.path.join() on each file name
    file_paths = [os.path.join(folder_path, f) for f in files]
    
    # changing the folder name into a proper name to be used as the dictionary key
    
    original_string = folder_name

    # Split the string based on underscores
    parts = original_string.split('_')

    # Join the first two parts with spaces
    clean_name = ' '.join(parts[:2])


    # Print the file paths
    for file_path in file_paths:
        list_1.append(file_path)
    
    folder_path_dict['Name'].append(clean_name)
    folder_path_dict['Paths'].append(list_1)
    
# folder_path_dict

In [4]:
# check

folder_path_dict['Name']

['.ipynb checkpoints',
 'alex albon',
 'carlos sainz',
 'charles leclerc',
 'daniel ricciardo',
 'esteban ocon',
 'fernando alonso',
 'george russell',
 'kevin magnussen',
 'lance stroll',
 'lando norris',
 'lewis hamilton',
 'logan sargeant',
 'max verstappen',
 'nico hulkenberg',
 'oscar piastri',
 'pierre gasly',
 'sergio perez',
 'valtteri bottas',
 'yuki tsunoda',
 'zhou guanyu']

In [5]:
# folder_path_dict['Paths']

### Converting racer names into numeric classes

In [6]:
class_dict = {}

count = 0
for racer_name in folder_path_dict['Name']:
    class_dict[racer_name] = count
    count = count + 1
    
class_dict

{'.ipynb checkpoints': 0,
 'alex albon': 1,
 'carlos sainz': 2,
 'charles leclerc': 3,
 'daniel ricciardo': 4,
 'esteban ocon': 5,
 'fernando alonso': 6,
 'george russell': 7,
 'kevin magnussen': 8,
 'lance stroll': 9,
 'lando norris': 10,
 'lewis hamilton': 11,
 'logan sargeant': 12,
 'max verstappen': 13,
 'nico hulkenberg': 14,
 'oscar piastri': 15,
 'pierre gasly': 16,
 'sergio perez': 17,
 'valtteri bottas': 18,
 'yuki tsunoda': 19,
 'zhou guanyu': 20}

### Creating a function for wavelet transforming

In [7]:
def w2d(img, mode='haar', level=1):
    imArray = img
    #Datatype conversions
    #convert to grayscale
    imArray = cv2.cvtColor( imArray,cv2.COLOR_RGB2GRAY )
    #convert to float
    imArray =  np.float32(imArray)   
    imArray /= 255;
    # compute coefficients 
    coeffs=pywt.wavedec2(imArray, mode, level=level)

    #Process Coefficients
    coeffs_H=list(coeffs)  
    coeffs_H[0] *= 0;  

    # reconstruction
    imArray_H=pywt.waverec2(coeffs_H, mode);
    imArray_H *= 255;
    imArray_H =  np.uint8(imArray_H)

    return imArray_H


# Model Improvement Section

- Raw image resized to 32x32 (m_1)
- Raw image resized to 64x64 (m_2)
- Histogram equalized image at 64x64 (m_3)
- Raw image resized at 64x64 + Histogram equalized image at 64x64 (vertically stacked) (m_4)

### Creating the X and y variables for model training

In [8]:

X, y = [], []
for racer_name, training_files in zip(folder_path_dict['Name'],folder_path_dict['Paths']):
    for training_image in training_files: 
        # print(racer_name)
        # print(training_image)
        img = cv2.imread(training_image)
        if img is None:
            continue
        # m_1
        scaled_raw_img = cv2.resize(img,(32,32))
        
        # m_2
        # scaled_raw_img = cv2.resize(img,(64,64))
        
        # m_3
        # img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # equ = cv2.equalizeHist(img_gray)
        # scaled_img_hist = cv2.resize(equ, (64, 64))
        
        # m_4
        # combined_img = np.vstack((scaled_raw_img.reshape(64*64*3,1),scaled_img_hist.reshape(64*64,1)))
        
        
        # reshaped final image
        
        # m_1
        final_img = scaled_raw_img.reshape(32*32*3,1)
        
        # m_2
        # final_img = scaled_raw_img.reshape(64*64*3,1) 
        
         # m_3
        # final_img = scaled_img_hist.reshape(64*64,1)
        
        # m_4
        
        # final_img = combined_img.reshape(64*64*4,1)
        
        
        X.append(final_img)
        y.append(class_dict[racer_name]) 
        

### Reshaping X and also updating the numbers to be as float

In [10]:
X = np.array(X).reshape(len(X),3072).astype(float)
X.shape

(1895, 3072)

### <font color=yellow> Explain the shape of the model input (X) and y </font>

## 1. Raw image resized to 32x32

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe_1 = Pipeline([('scaler', StandardScaler()), # Step 1: Standardize the data
                 ('svc', SVC(kernel = 'rbf', C = 10)) # Step 2: Train a model
                ])

pipe_1.fit(X_train, y_train)
m_1 = pipe_1.score(X_test, y_test)
print(m_1)

0.5759493670886076


## 2. Raw image resized to 64x64 (m_2)

In [13]:
X, y = [], []
for racer_name, training_files in zip(folder_path_dict['Name'],folder_path_dict['Paths']):
    for training_image in training_files: 
        # print(racer_name)
        # print(training_image)
        img = cv2.imread(training_image)
        if img is None:
            continue
        # m_1
        # scaled_raw_img = cv2.resize(img,(32,32))
        
        # m_2
        scaled_raw_img = cv2.resize(img,(64,64))
        
        # m_3
        # img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # equ = cv2.equalizeHist(img_gray)
        # scaled_img_hist = cv2.resize(equ, (64, 64))
        
        # m_4
        # combined_img = np.vstack((scaled_raw_img.reshape(64*64*3,1),scaled_img_hist.reshape(64*64,1)))
        
        
        # reshaped final image
        
        # m_1
        # final_img = scaled_raw_img.reshape(32*32*3,1)
        
        # m_2
        final_img = scaled_raw_img.reshape(64*64*3,1) 
        
         # m_3
        # final_img = scaled_img_hist.reshape(64*64,1)
        
        # m_4
        
        # final_img = combined_img.reshape(64*64*4,1)
        
        
        X.append(final_img)
        y.append(class_dict[racer_name]) 
        

In [15]:
X = np.array(X).reshape(len(X),12288).astype(float)
X.shape

(1895, 12288)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe_2 = Pipeline([('scaler', StandardScaler()), # Step 1: Standardize the data
                 ('svc', SVC(kernel = 'rbf', C = 10)) # Step 2: Train a model
                ])

pipe_2.fit(X_train, y_train)
m_2 = pipe_2.score(X_test, y_test)
print(m_2)

0.5780590717299579


## 3. Histogram equalized image at 64x64 (m_3)

In [17]:
X, y = [], []
for racer_name, training_files in zip(folder_path_dict['Name'],folder_path_dict['Paths']):
    for training_image in training_files: 
        # print(racer_name)
        # print(training_image)
        img = cv2.imread(training_image)
        if img is None:
            continue
        # m_1
        # scaled_raw_img = cv2.resize(img,(32,32))
        
        # m_2
        # scaled_raw_img = cv2.resize(img,(64,64))
        
        # m_3
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        equ = cv2.equalizeHist(img_gray)
        scaled_img_hist = cv2.resize(equ, (64, 64))
        
        # m_4
        # combined_img = np.vstack((scaled_raw_img.reshape(64*64*3,1),scaled_img_hist.reshape(64*64,1)))
        
        
        # reshaped final image
        
        # m_1
        # final_img = scaled_raw_img.reshape(32*32*3,1)
        
        # m_2
        # final_img = scaled_raw_img.reshape(64*64*3,1) 
        
         # m_3
        final_img = scaled_img_hist.reshape(64*64,1)
        
        # m_4
        
        # final_img = combined_img.reshape(64*64*4,1)
        
        
        X.append(final_img)
        y.append(class_dict[racer_name]) 

In [18]:
X = np.array(X).reshape(len(X),4096).astype(float)
X.shape

(1895, 4096)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe_3 = Pipeline([('scaler', StandardScaler()), # Step 1: Standardize the data
                 ('svc', SVC(kernel = 'rbf', C = 10)) # Step 2: Train a model
                ])

pipe_3.fit(X_train, y_train)
m_3 = pipe_3.score(X_test, y_test)
print(m_3)

0.5864978902953587


## 4. Raw image resized at 64x64 + Histogram equalized image at 64x64 (vertically stacked) (m_4)

In [21]:
X, y = [], []
for racer_name, training_files in zip(folder_path_dict['Name'],folder_path_dict['Paths']):
    for training_image in training_files: 
        # print(racer_name)
        # print(training_image)
        img = cv2.imread(training_image)
        if img is None:
            continue
        # m_1
        # scaled_raw_img = cv2.resize(img,(32,32))
        
        # m_2
        # scaled_raw_img = cv2.resize(img,(64,64))
        
        # m_3
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        equ = cv2.equalizeHist(img_gray)
        scaled_img_hist = cv2.resize(equ, (64, 64))
        
        # m_4
        combined_img = np.vstack((scaled_raw_img.reshape(64*64*3,1),scaled_img_hist.reshape(64*64,1)))
        
        
        # reshaped final image
        
        # m_1
        # final_img = scaled_raw_img.reshape(32*32*3,1)
        
        # m_2
        # final_img = scaled_raw_img.reshape(64*64*3,1) 
        
         # m_3
        # final_img = scaled_img_hist.reshape(64*64,1)
        
        # m_4
        
        final_img = combined_img.reshape(64*64*4,1)
        
        
        X.append(final_img)
        y.append(class_dict[racer_name]) 

In [22]:
X = np.array(X).reshape(len(X),16384).astype(float)
X.shape

(1895, 16384)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe_4 = Pipeline([('scaler', StandardScaler()), # Step 1: Standardize the data
                 ('svc', SVC(kernel = 'rbf', C = 10)) # Step 2: Train a model
                ])

pipe_4.fit(X_train, y_train)
m_4 = pipe_4.score(X_test, y_test)

In [26]:
## Creating a summary table of all the methods used and their performance

methods = ["m_1", "m_2", "m_3","m_4"]
prediction_scores = [m_1, m_2, m_3, m_4]

# Create a DataFrame
df = pd.DataFrame({"Method": methods, "Prediction Scores": prediction_scores})

# Display the DataFrame
print(df)

  Method  Prediction Scores
0    m_1           0.575949
1    m_2           0.578059
2    m_3           0.586498
3    m_4           0.586498


## Create a classification report that provides precision, recall, and F1-score

In [27]:
print(classification_report(y_test, pipe_4.predict(X_test)))

              precision    recall  f1-score   support

           1       0.57      0.63      0.60        38
           2       0.65      0.57      0.61        30
           3       0.62      0.62      0.62        29
           4       0.67      0.53      0.59        34
           5       0.67      0.58      0.62        24
           6       0.50      0.38      0.43        24
           7       0.50      0.56      0.53        25
           8       0.58      0.78      0.67        18
           9       0.44      0.74      0.55        23
          10       0.59      0.61      0.60        31
          11       0.45      0.56      0.50        18
          12       0.71      0.50      0.59        30
          13       0.75      0.43      0.55        14
          14       0.71      0.67      0.69        18
          15       0.56      0.92      0.70        24
          16       0.78      0.50      0.61        28
          17       0.42      0.53      0.47        19
          18       0.67    

In [None]:
# seems like the best we can get is 59% accuracy 
# we might have to explore CNN 

# - https://www.youtube.com/watch?v=7HPwo4wnJeA
# - https://www.codemag.com/Article/2205081/Implementing-Face-Recognition-Using-Deep-Learning-and-Support-Vector-Machines
# - https://thinkingneuron.com/face-recognition-using-deep-learning-cnn-in-python/


In [None]:
break

### Hyperparameter Tuning (SVM vs Random Forest vs Logistic Regression)

In [None]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto',probability=True),
        'params' : {
            'svc__C': [1,10,100,1000],
            'svc__kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'randomforestclassifier__n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'logisticregression__C': [1,5,10]
        }
    }
}

### Creating a dataframe to compare the best performing model

In [None]:
scores = []
best_estimators = {}
import pandas as pd
for algo, mp in model_params.items():
    pipe = make_pipeline(StandardScaler(), mp['model'])
    clf =  GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': algo,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    best_estimators[algo] = clf.best_estimator_
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

In [None]:
best_estimators

### Validation set vs Test data set

![image.png](attachment:720a985d-1abb-4f4b-9d49-08539c3982d5.png)

![image.png](attachment:245fcbb4-8aa1-4125-a8e1-7b555cf22fc3.png)

In [None]:
break

### Testing the model performance with the test data set (not the validation data set as done in the hyperparemeter tuning step)

In [None]:
best_estimators['svm'].score(X_test,y_test)

In [None]:
best_estimators['random_forest'].score(X_test,y_test)

In [None]:
best_estimators['logistic_regression'].score(X_test,y_test)

### Choosing the best performing model

In [None]:


best_clf = best_estimators['svm']


In [None]:
best_clf.predict(X_test)

In [None]:
X_test[5]


In [None]:
best_clf.predict(X_test[5].reshape(1,-1))

### Plotting the confusion matrix

In [None]:


from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, best_clf.predict(X_test))
cm



In [None]:

plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
class_dict

### Saving the model as pickle file  and create a class dictionary as a JSON file

In [None]:
# Save the model as a pickle in a file 
import pickle

pickle.dump(best_clf, open('saved_model.pkl', 'wb'))

In [None]:
with open("class_dictionary.json","w") as f:
    f.write(json.dumps(class_dict))

