**Issue#4 - Traversal of the space of cross-validation folds.**

Fix for #4: https://github.com/mozilla/PRESC/issues/4


**References:**
1. https://scikit-learn.org

In [10]:
# Silence FutureWarning and DeprecationWarning. The filters are installed
# before the third-party imports below so that they are already active
# while those packages are being imported.
import warnings

for _category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_category)

# Third-party: data handling and plotting
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Third-party: scikit-learn pieces used by the model cells
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Local helper shipped next to this notebook
from issue4_helper import cross_validation

# Display floats with 2 digits of precision in pandas output
pd.set_option("display.precision", 2)

In [11]:
# Load the raw vehicles dataset from its CSV file (relative path) into `df`.
vehicles_csv = "../../datasets/vehicles.csv"
df = pd.read_csv(vehicles_csv)

In [12]:
# Columns used as model inputs (features), one per line for easier diffing.
feature_cols = [
    'COMPACTNESS',
    'CIRCULARITY',
    'DISTANCE_CIRCULARITY',
    'RADIUS_RATIO',
    'PR.AXIS_ASPECT_RATIO',
    'MAX.LENGTH_ASPECT_RATIO',
    'SCATTER_RATIO',
    'ELONGATEDNESS',
    'PR.AXIS_RECTANGULARITY',
    'MAX.LENGTH_RECTANGULARITY',
    'SCALED_VARIANCE_MAJOR',
    'SCALED_VARIANCE_MINOR',
    'SCALED_RADIUS_OF_GYRATION',
    'SKEWNESS_ABOUT_MAJOR',
    'SKEWNESS_ABOUT_MINOR',
    'KURTOSIS_ABOUT_MAJOR',
]

# Column holding the target label; kept as a one-element list so that
# indexing a DataFrame with it yields a DataFrame rather than a Series.
label_col = ['Class']

**Removing Outlier Rows**

In [13]:
# Per-column maximum values, read off the boxplots, for the 8 columns in
# which outliers were identified. Values above a column's maximum are
# counted as outliers below.
outlier_upper_bounds = {
    'RADIUS_RATIO': 255,
    'PR.AXIS_ASPECT_RATIO': 77,
    'MAX.LENGTH_ASPECT_RATIO': 13,
    'SCALED_VARIANCE_MAJOR': 288,
    'SCALED_VARIANCE_MINOR': 980,
    'SKEWNESS_ABOUT_MAJOR': 88,
    'SKEWNESS_ABOUT_MINOR': 19,
    'KURTOSIS_ABOUT_MAJOR': 40,
}

# Single-row frame (row label 0) holding one threshold per column; the
# explicit `columns=` pins the column order regardless of pandas version.
df_columns_with_outliers = pd.DataFrame(
    [outlier_upper_bounds], columns=list(outlier_upper_bounds)
)

# Sum, over all listed columns, the number of values above that column's
# threshold. A row exceeding the threshold in several columns contributes
# once per column, so this is a count of outlying values, not of rows.
total_outliers = sum(
    int((df[name] > bound).sum()) for name, bound in outlier_upper_bounds.items()
)
print('Out of {} rows {} Outliers '.format(len(df.index), total_outliers))

Out of 846 rows 51 Outliers 


In [14]:
# Fill missing values with each column's mean. `fillna` returns a new frame,
# so the raw `df` is left untouched and this cell can be re-run safely.
# numeric_only=True restricts the means to numeric columns: 'Class' is only
# label-encoded further down (presumably it holds string labels at this
# point), and DataFrame.mean() raises TypeError on non-numeric columns with
# pandas >= 2.0.
df_filled = df.fillna(df.mean(numeric_only=True))

# Remove outliers based on the max values identified earlier from the boxplots.
# A row is kept only if it is strictly below the threshold in every listed
# column (same result as filtering column by column).
# NOTE(review): the strict `<` also drops rows that are exactly equal to a
# threshold, while the outlier count in the earlier cell uses `>` and does not
# count those values -- confirm which boundary is intended.
upper_bounds = df_columns_with_outliers.iloc[0]
within_bounds = (
    df_filled[upper_bounds.index]
    .lt(upper_bounds, axis='columns')
    .all(axis=1)
)

# Keep the in-bounds rows, replace the class labels with their integer
# category codes, and reset the index after cleaning the outliers.
# `.assign` builds a new frame, so no column is written onto a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning).
df_new = (
    df_filled.loc[within_bounds]
    .assign(Class=lambda frame: pd.Categorical(frame['Class']).codes)
    .reset_index(drop=True)
)

**Train/Test Split Testing**

We will train the model using different train/test split ratios and observe how the accuracy varies. The goal is to check whether the split ratio has any relationship with the performance (accuracy) of the model.




In [15]:
from sklearn.model_selection import StratifiedKFold  # not used in this cell; kept in case later cells rely on it

# Hyper-parameter grid explored by the grid search (RBF kernel only).
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# Min-max scale the feature columns. `fit_transform` returns a NumPy array,
# so from here on `X_new` is an array, not a DataFrame.
# NOTE(review): the scaler is fitted on the complete dataset before any
# train/test split takes place, so rows that later end up in a test set
# influence the scaling (possible data leakage). Consider moving the scaler
# into a Pipeline together with the SVC -- confirm how `cross_validation`
# splits the data first.
scaler_sc = MinMaxScaler()
X_new = scaler_sc.fit_transform(df_new[feature_cols])
y_new = df_new[label_col]

# Estimator - SVM tuned by grid search over `param_grid`.
# verbose=0 disables the console messages directly at construction time
# (instead of building the object with verbose=3 and overwriting the
# attribute afterwards).
estimator_svm = GridSearchCV(SVC(), param_grid, refit=True, verbose=0)

# Run the test and print the results.
# The call is active (not commented out) so that a fresh "Restart & Run All"
# reproduces the results table stored below this cell.
# `y_new` is a one-column DataFrame; ravel() flattens it to the 1-D label array.
# NOTE(review): presumably `cross_validation` prints the table itself and also
# returns it -- verify against issue4_helper.
result_df = cross_validation(estimator_svm, X_new, y_new.values.ravel())


    Ratio(Train Data)  Ratio(Test Data)  Accuracy  F1-Score
0                0.90              0.10      0.73      0.73
1                0.85              0.15      0.74      0.74
2                0.80              0.20      0.78      0.77
3                0.75              0.25      0.79      0.79
4                0.70              0.30      0.83      0.82
5                0.65              0.35      0.82      0.81
6                0.60              0.40      0.78      0.78
7                0.55              0.45      0.79      0.79
8                0.50              0.50      0.77      0.76
9                0.45              0.55      0.76      0.75
10               0.40              0.60      0.75      0.75
11               0.35              0.65      0.76      0.75
12               0.30              0.70      0.75      0.75
13               0.25              0.75      0.74      0.73
14               0.20              0.80      0.68      0.59
15               0.15              0.85 