In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
sns.set()

  from ._conv import register_converters as _register_converters


In [9]:
# Scores of the models tried previously; the two tuples are paired by
# position (scores[i] belongs to models[i]).
scores = (
    0.7810, 0.63, 0.763, 0.828,
    0.702, 0.837, 0.830, 0.802,
)
models = (
    'k-NN', 'decision tree', 'random forest', 'svm linear',
    'svm rbf', 'logistic', 'fc RNN', 'CNN',
)

In [12]:
# Tabulate the earlier results: one row per model, with its score.
# Linear SVM, logistic regression and the RNN come out closest to the top.
# Logistic regression is chosen because it has the best score and is also
# the simpler model.
results_df = pd.DataFrame(
    list(zip(models, scores)),
    columns=['Models', 'Scores'],
)
results_df

Unnamed: 0,Models,Scores
0,k-NN,0.781
1,decision tree,0.63
2,random forest,0.763
3,svm linear,0.828
4,svm rbf,0.702
5,logistic,0.837
6,fc RNN,0.83
7,CNN,0.802


In [19]:
# Load the train and test archives into plain dicts (array name -> array),
# reading every array while the file is still open.
with np.load('cifar4-train.npz', allow_pickle=False) as train_archive:
    cifar = {name: train_archive[name] for name in train_archive.files}
with np.load('cifar4-test.npz', allow_pickle=False) as test_archive:
    cifar_test = {name: test_archive[name] for name in test_archive.files}

# Training features and labels
X, y = cifar['overfeat'], cifar['labels']

# Test features (only the features are read from the test archive)
X_test = cifar_test['overfeat']

In [14]:
# Re-tuning the logistic regression on all the training data:
# cross-validate it with and without a PCA step and keep the best setting.

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ParameterGrid, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

Logreg = SGDClassifier(
    # Set logistic loss
    loss='log',
    # Set max number of iterations and stopping criteria
    max_iter=1000, tol=1e-3,
    n_jobs=-1,  # to use all cores
    # Fixed seed so the search gives the same numbers on every run
    random_state=0,
)

# Seeded as well, in case a randomized SVD solver gets selected
pca = PCA(random_state=0)

# First grid: pipeline WITH PCA, tuning alpha and the number of components
grid = ParameterGrid({
    'logreg__alpha': [0.00001, 0.0001, 0.0005, 0.001],
    'pca__n_components': np.arange(350, 450, 20),
})

# Second grid: PCA step disabled (set to None), tuning alpha only
grid1 = ParameterGrid({
    'pca': [None],
    'logreg__alpha': [0.00001, 0.0001, 0.0005, 0.001],
})

# The pipeline with the PCA as a step
pipe = Pipeline([
    ('pca', pca),
    ('logreg', Logreg),
])


def _cross_validate_grid(pipeline, param_grid, features, labels, folds=5):
    """Cross-validate `pipeline` once per parameter combination.

    Parameters
    ----------
    pipeline : estimator exposing `set_params`; it is reconfigured in place
        for every combination.
    param_grid : iterable of dicts mapping parameter names to values.
    features, labels : data forwarded to `cross_validate`.
    folds : number of cross-validation folds (default 5).

    Returns
    -------
    list of dict
        One record per combination: the parameters plus the keys
        'mean val accuracy' and 'std of val accuracy' computed over the folds.
    """
    records = []
    for params in param_grid:
        pipeline.set_params(**params)
        # n_jobs=-1 runs the folds on all processors
        cv_results = cross_validate(pipeline, features, labels, cv=folds,
                                    n_jobs=-1, return_train_score=False)
        fold_scores = cv_results['test_score']
        record = dict(params)
        record['mean val accuracy'] = fold_scores.mean()
        record['std of val accuracy'] = fold_scores.std()
        records.append(record)
    return records


def _rank_by_accuracy(records):
    """Return the records as a DataFrame sorted by mean accuracy, best row at index 0."""
    ranked = pd.DataFrame(records)
    ranked = ranked.sort_values(by='mean val accuracy', ascending=False)
    return ranked.reset_index(drop=True)


# Grid search with the PCA step first, then without it (same order as before,
# since the second grid switches the 'pca' step of the shared pipeline off)
val_scores_pca = _cross_validate_grid(pipe, grid, X, y)
val_scores = _cross_validate_grid(pipe, grid1, X, y)

# DataFrames with the hyperparameters and the results, best first
scores_df = _rank_by_accuracy(val_scores)
scores_df_pca = _rank_by_accuracy(val_scores_pca)

# Pick the overall winner and summarise it in a dict.
# 'pca' holds the number of components, or None when no PCA is used
# (a bare None, not [None], so it can be handed to PCA/Pipeline directly).
top_plain_accuracy = scores_df.loc[0, 'mean val accuracy']
top_pca_accuracy = scores_df_pca.loc[0, 'mean val accuracy']

if top_plain_accuracy > top_pca_accuracy:
    best_dict = {
        'pca': None,
        'alpha': scores_df.loc[0, 'logreg__alpha'],
        'accuracy': top_plain_accuracy,
        'std': scores_df.loc[0, 'std of val accuracy'],
    }
else:
    best_dict = {
        # plain int rather than a NumPy integer
        'pca': int(scores_df_pca.loc[0, 'pca__n_components']),
        'alpha': scores_df_pca.loc[0, 'logreg__alpha'],
        'accuracy': top_pca_accuracy,
        'std': scores_df_pca.loc[0, 'std of val accuracy'],
    }


In [20]:
# Fitting the best parameters on the full training set

# best_dict['pca'] is the selected number of PCA components, or None when the
# model without PCA scored best. A one-element list/tuple wrapper (e.g. [None])
# is unwrapped so both forms are handled.
n_components = best_dict['pca']
if isinstance(n_components, (list, tuple)):
    n_components = n_components[0]

Logreg = SGDClassifier(
    # Set logistic loss
    loss='log',
    max_iter=1000, tol=1e-3, n_jobs=-1, alpha=best_dict['alpha'],
    # Fixed seed so the final fit is reproducible
    random_state=0,
)

# Without a component count the PCA step is switched off (step set to None),
# so the refit model matches the configuration that was cross-validated;
# PCA(n_components=None) would instead keep and rotate every component.
if n_components is None:
    pca = None
else:
    pca = PCA(n_components=n_components, random_state=0)

pipe_best = Pipeline([
    ('pca', pca),
    ('logreg', Logreg),
])
# fitting the final pipeline on all the training data
pipe_best.fit(X, y)

# printing the best cross-validation results
print('Logistic Regression - top ccuracy across folds:',
      '{:.4f} (std: {:.4f}) with {} components and alpha {}'.format(
          best_dict['accuracy'],
          best_dict['std'],
          n_components,
          best_dict['alpha']
          ))


Logistic Regression - top ccuracy across folds: 0.8384 (std: 0.0094) with 390 components and alpha 0.0005


In [23]:
# Predict labels for the held-out test features. The original predicted on the
# training matrix X, so the saved "test" predictions were training predictions.
preds = pipe_best.predict(X_test)

# np.save appends the '.npy' extension, so this writes 'test-predictions.npy'
np.save('test-predictions', arr=preds, allow_pickle=False)