In [25]:
from sklearn import svm
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
import pandas as pd
from sklearn import ensemble

In [26]:
def convert_sample(image):
    image = tf.image.rgb_to_grayscale(image)
    image = tf.image.resize(image,[72,72]).numpy()
    image = image.reshape(1,-1)
    return image

In [27]:
X = np.load('Xtrain.npy')
print(X.shape)
X = np.vstack(list(map(convert_sample,X)))
X = StandardScaler(with_mean=0, with_std=1).fit_transform(X)
print(f'Shape of training data features (observations,features): {X.shape}')

y = np.load('ytrain.npy')
y = y.reshape(-1,)    
print(f'Shape of training data labels (observations,): {y.shape}')

Xtest = np.load('Xtest.npy')
Xtest = np.vstack(list(map(convert_sample,Xtest)))
Xtest = StandardScaler(with_mean=0, with_std=1).fit_transform(Xtest)
print(f'Shape of training data features (observations,features): {Xtest.shape}')

(26214, 96, 96, 3)




Shape of training data features (observations,features): (26214, 5184)
Shape of training data labels (observations,): (26214,)
Shape of training data features (observations,features): (1638, 5184)




In [28]:
X_train, X_val, y_train, y_val = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [29]:
subset_train = np.random.choice(X_train.shape[0], 1500, replace=False)
subset_val = np.random.choice(X_val.shape[0], 500, replace=False)

X_train_sub = X_train[subset_train]
y_train_sub = y_train[subset_train]

X_val_sub = X_val[subset_val]
y_val_sub = y_val[subset_val]


In [30]:
kernels = ['linear', 'poly', 'rbf'] 
Cs = [0.1, 1.0, 2.5, 5.0, 10.0, 100.0] 
Ds = [1,2,3,4,5,6]
gamma = [0.1, 1, 10]


results_svm = []

for kernel in kernels:
    if kernel == "linear":
        for C in Cs:
                print(f"Training SVM with kernel = {kernel} and C = {C}...")
                svm_current = svm.SVC(kernel=kernel, C=C, gamma="scale")
                svm_current.fit(X_train_sub, y_train_sub)
                y_val_hat = svm_current.predict(X_val_sub)
                accuracy = accuracy_score(y_val_hat, y_val_sub)

                results_svm.append([accuracy, kernel, C])

    elif kernel == "rbf": 
        for C in Cs:
            for g in gamma: 
                print(f"Training SVM with kernel = {kernel}, gamma = {g} and C = {C}...")
                svm_current = svm.SVC(kernel=kernel, C=C, gamma=g)
                svm_current.fit(X_train_sub, y_train_sub)
                y_val_hat = svm_current.predict(X_val_sub)
                accuracy = accuracy_score(y_val_hat, y_val_sub)

                results_svm.append([accuracy, kernel, C, g])


    elif kernel == "poly": 
         for C in Cs: 
              for d in Ds: 
                for g in gamma: 
                    print(f"Training SVM with kernel = {kernel} degree = {d}, gamma = {g} and C = {C}...")
                    svm_current = svm.SVC(kernel=kernel, C=C, degree=d, gamma=g)
                    svm_current.fit(X_train_sub, y_train_sub)
                    y_val_hat = svm_current.predict(X_val_sub)
                    accuracy = accuracy_score(y_val_hat, y_val_sub)

                    results_svm.append([accuracy, kernel, C, d, g])
                    


results_svm = pd.DataFrame(results_svm)
results_svm.columns = ['Accuracy', 'Kernel', 'C', "d", "g"]
print(results_svm)

In [31]:
results[results['Accuracy'] == results['Accuracy'].max()]

In [32]:
# svm_best = svm.SVC(kernel='rbf', C = 1.0)

# # Use both training and validation data to fit it (np.concatenate "stacks" the array like rbind in R)
# svm_best.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))



In [33]:
# Predict on test data
# ytest_hat = svm_best.predict(Xtest)



In [34]:
# ytest_hat= pd.DataFrame({
#     'Id': list(range(len(ytest_hat))),
#     'Predicted': ytest_hat.reshape(-1,),
# })
# ytest_hat.to_csv("ytest_hat.csv", index=False)

In [35]:
max_depth = [None, 2, 10, 20]
n_estimators_list = [50, 100, 300]
min_samples_split_list = [3, 5, 15, 33]
min_samples_leaf_list = [3, 5, 15, 33]

results_rf = []
for d in max_depth:
    for n_estimators in n_estimators_list:
        for min_samples_split in min_samples_split_list:
            for min_samples_leaf in min_samples_leaf_list:
                print(f"Training for n_estimators={n_estimators}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}, max_depth={d}")

                rf_current = ensemble.RandomForestClassifier(
                    max_depth = d,
                    n_estimators=n_estimators,
                    min_samples_split=min_samples_split,
                    min_samples_leaf=min_samples_leaf,
                    )
                rf_current.fit(X_train_sub, y_train_sub)
                y_val_hat = rf_current.predict(X_val_sub)
                accuracy_rf = accuracy_score(y_val_sub, y_val_hat)

                results_rf.append([accuracy_rf, n_estimators, min_samples_split, min_samples_leaf, d])

results_rf = pd.DataFrame(results_rf)
results_rf.columns = ['accuracy_rf', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'max_depth']
print(results_rf)
results_rf[results_rf['accuracy_rf'] == results_rf['accuracy_rf'].max()]

Training for n_estimators=50, min_samples_split=3, min_samples_leaf=3, max_depth=None
Training for n_estimators=50, min_samples_split=3, min_samples_leaf=5, max_depth=None
Training for n_estimators=50, min_samples_split=3, min_samples_leaf=15, max_depth=None
Training for n_estimators=50, min_samples_split=3, min_samples_leaf=33, max_depth=None
Training for n_estimators=50, min_samples_split=5, min_samples_leaf=3, max_depth=None
Training for n_estimators=50, min_samples_split=5, min_samples_leaf=5, max_depth=None
Training for n_estimators=50, min_samples_split=5, min_samples_leaf=15, max_depth=None
Training for n_estimators=50, min_samples_split=5, min_samples_leaf=33, max_depth=None
Training for n_estimators=50, min_samples_split=15, min_samples_leaf=3, max_depth=None
Training for n_estimators=50, min_samples_split=15, min_samples_leaf=5, max_depth=None
Training for n_estimators=50, min_samples_split=15, min_samples_leaf=15, max_depth=None
Training for n_estimators=50, min_samples_spli

Unnamed: 0,accuracy_rf,n_estimators,min_samples_split,min_samples_leaf,max_depth
17,0.762,100,3,5,
29,0.762,100,33,5,
40,0.762,300,15,3,
184,0.762,300,15,3,20.0


In [36]:
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.ensemble import RandomForestClassifier

# param_dist = {
#     'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
#     'n_estimators': [20, 50, 100, 200, 300],
#     'min_samples_split': list(range(2, 30)),
#     'min_samples_leaf': list(range(2, 30))
# }


# rf = RandomForestClassifier()


# n_iter_search = 500
# random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=n_iter_search, cv=5, verbose=2, n_jobs=-1, random_state=42)
# random_search.fit(X_train_sub, y_train_sub)

# print("Best parameters found: ", random_search.best_params_)
# print("Best cross-validation score: {:.2f}".format(random_search.best_score_))

