**Imports:**

In [21]:
from sklearn import svm
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
import pandas as pd
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from scipy.stats import mode
from sklearn.decomposition import PCA

**Get acess to the data from google drive:**

In [22]:
#from google.colab import drive
#drive.mount('/content/gdrive')

**Defining a function that grayscale, resize and flattens the image:**

In [23]:
def convert_sample(image):
    image = tf.image.rgb_to_grayscale(image)
    image = tf.image.resize(image,[32,32]).numpy()
    image = image.reshape(1,-1)
    return image

**Create X, y and Xtest - the function convert_sample is used:**

In [24]:
X = np.load('C:/Users/mikke/OneDrive - Syddansk Universitet/Data Science/10. Anvendt Maskinlæring/project/Assignment 1/Xtrain.npy')
X = np.vstack(list(map(convert_sample,X)))
X = StandardScaler(with_mean=0, with_std=1).fit_transform(X)
print(f'Shape of training data features (observations,features): {X.shape}')

y = np.load('C:/Users/mikke/OneDrive - Syddansk Universitet/Data Science/10. Anvendt Maskinlæring/project/Assignment 1/ytrain.npy')
y = y.reshape(-1,)
print(f'Shape of training data labels (observations,): {y.shape}')

Xtest = np.load('C:/Users/mikke/OneDrive - Syddansk Universitet/Data Science/10. Anvendt Maskinlæring/project/Assignment 1/Xtest.npy')
Xtest = np.vstack(list(map(convert_sample,Xtest)))
Xtest = StandardScaler(with_mean=0, with_std=1).fit_transform(Xtest)
print(f'Shape of test data features (observations,features): {Xtest.shape}')



Shape of training data features (observations,features): (26214, 1024)
Shape of training data labels (observations,): (26214,)
Shape of test data features (observations,features): (1638, 1024)




**Compute the explained variance over the principal components:**

**We want to explain 95 pct. of the variance:**

**Reduce the dimentions to the number of principal components that explain 0.95 pct. of the variance:**

**Split in train/val for hyperparameter search:**

In [25]:
X_reduced, _, y_reduced, _ = train_test_split(X, y, test_size=0.90, random_state=42)

X_train_h, X_val_h, y_train_h, y_val_h = train_test_split(X_reduced, y_reduced, test_size=0.2, random_state=42)

**Create 9 ensambles:**

In [26]:
num_ensembles = 9
samples_per_ensemble = X.shape[0] // num_ensembles

#np.random.seed(42)

for i in range(1, 10):

    indices = np.random.choice(X.shape[0], samples_per_ensemble, replace=False)

    X_subset = X[indices]
    y_subset = y[indices]

    globals()[f'X_{i}'] = X_subset
    globals()[f'y_{i}'] = y_subset

    X = np.delete(X, indices, axis=0)
    y = np.delete(y, indices, axis=0)

**Print the shape of the ensambles:**

In [27]:
for i in range(1,10):
    X = globals()[f'X_{i}']
    y = globals()[f'y_{i}']
    print(X.shape)
    print(y.shape)

(2912, 1024)
(2912,)
(2912, 1024)
(2912,)
(2912, 1024)
(2912,)
(2912, 1024)
(2912,)
(2912, 1024)
(2912,)
(2912, 1024)
(2912,)
(2912, 1024)
(2912,)
(2912, 1024)
(2912,)
(2912, 1024)
(2912,)


**Hyperparameter search SVM:**

In [28]:

param_grid = ParameterGrid({
    'C': [0.1, 0.25, 0.5, 0.75, 1, 10], })

score_ = 0
params_ = None
for params in param_grid:
  svm_rbf = svm.SVC(kernel='rbf', C=params['C'])
  svm_rbf.fit(X_train_h, y_train_h)
  y_val_hat_rbf = svm_rbf.predict(X_val_h)
  accuracy_rbf = accuracy_score(y_val_hat_rbf, y_val_h)
  if accuracy_rbf > score_:
    score_ = accuracy_rbf
    params_ = params


In [29]:
print(f"For SVM - using radial and the best hyperparameter C: {params_}, the accuracy of the model is: {accuracy_rbf}")

For SVM - using radial and the best hyperparameter C: {'C': 0.5}, the accuracy of the model is: 0.7219047619047619


**Hyperparameter search RF:**

In [30]:

param_grid = ParameterGrid({
    'n_estimators': [500, 1000],
    'min_samples_split': [10, 20, 50],
    'min_samples_leaf': [10, 20, 50],
    'max_depth': [None, 300, 200],
    'max_features': ['sqrt', 'log2']
    })

score_ = 0
params_ = None
for params in param_grid:
  rf_current = ensemble.RandomForestClassifier(
                n_estimators=params['n_estimators'],
                min_samples_split=params['min_samples_split'],
                min_samples_leaf=params['min_samples_leaf'],
                max_depth=params['max_depth'],
                max_features=params['max_features'],
                )
  rf_current.fit(X_train_h, y_train_h)
  y_val_hat = rf_current.predict(X_val_h)
  accuracy = accuracy_score(y_val_hat, y_val_h)
  if accuracy > score_:
    score_ = accuracy
    params_ = params


In [31]:
print(f"For RF the best hyperparameters are: {params_}, the accuracy of the model is: {accuracy}")

For RF the best hyperparameters are: {'max_depth': 300, 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 50, 'n_estimators': 500}, the accuracy of the model is: 0.7314285714285714


**Hyperparameter search GB:**

In [32]:

param_grid = ParameterGrid({
    'n_estimators': [500, 1000],
    'min_samples_split': [10, 20, 50],
    'min_samples_leaf': [10, 20, 50],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 0.75],
    })

score_ = 0
params_ = None

for params in param_grid:
  gbt_current = ensemble.GradientBoostingClassifier(
                n_estimators=params['n_estimators'],
                min_samples_split=params['min_samples_split'],
                min_samples_leaf=params['min_samples_leaf'],
                learning_rate=params['learning_rate'],
                )
  gbt_current.fit(X_train_h, y_train_h)
  y_val_hat = gbt_current.predict(X_val_h)
  accuracy = accuracy_score(y_val_hat, y_val_h)
  if accuracy > score_:
    score_ = accuracy
    params_ = params


In [33]:
print(f"For GB the best hyperparameters are: {params_}, the accuracy of the model is: {accuracy}")

For GB the best hyperparameters are: {'learning_rate': 0.1, 'min_samples_leaf': 20, 'min_samples_split': 50, 'n_estimators': 1000}, the accuracy of the model is: 0.7561904761904762


**Initiate the models:**

In [34]:

svm1 = svm.SVC(kernel='rbf', C=0.75)

rf1 = ensemble.RandomForestClassifier(
                n_estimators=500,
                min_samples_split=50,
                min_samples_leaf=10,
                max_depth=300,
                max_features='sqrt',
                )

gb1 = ensemble.GradientBoostingClassifier(
                n_estimators=500,
                min_samples_split=10,
                min_samples_leaf=20,
                learning_rate=0.05,
                )

models1 = [svm1, rf1, gb1]


In [35]:

svm2 = svm.SVC(kernel='rbf', C=1.0)

rf2 = ensemble.RandomForestClassifier(
    n_estimators=800,
    min_samples_split=40,
    min_samples_leaf=5,
    max_depth=200,
    max_features='log2',
)

gb2 = ensemble.GradientBoostingClassifier(
    n_estimators=800,
    min_samples_split=20,
    min_samples_leaf=10,
    learning_rate=0.1,
)

models2 = [svm2, rf2, gb2]


In [36]:

svm3 = svm.SVC(kernel='rbf', C=0.5)

rf3 = ensemble.RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=30,
    min_samples_leaf=2,
    max_depth=700,
    max_features=20,
)

gb3 = ensemble.GradientBoostingClassifier(
    n_estimators=1000,
    min_samples_split=40,
    min_samples_leaf=5,
    learning_rate=0.15,
)

models3 = [svm3, rf3, gb3]

In [37]:

svm4 = svm.SVC(kernel='rbf', C=1.5)

rf4 = ensemble.RandomForestClassifier(
    n_estimators=1200,
    min_samples_split=20,
    min_samples_leaf=1,
    max_depth=400,
    max_features=50,
)

gb4 = ensemble.GradientBoostingClassifier(
    n_estimators=1200,
    min_samples_split=5,
    min_samples_leaf=30,
    learning_rate=0.01,
)

models4 = [svm4, rf4, gb4]

In [38]:

svm5 = svm.SVC(kernel='rbf', C=1.2)

rf5 = ensemble.RandomForestClassifier(
    n_estimators=1500,
    min_samples_split=2,
    min_samples_leaf=4,
    max_depth=150,
    max_features='log2',
)

gb5 = ensemble.GradientBoostingClassifier(
    n_estimators=750,
    min_samples_split=30,
    min_samples_leaf=15,
    learning_rate=0.1,
)

models5 = [svm5, rf5, gb5]


In [39]:

svm6 = svm.SVC(kernel='rbf', C=0.9)

rf6 = ensemble.RandomForestClassifier(
    n_estimators=1700,
    min_samples_split=10,
    min_samples_leaf=5,
    max_depth=250,
    max_features=150,
)

gb6 = ensemble.GradientBoostingClassifier(
    n_estimators=1100,
    min_samples_split=50,
    min_samples_leaf=5,
    learning_rate=0.05,
)

models6 = [svm6, rf6, gb6]


In [40]:

svm7 = svm.SVC(kernel='rbf', C=0.8)

rf7 = ensemble.RandomForestClassifier(
    n_estimators=2000,
    min_samples_split=20,
    min_samples_leaf=1,
    max_depth=350,
    max_features=200,
)

gb7 = ensemble.GradientBoostingClassifier(
    n_estimators=300,
    min_samples_split=15,
    min_samples_leaf=10,
    learning_rate=0.2,
)

models7 = [svm7, rf7, gb7]


In [41]:

svm8 = svm.SVC(kernel='rbf', C=1.3)

rf8 = ensemble.RandomForestClassifier(
    n_estimators=1700,
    min_samples_split=5,
    min_samples_leaf=2,
    max_depth=400,
    max_features='sqrt',
)

gb8 = ensemble.GradientBoostingClassifier(
    n_estimators=900,
    min_samples_split=25,
    min_samples_leaf=7,
    learning_rate=0.15,
)

models8 = [svm8, rf8, gb8]


In [42]:

svm9 = svm.SVC(kernel='rbf', C=0.6)

rf9 = ensemble.RandomForestClassifier(
    n_estimators=1250,
    min_samples_split=30,
    min_samples_leaf=3,
    max_depth=450,
    max_features=50,
)

gb9 = ensemble.GradientBoostingClassifier(
    n_estimators=800,
    min_samples_split=35,
    min_samples_leaf=20,
    learning_rate=0.1,
)

models9 = [svm9, rf9, gb9]


In [43]:
df_ensemble = pd.DataFrame(columns=['df1', 'df2', 'df3', 'df4', 'df5', 'df6', 'df7', 'df8', 'df9'])

for i in range(1, 10):

  globals()[f'df{i}'] = pd.DataFrame(columns=['svm', 'rf', 'gb'])

  X = globals()[f'X_{i}']
  y = globals()[f'y_{i}']

  current_df = globals()[f'df{i}']

  column_index = 0

  models = globals()[f'models{i}']

  for model in models:
    z = model.fit(X, y)
    q = z.predict(Xtest)

    column_name = current_df.columns[column_index]
    current_df[column_name] = q
    column_index += 1

def calculate_majority(row):
  counts = pd.Series(row).value_counts()
  majority = counts.idxmax()
  return majority

column_index = 0

for i in range(1, 10):

  d = globals()[f'df{i}']
  d['majority'] = d.apply(calculate_majority, axis=1)

  df_ensemble.iloc[:, column_index] = d['majority']
  column_index += 1

df_ensemble['majority'] = df_ensemble.apply(calculate_majority, axis=1)


**Pred to CSV:**

In [44]:

df_ensemble = pd.DataFrame({
    'Id': range(len(df_ensemble)),
    'Predicted': df_ensemble['majority'].values,
})

path_on_drive = 'C:/Users/mikke/OneDrive - Syddansk Universitet/Data Science/10. Anvendt Maskinlæring/project/Assignment 1/pred.csv'
df_ensemble.to_csv(path_on_drive, index=False)