# Load Libraries

In [None]:
#Numerics
import pandas as pd
import numpy as np

#Python
import pickle
import time

#Machine Learning
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Load Datasets and Models
(Be careful of the sizes)

In [None]:
# Load the training features and labels from pickled pandas objects.
# Exactly one X_train line should be active: pick the feature set to train on.
X_train = pd.read_pickle('X_train_phy.pkl') # train with features close to the physical ones (original note was cut off; "_phy" in the file name suggests this -- TODO confirm)
#X_train = pd.read_pickle('X_train_pca.pkl') # train with principal components of dataset
y_train = pd.read_pickle('y_train.pkl')
# The test sets are left unloaded here; uncomment the matching X_test/y_test lines when needed.
#X_test = pd.read_pickle('X_test_phy.pkl')
#X_test = pd.read_pickle('X_test_pca.pkl')
#y_test = pd.read_pickle('y_test.pkl')

In [None]:
def read_model_file(filename = 'model.pkl'):
    """Load a pickled model from disk, print what was loaded, and return it.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read. Defaults to 'model.pkl'.

    Returns
    -------
    object
        The object that was stored in the pickle file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. FileNotFoundError).
    pickle.UnpicklingError
        If the file content is not a valid pickle.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open(filename, 'rb') as f:
        model = pickle.load(f)
    print("Load model: ", filename)
    print(model) # this is how to figure out what object is stored here
    # Return the object so callers can keep it, e.g. model.append(read_model_file(...)).
    return model

In [None]:
# Container for previously saved models; append as many as needed.
model: list = []
# Example of restoring a saved model into the list:
#model.append(read_model_file('tree_model.pkl'))

# Find models

score:
f1 seems like the best option. It is a standard measure which works on binary classifiers with uneven class sizes. Trying something better would require much more time and energy.

classifiers:
 -- I currently have trees on the PCA input, and am running such a random forest. I believe that we should try the trees, but without PCA. We need to decide on minimum sample size and max depth. With such a large dataset, the minimum sample size should be large.

 -- kNN seems worthless in our case. It is not very useful for large datasets, so we need to have a reason to believe it is useful, and a means to shrink the dataset (like averaging nearby points).

 -- Naive Bayes would be good to run on the output of PCA. In this case, PCA removes the correlation, making it very useful, while the physical relevance of the features is not actually helpful.

 -- SVM seems to have a problem with the large sample size. I would like to run it, if possible, with a non-linear kernel, but I need to figure out the runtime issue. I would consider this an "if we have time" option -- analyze everything without it.
https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use

 -- We could try some other ensemble model version, if someone is creative enough to think of one. Splitting the data and averaging the results is easy enough -- just don't use the same sample too many times (1 time is preferred). These ensemble versions could enable us to take advantage of the large number of samples, rather than causing them to slow down the evaluation. 
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier

In [None]:
# Decision Tree
# Grid-search a DecisionTreeClassifier over max_depth / min_samples_split,
# report the best parameters and the elapsed time, then pickle the search object.

# parameters to provide
tree_param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [5, 10, 15]}
scoring         = 'f1'
# The file name records the metric, the last max_depth and the first
# min_samples_split in the grid. The lookup key must be the grid's own key,
# 'min_samples_split' ('min_sample_size' does not exist and raised KeyError).
model_filename  = f"tree_{scoring}_{tree_param_grid['max_depth'][-1]}_{tree_param_grid['min_samples_split'][0]}.pkl"
n_jobs          = 1      # put number of processors here
cv              = 5      # number of cross-validation folds

# run model, with timing
t_start = time.time()
test_model = GridSearchCV(DecisionTreeClassifier(), tree_param_grid, scoring = scoring, n_jobs = n_jobs, cv=cv)
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start   # elapsed seconds
dt2 = dt / 60.            # elapsed minutes
print("Decision Tree Best Params:", test_model.best_params_)
print("Time: ",  dt, " sec or ", dt2, "min")

# Save the model to a file (the whole fitted GridSearchCV object is pickled)
with open(model_filename, 'wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

#save model in the model array
#model.append(test_model)

In [None]:
# Random Forest
# Grid-search a RandomForestClassifier over n_estimators / max_depth /
# min_samples_split, report the best parameters and the elapsed time,
# then pickle the search object.

# parameters to provide
rf_param_grid   = {'n_estimators': [10, 50, 100, 200], 'max_depth': [5, 10, 15], 'min_samples_split': [3, 5, 10]}
scoring         = 'f1'
# The file name records the metric, the last max_depth and the first
# min_samples_split in the grid. The lookup key must be the grid's own key,
# 'min_samples_split' ('min_sample_size' does not exist and raised KeyError).
model_filename  = f"forest_{scoring}_{rf_param_grid['max_depth'][-1]}_{rf_param_grid['min_samples_split'][0]}.pkl"
n_jobs          = 8      # put number of processors here
cv              = 5      # number of cross-validation folds

# run model, with timing
t_start = time.time()
test_model = GridSearchCV(RandomForestClassifier(), rf_param_grid, scoring = scoring, n_jobs = n_jobs, cv=cv)
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start   # elapsed seconds
dt2 = dt / 60.            # elapsed minutes
print("Random Forest Best Params:", test_model.best_params_)
print("Time: ",  dt, " sec or ", dt2, "min")

# Save the model to a file (the whole fitted GridSearchCV object is pickled)
with open(model_filename, 'wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

#save model in the model array
#model.append(test_model)

In [None]:
# Support Vector Machine (SVM)
# Don't run -- requires too much time
# Grid-search an SVC over C / kernel, report the best parameters and the
# elapsed time, then pickle the search object.

# parameters to provide
svm_param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
scoring         = 'f1'
# The original literal had a stray closing brace ("{scoring}}"), which is a
# SyntaxError inside an f-string; the file name only embeds the metric.
model_filename  = f"svm_model_{scoring}.pkl"
n_jobs          = 1      # put number of processors here
cv              = 5      # number of cross-validation folds

# run model, with timing
t_start = time.time()
test_model = GridSearchCV(SVC(), svm_param_grid, scoring = scoring, n_jobs = n_jobs, cv=cv)
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start   # elapsed seconds
dt2 = dt / 60.            # elapsed minutes
print("SVC Best Params:", test_model.best_params_)
print("Time: ",  dt, " sec or ", dt2, "min")

# Save the model to a file (the whole fitted GridSearchCV object is pickled)
with open(model_filename, 'wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

#save model in the model array
#model.append(test_model)

In [None]:
# Naive Bayes
# Fit a GaussianNB classifier on the training data, report the elapsed
# time, then pickle the fitted classifier.

# Where the fitted classifier is written
model_filename  = "bayes_model.pkl"

# Fit the classifier and measure how long the fit takes
t_start = time.time()
test_model = GaussianNB()
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start    # elapsed seconds
dt2 = dt / 60.0            # elapsed minutes
print("Time: ",  dt, " sec or ", dt2, "min")

# Save the model to a file
with open(model_filename, mode='wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

# Optionally keep the fitted model in the in-memory model list
#model.append(test_model)

# Test Models