# Load Libraries

In [6]:
#Numerics
import pandas as pd
import numpy as np

#Python
import pickle
import time

#Machine Learning
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Load Datasets and Models
(Be careful of the sizes)

In [18]:
# Training split: the features are the principal components (PCA) of the dataset.
# Alternative feature file (presumably the physically meaningful features --
# confirm): X_train = pd.read_pickle('X_train_phy.pkl')
with open('X_train_pca.pkl', mode='rb') as train_file:
    X_train = pickle.load(train_file)
y_train = pd.read_pickle('y_train.pkl')

# Test split: same layout as the training split (PCA components + labels).
# Alternative feature file: X_test = pd.read_pickle('X_test_phy.pkl')
with open('X_test_pca.pkl', mode='rb') as test_file:
    X_test = pickle.load(test_file)
y_test = pd.read_pickle('y_test.pkl')

In [10]:
def read_model_file(filename = 'rf_f1.pkl'):
    """Unpickle the object stored in ``filename``, echo it, and return it.

    The file name and the loaded object are printed, so the notebook output
    shows what kind of object each pickle file contains.

    NOTE: unpickling executes arbitrary code; only load files you trust.
    """
    with open(filename, mode='rb') as handle:
        loaded = pickle.load(handle)
    print("Load model: ", filename)
    print(loaded)
    return loaded

In [12]:
# Collect every loaded model in one list; later cells address them by index.
model = []
# Optional extra model, currently disabled:
#model.append(read_model_file('tree_model.pkl'))

In [14]:
# Load the saved models in a fixed order. Later cells pick models by their
# position in `model`, so the order of this tuple matters.
for pickle_name in (
    'rf_f1.pkl',
    'tree_model_acc_10_50.pkl',
    'tree_model_pre_10_200.pkl',
    'tree1_model.pkl',
    'tree2_model.pkl',
    'tree3_model.pkl',
    'tree5_model.pkl',
):
    model.append(read_model_file(pickle_name))

Load model:  rf_f1.pkl
GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=8,
             param_grid={'max_depth': [5, 10, 15],
                         'min_samples_split': [100, 1000, 10000],
                         'n_estimators': [10, 50, 100]},
             scoring='f1')
Load model:  tree_model_acc_10_50.pkl
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [5, 10],
                         'min_samples_split': [10, 20, 50, 200]},
             scoring='balanced_accuracy')
Load model:  tree_model_pre_10_200.pkl
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [5, 10],
                         'min_samples_split': [10, 20, 50, 200]},
             scoring='precision')
Load model:  tree1_model.pkl
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [3, 5, 10],
                         'min_samples_split': [5, 10, 15]},
             

# Find models

score:
f1 seems like the best option. It is a standard measure which works on binary classifiers with uneven class sizes. Trying something better would require much more time and energy.

classifiers:
 -- I currently have trees on the PCA input, and am running a random forest on the same input. I believe that we should also try the trees without PCA. We need to decide on minimum sample size and max depth. With such a large dataset, the minimum sample size should be large.

 -- kNN seems worthless in our case. It is not very useful for large datasets, so we need to have a reason to believe it is useful, and a means to shrink the dataset (like averaging nearby points).

 -- Naive Bayes would be good to run on the output of PCA. In this case, PCA removes the correlation, making it very useful, while the physical relevance of the features is not actually helpful.

 -- SVM seems to have a problem with the large sample size. I would like to run it, if possible, with a non-linear kernel, but I need to figure out the runtime issue. I would consider this an "if we have time" option -- analyze everything without it.
https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use

 -- We could try some other ensemble model version, if someone is creative enough to think of one. Splitting the data and averaging the results is easy enough -- just don't use the same sample too many times (once is preferred). These ensemble versions could enable us to take advantage of the large number of samples, rather than causing them to slow down the evaluation.
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier

Decision tree doesn't work well. SVM doesn't work at all? Could try letting the tree go deep. Or perhaps we should reorder the categorical Policy_Sales_Channel based on response probability. There is no reason to assume that the order as presented means anything. Categorical Naive Bayes should work well for our data (although it causes problems for interpolation or extrapolation), given the low number of potential inputs and large number of datapoints for statistical analysis. Better solutions exist (like splitting the features, averaging clusters for k-NN, ...), but I don't want to do anything too complicated.

In [None]:
# Decision Tree
# Grid-search a DecisionTreeClassifier on (X_train, y_train), report the best
# parameters and the elapsed time, then pickle the fitted search object.

# parameters to provide
tree_param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [5, 10, 15]}
scoring         = 'f1'
# The file name encodes the scoring metric, the last max_depth value and the
# first min_samples_split value of the grid. The lookup must use the same key
# the grid defines ('min_samples_split'); any other key raises KeyError
# before the search even starts.
model_filename  = f"tree_{scoring}_{tree_param_grid['max_depth'][-1]}_{tree_param_grid['min_samples_split'][0]}.pkl"
n_jobs          = 1      # put number of processors here
cv              = 5

# run model, with timing
t_start = time.time()
test_model = GridSearchCV(DecisionTreeClassifier(), tree_param_grid, scoring = scoring, n_jobs = n_jobs, cv=cv)
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start   # elapsed seconds
dt2 = dt / 60.            # elapsed minutes
print("Decision Tree Best Params:", test_model.best_params_)
print("Time: ",  dt, " sec or ", dt2, "min")

# Save the fitted search object to a file
with open(model_filename, 'wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

#save model in the model array
#model.append(test_model)

In [None]:
# Random Forest
# Grid-search a RandomForestClassifier on (X_train, y_train), report the best
# parameters and the elapsed time, then pickle the fitted search object.

# parameters to provide
rf_param_grid   = {'n_estimators': [10, 50, 100, 200], 'max_depth': [5, 10, 15], 'min_samples_split': [3, 5, 10]}
scoring         = 'f1'
# The file name encodes the scoring metric, the last max_depth value and the
# first min_samples_split value of the grid. The lookup must use the same key
# the grid defines ('min_samples_split'); any other key raises KeyError
# before the search even starts.
model_filename  = f"forest_{scoring}_{rf_param_grid['max_depth'][-1]}_{rf_param_grid['min_samples_split'][0]}.pkl"
n_jobs          = 8      # put number of processors here
cv              = 5

# run model, with timing
t_start = time.time()
test_model = GridSearchCV(RandomForestClassifier(), rf_param_grid, scoring = scoring, n_jobs = n_jobs, cv=cv)
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start   # elapsed seconds
dt2 = dt / 60.            # elapsed minutes
print("Random Forest Best Params:", test_model.best_params_)
print("Time: ",  dt, " sec or ", dt2, "min")

# Save the fitted search object to a file
with open(model_filename, 'wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

#save model in the model array
#model.append(test_model)

In [None]:
# Support Vector Machine (SVM)
# WARNING: do not run this cell -- it requires too much time.
# The C values may also be poorly chosen.

# search configuration
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}
scoring = 'f1'
model_filename = f"svm_model_{scoring}.pkl"
n_jobs = 1  # number of processors to use
cv = 5

# fit the grid search and measure the wall-clock time
t_start = time.time()
test_model = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_param_grid,
    scoring=scoring,
    n_jobs=n_jobs,
    cv=cv,
)
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start  # seconds
dt2 = dt / 60.0          # minutes
print("SVC Best Params:", test_model.best_params_)
print("Time: ",  dt, " sec or ", dt2, "min")

# persist the fitted search object
with open(model_filename, 'wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

# optionally keep the fitted search in the model list
#model.append(test_model)

In [None]:
# Naive Bayes
# Fit a GaussianNB on (X_train, y_train) with default settings, report the
# elapsed time, and pickle the fitted estimator.

# output file
model_filename = "bayes_model.pkl"

# fit and measure the wall-clock time
t_start = time.time()
test_model = GaussianNB()
test_model.fit(X_train, y_train)
t_finish = time.time()
dt = t_finish - t_start  # seconds
dt2 = dt / 60.0          # minutes
print("Time: ",  dt, " sec or ", dt2, "min")

# persist the fitted estimator
with open(model_filename, 'wb') as file:
    pickle.dump(test_model, file)
print(f"Модель сохранена в файл: {model_filename}")

# optionally keep the fitted estimator in the model list
#model.append(test_model)

Our data consists of a finite number of categories for each numerical point, for which order may not mean anything. CategoricalNB may work well for the original dataset. 

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html#sklearn.naive_bayes.CategoricalNB

# Test Models

In [25]:
# Evaluate the first three entries of `model` on the test set: print the
# classification report, then the confusion matrix, for each of them.
for i in range(3):
    y_pred = model[i].predict(X_test)
    for metric in (classification_report, confusion_matrix):
        print(metric(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93   5705019
           1       0.51      0.01      0.02    799778

    accuracy                           0.88   6504797
   macro avg       0.69      0.50      0.48   6504797
weighted avg       0.83      0.88      0.82   6504797

[[5696201    8818]
 [ 790735    9043]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.93   5705019
           1       0.32      0.00      0.00    799778

    accuracy                           0.88   6504797
   macro avg       0.60      0.50      0.47   6504797
weighted avg       0.81      0.88      0.82   6504797

[[5703304    1715]
 [ 798956     822]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.93   5705019
           1       0.36      0.00      0.00    799778

    accuracy                           0.88   6504797
   macro avg       0.62      0.50      0.47   6504

In [27]:
# Evaluate model[4] on the test set: classification report first, then the
# confusion matrix.
y_pred = model[4].predict(X_test)
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93   5705019
           1       0.30      0.00      0.00    799778

    accuracy                           0.88   6504797
   macro avg       0.59      0.50      0.47   6504797
weighted avg       0.81      0.88      0.82   6504797

[[5702665    2354]
 [ 798771    1007]]


In [29]:
# Evaluate model[5] and then model[6] on the test set; for each one print the
# classification report followed by the confusion matrix.
for idx in (5, 6):
    y_pred = model[idx].predict(X_test)
    for metric in (classification_report, confusion_matrix):
        print(metric(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93   5705019
           1       0.39      0.04      0.07    799778

    accuracy                           0.87   6504797
   macro avg       0.64      0.52      0.50   6504797
weighted avg       0.82      0.87      0.83   6504797

[[5656178   48841]
 [ 767902   31876]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.93   5705019
           1       0.42      0.02      0.04    799778

    accuracy                           0.88   6504797
   macro avg       0.65      0.51      0.49   6504797
weighted avg       0.82      0.88      0.82   6504797

[[5679890   25129]
 [ 781857   17921]]


In [31]:
# Show the configuration of model[5], then its test-set classification report
# and confusion matrix.
print(model[5])
y_pred = model[5].predict(X_test)
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [3, 5, 10, 20],
                         'min_samples_split': [50, 200, 500, 2000, 5000,
                                               20000]},
             scoring='f1')
              precision    recall  f1-score   support

           0       0.88      0.99      0.93   5705019
           1       0.39      0.04      0.07    799778

    accuracy                           0.87   6504797
   macro avg       0.64      0.52      0.50   6504797
weighted avg       0.82      0.87      0.83   6504797

[[5656178   48841]
 [ 767902   31876]]


The best one has a maximum depth of 20. Obviously I was wrong in wanting to limit the depth of the tree, as even on the test data it is getting better.