# TODO:
- After preprocessing is fixed, remove the ad-hoc slicing of `y_train` and use the columns of the label ndarray instead.
- For the grid searches, don't use CV but the validation set instead (`PredefinedSplit` from `sklearn.model_selection`, passed as the `cv` argument, should do the trick)
- Reduce grid search?
- Complete evaluation
- Create output files?

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from helper.data_loading import load_subsample
from helper.preprocessing import preprocess_dataset

### Load train, validation and test set

In [None]:
# Each split goes through the same two steps: read the CSV with
# load_subsample, then pass it to preprocess_dataset, whose result is
# unpacked into a features object (X_*) and a labels object (y_*).
# NOTE(review): load=True presumably reuses the pickled pipeline components
# instead of refitting them -- confirm in helper.preprocessing.
pipeline_path = "pipeline/pipeline_components.pkl"

# Training split
train = load_subsample("data/train.csv")
X_train, y_train = preprocess_dataset(train, pipeline_path, load=True)

# Validation split
val = load_subsample("data/validation.csv")
X_val, y_val = preprocess_dataset(val, pipeline_path, load=True)

# Test split
test = load_subsample("data/test.csv")
X_test, y_test = preprocess_dataset(test, pipeline_path, load=True)

In [None]:
# Inspect the shape of the training labels (shown as the cell output).
y_train.shape

## Multi Layer Perceptron

### Model fitting

In [None]:
# Size of one quarter of y_train. The cells below cut y_train into four
# consecutive chunks of this length (reply, retweet, retweet with comment,
# like). TODO: once the labels are fixed, use the columns instead of slicing.
length = len(y_train) // 4

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the MLP: 3 * 2 * 2 * 2 * 2 = 48 combinations.
param_grid = {
    "hidden_layer_sizes": [(50, 50, 50), (50, 100, 50), (100,)],
    "activation": ["tanh", "relu"],
    "solver": ["sgd", "adam"],
    "alpha": [0.0001, 0.05],
    "learning_rate": ["constant", "adaptive"],
}

# Base estimator whose hyperparameters are tuned.
model = MLPClassifier(max_iter=100)

# Exhaustive search over the grid: every combination is scored with 3-fold
# cross-validation, and n_jobs=-1 asks for all available cores.
mlp = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the training features.
# TODO: switch the target to the full y_train once it is the correct ndarray
# (the model should then predict all labels); for now only the first `length`
# entries of y_train are used.
mlp.fit(X_train, y_train[:length])

In [None]:
# Keep the refitted winner of the search and report its hyperparameters.
best = mlp.best_estimator_
print(mlp.best_params_)

# Class probabilities for the test set; the final expression displays
# column 1 (presumably the positive class -- check best.classes_).
y_pred = best.predict_proba(X_test)
y_pred[:, 1]

## Evaluation MLP

In [None]:
from sklearn.metrics import roc_auc_score

## Random Forest

### Model fitting

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

### Model fitting
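A randomized search (`RandomizedSearchCV`) is used to find hyperparameters for the Random Forest model; the same search object is then fitted separately for each target below (reply, retweet, retweet with comment, like).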

In [None]:
# Candidate values for the randomized search
# (5 * 2 * 5 * 4 * 4 * 2 * 2 = 3200 possible combinations).
random_grid = {
    "n_estimators": [50, 100, 500, 1000, 1500],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 25, 50, 75, 100],
    "min_samples_split": [2, 3, 4, 5],
    "min_samples_leaf": [1, 2, 3, 4],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
}

# Base estimator whose hyperparameters are tuned.
model = RandomForestClassifier()

# Randomized search: 100 sampled combinations, each scored with 3-fold
# cross-validation; random_state fixes which combinations are drawn and
# n_jobs=-1 asks for all available cores.
rf = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

### Reply

In [None]:
# Run the randomized search with the first quarter of y_train as the target
# (the "Reply" labels, per the section heading).
rf.fit(X_train, y_train[:length])

In [None]:
# Keep the refitted winner of the search and report its hyperparameters.
best_reply = rf.best_estimator_
print(rf.best_params_)

# Class probabilities for the test set; the final expression displays
# column 1 (presumably the positive class -- check best_reply.classes_).
y_pred_reply = best_reply.predict_proba(X_test)
y_pred_reply[:, 1]

### Retweet

In [None]:
# Run the randomized search with the second quarter of y_train as the target
# (the "Retweet" labels, per the section heading).
rf.fit(X_train, y_train[length:2 * length])

In [None]:
# Keep the refitted winner of the search and report its hyperparameters.
best_retweet = rf.best_estimator_
print(rf.best_params_)

# Class probabilities for the test set; the final expression displays
# column 1 (presumably the positive class -- check best_retweet.classes_).
y_pred_retweet = best_retweet.predict_proba(X_test)
y_pred_retweet[:, 1]

### Retweet with comment

In [None]:
# Run the randomized search with the third quarter of y_train as the target
# (the "Retweet with comment" labels, per the section heading).
rf.fit(X_train, y_train[2 * length:3 * length])

In [None]:
# Keep the refitted winner of the search and report its hyperparameters.
best_comment = rf.best_estimator_
print(rf.best_params_)

# Class probabilities for the test set; the final expression displays
# column 1 (presumably the positive class -- check best_comment.classes_).
y_pred_comment = best_comment.predict_proba(X_test)
y_pred_comment[:, 1]

### Like

In [None]:
# Run the randomized search with the fourth quarter of y_train as the target
# (the "Like" labels, per the section heading).
rf.fit(X_train, y_train[3 * length:4 * length])

In [None]:
# Keep the refitted winner of the search and report its hyperparameters.
best_like = rf.best_estimator_
print(rf.best_params_)

# Class probabilities for the test set; the final expression displays
# column 1 (presumably the positive class -- check best_like.classes_).
y_pred_like = best_like.predict_proba(X_test)
y_pred_like[:, 1]

## Evaluation RF