### Introduction

In [22]:
import re
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Importing library in order to see the progress bar
from tqdm import tqdm
from tqdm.notebook import tnrange, tqdm_notebook

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.svm import LinearSVC

# splitting our data into train and test sets
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV

warnings.filterwarnings("ignore")

In [23]:
# Vectorized feature matrices for the train / validation / test splits
X_train, X_val, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_vectorized.csv', 'validation_vectorized.csv', 'test_vectorized.csv')
)

In [24]:
# Target labels for each split, loaded in the same order as before
y_train, y_test, y_val = (
    pd.read_csv(csv_path) for csv_path in ('y_train.csv', 'y_test.csv', 'y_val.csv')
)

In [25]:
# Training features as (rows, columns)
X_train.shape

(44274, 2000)

In [26]:
# Validation features as (rows, columns)
X_val.shape

(18975, 2000)

In [27]:
# Test features as (rows, columns)
X_test.shape

(15813, 2000)

In [28]:
# Validation labels as (rows, columns)
y_val.shape

(18975, 1)

In [29]:
# Training labels as (rows, columns)
y_train.shape

(44274, 1)

In [30]:
# Test labels as (rows, columns)
y_test.shape

(15813, 1)

### Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
### Logistic Regression TF-IDF vectorizer after including n-grams

# Fit the scaler on the training split only, so the validation and test
# rows do not influence the scaling statistics
scaler_tfidf = RobustScaler().fit(X_train)

# Apply the training-fitted scaling to every split
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_tfidf.transform(split) for split in (X_train, X_val, X_test)
)

In [33]:
# instantiate the baseline model
logreg_model_tfidf = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=1)

# fit the model; y_train is loaded as a one-column DataFrame, so flatten it to
# the 1-D label array scikit-learn expects (same form used for the grid searches)
logreg_model_tfidf.fit(X_train_scaled, y_train.values.ravel())

In [34]:
# Training and validation score of the baseline model
print(f"Train score: {logreg_model_tfidf.score(X_train_scaled, y_train)}")
print(f"Validation score: {logreg_model_tfidf.score(X_val_scaled, y_val)}")

Train score: 0.7697745855355288
Validation score: 0.7429249011857707


## PCA Model

In [16]:
# Choosing the range value and setting the step
# NOTE(review): this only displays the range of feature indices; nothing is assigned
range(0, X_train.shape[1])

range(0, 5000)

In [18]:
# Number of feature columns in the training matrix
X_train.shape[1]

5000

In [15]:
# Two-step pipeline: PCA for dimensionality reduction, then logistic regression
estimators = [('reduce_dim', PCA()), ('logreg_model', LogisticRegression())]
pipe = Pipeline(estimators)

# Search grid; keys follow the "<step name>__<parameter>" convention
params = dict(
    reduce_dim__n_components=[10, 1000],
    logreg_model__C=[0.1, 1, 10],
    logreg_model__solver=['lbfgs'],
)

In [16]:
# Grid-search the PCA + logistic-regression pipeline with 5-fold CV.
# n_jobs is capped instead of -1: each parallel fit materialises its own
# training-fold array, and one worker per core ran out of memory.
# verbose=2 already logs every finished fit, so no extra progress bar is needed.
grid_search_gbc = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=2, verbose=2)

# y_train is a one-column DataFrame; flatten it to the 1-D label array
# scikit-learn expects
grid_search_gbc.fit(X_train_scaled, y_train.values.ravel())

Fitting 5 folds for each of 6 candidates, totalling 30 fits


MemoryError: Unable to allocate 1.32 GiB for an array with shape (35419, 5000) and data type float64

In [15]:
# best score: mean cross-validated score of the winning parameter combination
print(f"best score: {grid_search_gbc.best_score_}")

# best parameters: the grid point that achieved that score
print(f"best parameters: {grid_search_gbc.best_params_}")

best score: 0.6346385055413645
best parameters: {'logreg_model__C': 0.1, 'logreg_model__solver': 'lbfgs', 'reduce_dim__n_components': 20}
[CV] END logreg_model__C=0.1, logreg_model__solver=lbfgs, reduce_dim__n_components=20; total time=  34.9s
[CV] END logreg_model__C=1, logreg_model__solver=liblinear, reduce_dim__n_components=10; total time=  29.8s
[CV] END logreg_model__C=1, logreg_model__solver=liblinear, reduce_dim__n_components=20; total time=  34.8s
[CV] END logreg_model__C=0.1, logreg_model__solver=lbfgs, reduce_dim__n_components=20; total time=  34.9s
[CV] END logreg_model__C=1, logreg_model__solver=lbfgs, reduce_dim__n_components=20; total time=  34.3s
[CV] END logreg_model__C=10, logreg_model__solver=lbfgs, reduce_dim__n_components=20; total time=  32.6s
[CV] END logreg_model__C=0.1, logreg_model__solver=lbfgs, reduce_dim__n_components=20; total time=  34.8s
[CV] END logreg_model__C=1, logreg_model__solver=lbfgs, reduce_dim__n_components=20; total time=  34.2s
[CV] END logreg

In [None]:
# Initialize PCA with default arguments (no n_components limit given)
PCA_model = PCA()

In [None]:
# Fit PCA on the scaled training features
PCA_model.fit(X_train_scaled)

In [None]:
# Transform data: project the scaled training and validation features onto
# the fitted components
X_train_PCA_transform = PCA_model.transform(X_train_scaled)
X_val_PCA_transform = PCA_model.transform(X_val_scaled)

In [None]:
# Report the first five components. Labels start at PC1 so that index 0 is
# called PC1, matching the PC1/PC2 labels used for indices 0 and 1 in this notebook.
for i in range(5):
    print(f"Variance captured by PC{i + 1}: {PCA_model.explained_variance_[i]: 0.3f}")
    print(f"Proportion of variance captured by PC{i + 1}: {PCA_model.explained_variance_ratio_[i]: 0.3f}")

### Optimal number of principal components

In [None]:
X_train_PCA_transform.shape

In [None]:
# Fraction of the total variance explained by each principal component
PCA_model_var = PCA_model.explained_variance_ratio_

# Running total of those fractions: entry k is the share of variance
# captured by the first k + 1 components
cumulative_sum = PCA_model_var.cumsum()

cumulative_sum

In [None]:
# Share of total variance explained by the first two principal components
print(f'PC1: {PCA_model_var[0]: 0.3f}')
print(f'PC2: {PCA_model_var[1]: 0.3f}')

In [None]:
# np.argmax returns the zero-based index of the first component at which the
# cumulative explained variance exceeds 0.9, so the component count is index + 1
print(f'Optimal number of components {np.argmax(cumulative_sum > 0.9) + 1}')

In [None]:
# Plot the cumulative explained variance against the number of components.
# Sizes and the 90% cut-off are derived from the data instead of hard-coded,
# so the figure stays correct when the number of features changes.
n_components_total = len(cumulative_sum)

# Smallest number of components whose cumulative explained variance exceeds 90%
n_components_90 = int(np.argmax(cumulative_sum > 0.9)) + 1

plt.figure(figsize=(10,6))
# x runs from 1 because entry k of cumulative_sum covers the first k + 1 components
plt.plot(range(1, n_components_total + 1), cumulative_sum, marker='.')
plt.axhline(0.9, c='r', linestyle='--')
plt.axvline(n_components_90, c='r', linestyle='--')
plt.xlabel('Number of PCs')
plt.ylabel('Cumulative Sum of Explained Variance - PCA Model')
# Roughly ten ticks whatever the component count; the step never drops below 1
plt.xticks(range(0, n_components_total + 1, max(1, n_components_total // 10)))
plt.show()

In [None]:
# Initialize PCA; a float n_components between 0 and 1 asks for enough
# components to explain that fraction of the variance (here 90%)
base_PCA = PCA(n_components=0.9)

# Fit PCA on the scaled training features
base_PCA.fit(X_train_scaled)

In [None]:
# Transform data: project the scaled training and validation features onto
# the components kept by base_PCA
X_train_PCA_transform = base_PCA.transform(X_train_scaled)
X_val_PCA_transform = base_PCA.transform(X_val_scaled)

In [None]:
# Compare the feature-matrix shape before and after the PCA projection
print(f'Original: {X_train_scaled.shape}')
print(f'PCA Transformed: {X_train_PCA_transform.shape}')

Let's fit our logistic regression model and compare it with our baseline model

In [None]:
# Use the same solver settings as the baseline logistic regression so the
# comparison isolates the effect of the PCA projection
logreg_PCA_model = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=1)

# y_train is a one-column DataFrame; flatten it to the 1-D label array
# scikit-learn expects
logreg_PCA_model.fit(X_train_PCA_transform, y_train.values.ravel())

In [None]:
# Training and validation score of the PCA-based model
print(f"Train score: {logreg_PCA_model.score(X_train_PCA_transform, y_train)}")
print(f"Validation score: {logreg_PCA_model.score(X_val_PCA_transform, y_val)}")

In [None]:
# Training and validation score of the baseline model (no PCA), for comparison
print(f"Train score: {logreg_model_tfidf.score(X_train_scaled, y_train)}")
print(f"Validation score: {logreg_model_tfidf.score(X_val_scaled, y_val)}")

### Manual Optimization

    1- LogisticRegression
    2- SVM (Support Vector Machines)
    3- Random Forest (Decision Trees)
    4- XGBoost

In [None]:
# Earlier PCA + logistic-regression pipeline, kept commented out for reference:
# estimators = [
#     ('reduce_dim', PCA()),
#     ('logreg_model', LogisticRegression())
# ]
# pipe = Pipeline(estimators)

In [None]:
# Earlier attempt, disabled: fit the grid search on a densified matrix
# (presumably a sparse vectorizer output; X_train_transform is not defined in this notebook)
# grid_search_gbc.fit(X_train_transform.todense(), y_train)

In [35]:
# Pipeline steps: a scaler followed by a classifier. The parameter grids
# address them through the 'scaler' and 'model' step names.
estimators = [('scaler', RobustScaler()), ('model', LogisticRegression())]

In [36]:
from tempfile import mkdtemp

# Temporary directory in which the pipeline caches its fitted transformers
cachedir = mkdtemp()

# Build the pipeline with caching enabled
my_pipe = Pipeline(steps=estimators, memory=cachedir)

In [37]:
# Logistic regression: L1-penalised, searched over C and the two listed solvers
logreg_param_grid = dict(
    scaler=[RobustScaler()],
    model=[LogisticRegression(penalty='l1', random_state=1)],
    model__C=[0.1, 1, 10],
    model__solver=['liblinear', 'saga'],
    model__max_iter=[10000],
)

In [38]:
# Grid search for the logistic-regression grid, using 5-fold cross-validation
logreg_grid = GridSearchCV(
    estimator=my_pipe,
    param_grid=logreg_param_grid,
    cv=5,
    verbose=2,
)

In [39]:
# Fit the logistic-regression grid search; labels are flattened to a 1-D array
logreg_fitted_grid = logreg_grid.fit(X_train, y_train.to_numpy().ravel())

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END model=LogisticRegression(penalty='l1', random_state=1), model__C=0.1, model__max_iter=10000, model__solver=liblinear, scaler=RobustScaler(); total time=  16.8s
[CV] END model=LogisticRegression(penalty='l1', random_state=1), model__C=0.1, model__max_iter=10000, model__solver=liblinear, scaler=RobustScaler(); total time=  17.3s
[CV] END model=LogisticRegression(penalty='l1', random_state=1), model__C=0.1, model__max_iter=10000, model__solver=liblinear, scaler=RobustScaler(); total time=  18.3s
[CV] END model=LogisticRegression(penalty='l1', random_state=1), model__C=0.1, model__max_iter=10000, model__solver=liblinear, scaler=RobustScaler(); total time=  16.2s
[CV] END model=LogisticRegression(penalty='l1', random_state=1), model__C=0.1, model__max_iter=10000, model__solver=liblinear, scaler=RobustScaler(); total time=  16.0s
[CV] END model=LogisticRegression(penalty='l1', random_state=1), model__C=0.1, model__max_iter=

In [None]:
# SVM (linear kernel), searched over C and the penalty type
svm_param_grid = {
    'scaler': [RobustScaler()],
    'model': [LinearSVC(random_state = 1)],
    'model__C': [0.01, 0.1, 1],
    'model__penalty': ['l1', 'l2'],
    # The l1 penalty is not supported by the dual formulation, so the primal
    # problem is selected explicitly; it handles both penalties in this grid.
    'model__dual': [False],
    'model__max_iter': [10000]
}

In [None]:
# Grid search for the SVM grid, using 5-fold cross-validation
svm_grid = GridSearchCV(
    estimator=my_pipe,
    param_grid=svm_param_grid,
    cv=5,
    verbose=2,
)

In [None]:
# Fit the SVM grid search; labels are flattened to a 1-D array
svm_fitted_grid = svm_grid.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Random Forest, searched over the number of trees and the maximum tree depth.
# No 'model__max_iter' entry: RandomForestClassifier has no such parameter,
# and an unknown name makes the grid search fail when it sets the parameters.
rf_param_grid = {
    'scaler': [RobustScaler()],
    'model': [RandomForestClassifier(random_state = 1)],
    'model__n_estimators': range(5, 10),
    'model__max_depth': range(10, 20, 2)
}

In [None]:
# Grid search for the random-forest grid, using 5-fold cross-validation
rf_grid = GridSearchCV(
    estimator=my_pipe,
    param_grid=rf_param_grid,
    cv=5,
    verbose=2,
)

In [None]:
# Fit the random-forest grid search; labels are flattened to a 1-D array
rf_fitted_grid = rf_grid.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# XGBoost, searched over the number of boosting rounds, tree depth and learning rate.
# No 'model__max_iter' entry: the number of boosting rounds is controlled by
# n_estimators, and max_iter is not an XGBoost parameter.
# NOTE(review): XGBClassifier is not imported anywhere in the visible notebook;
# this cell needs `from xgboost import XGBClassifier` in the import cell.
xgb_param_grid = {
    'scaler': [RobustScaler()],
    'model': [XGBClassifier(random_state = 1)],
    'model__n_estimators': range(50, 53, 2),
    'model__max_depth': range(8, 11, 2),
    'model__learning_rate': [0.5, 1]
}

In [None]:
# Grid search for the XGBoost grid, using 5-fold cross-validation
xgb_grid = GridSearchCV(
    estimator=my_pipe,
    param_grid=xgb_param_grid,
    cv=5,
    verbose=2,
)

In [None]:
# Fit the XGBoost grid search; labels are flattened to a 1-D array
xgb_fitted_grid = xgb_grid.fit(X_train, y_train.to_numpy().ravel())