# Classification Models for Tweets

### Importing Libraries

In [44]:
import numpy as np
import _pickle as pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import GridSearchCV

### Loading in the Data

In [45]:
# Restore the main DataFrame that was pickled to disk.
with open("main_df.pkl", mode='rb') as df_file:
    main_df = pickle.load(df_file)

# Restore the cleaned tweet data that was pickled alongside it.
with open("clean_tweets.pkl", mode='rb') as tweets_file:
    data = pickle.load(tweets_file)

## Preparing the Data
### Train/Test Split

In [46]:
# Features are the cleaned tweet data; the label is the City column of main_df.
X, y = data, main_df.City

# Hold out 20% of the samples for testing. random_state is pinned so the split
# is identical after every kernel restart; without it each run draws a
# different train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected into that same feature space.
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

## Classifier Models
* Dummy Classifier - baseline
* Random Forest
* Naive Bayes
* Logistic Regression

### Dummy Classifier - Baseline Model

In [48]:
from sklearn.dummy import DummyClassifier

# Baseline model: the score every real classifier below has to beat.
dm_class = DummyClassifier()
dm_class.fit(tf_idf_data_train, y_train)

# Accuracy on the data the baseline was fitted on...
dm_train_preds = dm_class.predict(tf_idf_data_train)
dm_train_score = accuracy_score(y_train, dm_train_preds)

# ...and on the held-out split.
dm_test_preds = dm_class.predict(tf_idf_data_test)
dm_test_score = accuracy_score(y_test, dm_test_preds)

print('Dummy Classifier')
print(f"Training Accuracy: {dm_train_score} \t\t Testing Accuracy: {dm_test_score}")

Dummy Classifier
Training Accuracy: 0.4976875 		 Testing Accuracy: 0.49875


### Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees on the TF-IDF features. The forest is
# randomised, so random_state is pinned; without it the accuracies printed
# below change on every run.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

# Compare accuracy on the fitted data against the held-out split.
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Random Forest
Training Accuracy: 0.9704 		 Testing Accuracy: 0.6332


#### Random Forest with Grid Search

In [56]:
# Hyper-parameter values the random forest grid search will try:
# 4 * 2 * 4 * 2 * 3 = 192 combinations in total.
rf_param_grid = dict(
    n_estimators=[10, 30, 60, 100],
    criterion=['gini', 'entropy'],
    max_depth=[None, 2, 5, 10],
    min_samples_split=[5, 10],
    min_samples_leaf=[1, 2, 5],
)

In [None]:
# Exhaustive 3-fold cross-validated search over every combination in
# rf_param_grid, using the training split only.
rf_grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=rf_param_grid,
    cv=3,
    return_train_score=True,
    verbose=2,
)
rf_grid_search.fit(tf_idf_data_train, y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   3.4s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   3.3s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   3.4s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=  10.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=   9.8s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=  10.0s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_s

[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30, total=   6.9s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=  13.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=  13.6s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=  13.2s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100, total=  22.7s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, mi

[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators

[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30, total=   0.2s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30, total=   0.2s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.3s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.3s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV] 

[CV]  criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=60, total=   0.5s
[CV] criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.9s
[CV] criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.8s
[CV] criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.9s
[CV] criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimat

[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.3s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.3s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.3s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators

[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60, total=   0.9s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60, total=   0.9s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60, total=   0.9s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   1.5s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   1.5s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estima

[CV]  criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=10, n_estimators=100, total=   1.4s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   3.7s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   3.6s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   3.4s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=  10.3s
[CV] criterion=entropy, max_depth=None, min_sa

[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=10, total=   2.9s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30, total=   8.8s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30, total=   8.3s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30, total=   8.4s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=  16.6s
[CV] criterion=entropy, max_depth=

In [None]:
# The search is already fitted in the cell above; rebuilding and refitting it
# here would only repeat all 576 fits. Report what the search found instead.
# best_score_ is the mean cross-validated score of the best candidate.
print(f"Mean CV Accuracy: {rf_grid_search.best_score_*100}")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

In [None]:
# Random forest rebuilt with explicitly chosen hyper-parameters.
# random_state is pinned so the reported accuracies are reproducible.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
    random_state=42,
)

rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

# Compare accuracy on the fitted data against the held-out split.
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print('Random Forest with GridSearch')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

### Naive Bayes

In [50]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes fitted on the TF-IDF training matrix.
nb_classifier = MultinomialNB()
nb_classifier.fit(tf_idf_data_train, y_train)

# Score the fitted model on the training split...
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_train_score = accuracy_score(y_train, nb_train_preds)

# ...and on the held-out split.
nb_test_preds = nb_classifier.predict(tf_idf_data_test)
nb_test_score = accuracy_score(y_test, nb_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.7955 		 Testing Accuracy: 0.6182


### Logistic Regression

In [51]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings, fitted on the TF-IDF
# training matrix.
lr_class = LogisticRegression()
lr_class.fit(tf_idf_data_train, y_train)

# Score the fitted model on the training split...
lr_train_preds = lr_class.predict(tf_idf_data_train)
lr_train_score = accuracy_score(y_train, lr_train_preds)

# ...and on the held-out split.
lr_test_preds = lr_class.predict(tf_idf_data_test)
lr_test_score = accuracy_score(y_test, lr_test_preds)

print('Logistic Regression')
print(f"Training Accuracy: {lr_train_score} \t\t Testing Accuracy: {lr_test_score}")

Logistic Regression
Training Accuracy: 0.7844375 		 Testing Accuracy: 0.63175




#### Logistic Regression with Grid Search

In [52]:
# Hyper-parameter values the logistic regression grid search will try:
# ten values of C (1.5 raised to the even powers 0..18) crossed with the
# intercept options and two solvers, i.e. 10 * 2 * 6 * 2 = 240 combinations.
log_param_grid = dict(
    C=[1.5 ** power for power in range(0, 20, 2)],
    fit_intercept=[True, False],
    intercept_scaling=[1, 5, 10, 25, 50, 100],
    solver=['liblinear', 'saga'],
)

In [53]:
# Exhaustive 3-fold cross-validated search over every combination in
# log_param_grid, using the training split only.
log_grid_search = GridSearchCV(
    estimator=lr_class,
    param_grid=log_param_grid,
    cv=3,
    return_train_score=True,
    verbose=3,
)
log_grid_search.fit(tf_idf_data_train, y_train)

Fitting 3 folds for each of 240 candidates, totalling 720 fits
[CV] C=1.0, fit_intercept=True, intercept_scaling=1, solver=liblinear 
[CV]  C=1.0, fit_intercept=True, intercept_scaling=1, solver=liblinear, score=0.6130483689538808, total=   0.0s
[CV] C=1.0, fit_intercept=True, intercept_scaling=1, solver=liblinear 
[CV]  C=1.0, fit_intercept=True, intercept_scaling=1, solver=liblinear, score=0.6169135570973185, total=   0.0s
[CV] C=1.0, fit_intercept=True, intercept_scaling=1, solver=liblinear 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  C=1.0, fit_intercept=True, intercept_scaling=1, solver=liblinear, score=0.6097881117569848, total=   0.1s
[CV] C=1.0, fit_intercept=True, intercept_scaling=1, solver=saga .....
[CV]  C=1.0, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.6128608923884514, total=   0.1s
[CV] C=1.0, fit_intercept=True, intercept_scaling=1, solver=saga .....
[CV]  C=1.0, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.6169135570973185, total=   0.1s
[CV] C=1.0, fit_intercept=True, intercept_scaling=1, solver=saga .....
[CV]  C=1.0, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.6097881117569848, total=   0.1s
[CV] C=1.0, fit_intercept=True, intercept_scaling=5, solver=liblinear 
[CV]  C=1.0, fit_intercept=True, intercept_scaling=5, solver=liblinear, score=0.6128608923884514, total=   0.1s
[CV] C=1.0, fit_intercept=True, intercept_scaling=5, solver=liblinear 
[CV]  C=1.0, fit_intercept=True, intercept_scaling=5, solver=liblinear, score=0.6169135570973185, to

[CV]  C=1.0, fit_intercept=False, intercept_scaling=10, solver=saga, score=0.5997375328083989, total=   0.1s
[CV] C=1.0, fit_intercept=False, intercept_scaling=10, solver=saga ...
[CV]  C=1.0, fit_intercept=False, intercept_scaling=10, solver=saga, score=0.6111006937933621, total=   0.1s
[CV] C=1.0, fit_intercept=False, intercept_scaling=10, solver=saga ...
[CV]  C=1.0, fit_intercept=False, intercept_scaling=10, solver=saga, score=0.5915994749671855, total=   0.1s
[CV] C=1.0, fit_intercept=False, intercept_scaling=25, solver=liblinear 
[CV]  C=1.0, fit_intercept=False, intercept_scaling=25, solver=liblinear, score=0.5997375328083989, total=   0.0s
[CV] C=1.0, fit_intercept=False, intercept_scaling=25, solver=liblinear 
[CV]  C=1.0, fit_intercept=False, intercept_scaling=25, solver=liblinear, score=0.6111006937933621, total=   0.0s
[CV] C=1.0, fit_intercept=False, intercept_scaling=25, solver=liblinear 
[CV]  C=1.0, fit_intercept=False, intercept_scaling=25, solver=liblinear, score=0.59

[CV]  C=2.25, fit_intercept=True, intercept_scaling=50, solver=liblinear, score=0.6139133695855991, total=   0.1s
[CV] C=2.25, fit_intercept=True, intercept_scaling=50, solver=saga ...
[CV]  C=2.25, fit_intercept=True, intercept_scaling=50, solver=saga, score=0.6115485564304461, total=   0.1s
[CV] C=2.25, fit_intercept=True, intercept_scaling=50, solver=saga ...
[CV]  C=2.25, fit_intercept=True, intercept_scaling=50, solver=saga, score=0.6157884867804237, total=   0.1s
[CV] C=2.25, fit_intercept=True, intercept_scaling=50, solver=saga ...
[CV]  C=2.25, fit_intercept=True, intercept_scaling=50, solver=saga, score=0.6139133695855991, total=   0.1s
[CV] C=2.25, fit_intercept=True, intercept_scaling=100, solver=liblinear 
[CV]  C=2.25, fit_intercept=True, intercept_scaling=100, solver=liblinear, score=0.6117360329958755, total=   0.1s
[CV] C=2.25, fit_intercept=True, intercept_scaling=100, solver=liblinear 
[CV]  C=2.25, fit_intercept=True, intercept_scaling=100, solver=liblinear, score=0.

[CV]  C=2.25, fit_intercept=False, intercept_scaling=100, solver=saga, score=0.59422463903994, total=   0.1s
[CV] C=5.0625, fit_intercept=True, intercept_scaling=1, solver=liblinear 
[CV]  C=5.0625, fit_intercept=True, intercept_scaling=1, solver=liblinear, score=0.6051743532058492, total=   0.1s
[CV] C=5.0625, fit_intercept=True, intercept_scaling=1, solver=liblinear 
[CV]  C=5.0625, fit_intercept=True, intercept_scaling=1, solver=liblinear, score=0.6133508344271517, total=   0.1s
[CV] C=5.0625, fit_intercept=True, intercept_scaling=1, solver=liblinear 
[CV]  C=5.0625, fit_intercept=True, intercept_scaling=1, solver=liblinear, score=0.60866304144009, total=   0.1s
[CV] C=5.0625, fit_intercept=True, intercept_scaling=1, solver=saga ..
[CV]  C=5.0625, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.6053618297712786, total=   0.2s
[CV] C=5.0625, fit_intercept=True, intercept_scaling=1, solver=saga ..
[CV]  C=5.0625, fit_intercept=True, intercept_scaling=1, solver=saga, scor

[CV]  C=5.0625, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.595050618672666, total=   0.1s
[CV] C=5.0625, fit_intercept=False, intercept_scaling=5, solver=saga .
[CV]  C=5.0625, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.6067879242452653, total=   0.1s
[CV] C=5.0625, fit_intercept=False, intercept_scaling=5, solver=saga .
[CV]  C=5.0625, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.5910369398087381, total=   0.2s
[CV] C=5.0625, fit_intercept=False, intercept_scaling=10, solver=liblinear 
[CV]  C=5.0625, fit_intercept=False, intercept_scaling=10, solver=liblinear, score=0.5948631421072366, total=   0.1s
[CV] C=5.0625, fit_intercept=False, intercept_scaling=10, solver=liblinear 
[CV]  C=5.0625, fit_intercept=False, intercept_scaling=10, solver=liblinear, score=0.6067879242452653, total=   0.1s
[CV] C=5.0625, fit_intercept=False, intercept_scaling=10, solver=liblinear 
[CV]  C=5.0625, fit_intercept=False, intercept_scaling=10, solve

[CV]  C=11.390625, fit_intercept=True, intercept_scaling=10, solver=saga, score=0.6039752484530283, total=   0.3s
[CV] C=11.390625, fit_intercept=True, intercept_scaling=25, solver=liblinear 
[CV]  C=11.390625, fit_intercept=True, intercept_scaling=25, solver=liblinear, score=0.6017997750281214, total=   0.1s
[CV] C=11.390625, fit_intercept=True, intercept_scaling=25, solver=liblinear 
[CV]  C=11.390625, fit_intercept=True, intercept_scaling=25, solver=liblinear, score=0.6052878304894056, total=   0.1s
[CV] C=11.390625, fit_intercept=True, intercept_scaling=25, solver=liblinear 
[CV]  C=11.390625, fit_intercept=True, intercept_scaling=25, solver=liblinear, score=0.6037877367335458, total=   0.1s
[CV] C=11.390625, fit_intercept=True, intercept_scaling=25, solver=saga 
[CV]  C=11.390625, fit_intercept=True, intercept_scaling=25, solver=saga, score=0.6021747281589801, total=   0.3s
[CV] C=11.390625, fit_intercept=True, intercept_scaling=25, solver=saga 
[CV]  C=11.390625, fit_intercept=Tr

[CV]  C=11.390625, fit_intercept=False, intercept_scaling=50, solver=liblinear, score=0.5869116819801238, total=   0.1s
[CV] C=11.390625, fit_intercept=False, intercept_scaling=50, solver=saga 
[CV]  C=11.390625, fit_intercept=False, intercept_scaling=50, solver=saga, score=0.5888638920134983, total=   0.2s
[CV] C=11.390625, fit_intercept=False, intercept_scaling=50, solver=saga 
[CV]  C=11.390625, fit_intercept=False, intercept_scaling=50, solver=saga, score=0.5992874554659666, total=   0.2s
[CV] C=11.390625, fit_intercept=False, intercept_scaling=50, solver=saga 
[CV]  C=11.390625, fit_intercept=False, intercept_scaling=50, solver=saga, score=0.5869116819801238, total=   0.2s
[CV] C=11.390625, fit_intercept=False, intercept_scaling=100, solver=liblinear 
[CV]  C=11.390625, fit_intercept=False, intercept_scaling=100, solver=liblinear, score=0.5890513685789276, total=   0.1s
[CV] C=11.390625, fit_intercept=False, intercept_scaling=100, solver=liblinear 
[CV]  C=11.390625, fit_intercept



[CV]  C=25.62890625, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.5958005249343832, total=   0.4s
[CV] C=25.62890625, fit_intercept=True, intercept_scaling=1, solver=saga 
[CV]  C=25.62890625, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.6013500843802738, total=   0.4s
[CV] C=25.62890625, fit_intercept=True, intercept_scaling=1, solver=saga 
[CV]  C=25.62890625, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.5990999437464841, total=   0.4s
[CV] C=25.62890625, fit_intercept=True, intercept_scaling=5, solver=liblinear 
[CV]  C=25.62890625, fit_intercept=True, intercept_scaling=5, solver=liblinear, score=0.5952380952380952, total=   0.1s
[CV] C=25.62890625, fit_intercept=True, intercept_scaling=5, solver=liblinear 
[CV]  C=25.62890625, fit_intercept=True, intercept_scaling=5, solver=liblinear, score=0.6007875492218263, total=   0.1s
[CV] C=25.62890625, fit_intercept=True, intercept_scaling=5, solver=liblinear 
[CV]  C=25.62890625, fit_inter

[CV]  C=25.62890625, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.5884889388826396, total=   0.3s
[CV] C=25.62890625, fit_intercept=False, intercept_scaling=5, solver=saga 
[CV]  C=25.62890625, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.5927245452840803, total=   0.3s
[CV] C=25.62890625, fit_intercept=False, intercept_scaling=5, solver=saga 
[CV]  C=25.62890625, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.5818488655540971, total=   0.3s
[CV] C=25.62890625, fit_intercept=False, intercept_scaling=10, solver=liblinear 
[CV]  C=25.62890625, fit_intercept=False, intercept_scaling=10, solver=liblinear, score=0.5884889388826396, total=   0.1s
[CV] C=25.62890625, fit_intercept=False, intercept_scaling=10, solver=liblinear 
[CV]  C=25.62890625, fit_intercept=False, intercept_scaling=10, solver=liblinear, score=0.5925370335645977, total=   0.1s
[CV] C=25.62890625, fit_intercept=False, intercept_scaling=10, solver=liblinear 
[CV]  C=25.6289

[CV]  C=57.6650390625, fit_intercept=True, intercept_scaling=10, solver=saga, score=0.5913010873640795, total=   0.4s
[CV] C=57.6650390625, fit_intercept=True, intercept_scaling=10, solver=saga 
[CV]  C=57.6650390625, fit_intercept=True, intercept_scaling=10, solver=saga, score=0.5985374085880367, total=   0.4s
[CV] C=57.6650390625, fit_intercept=True, intercept_scaling=10, solver=saga 
[CV]  C=57.6650390625, fit_intercept=True, intercept_scaling=10, solver=saga, score=0.5960997562347646, total=   0.4s
[CV] C=57.6650390625, fit_intercept=True, intercept_scaling=25, solver=liblinear 
[CV]  C=57.6650390625, fit_intercept=True, intercept_scaling=25, solver=liblinear, score=0.5894263217097863, total=   0.1s
[CV] C=57.6650390625, fit_intercept=True, intercept_scaling=25, solver=liblinear 
[CV]  C=57.6650390625, fit_intercept=True, intercept_scaling=25, solver=liblinear, score=0.5985374085880367, total=   0.2s
[CV] C=57.6650390625, fit_intercept=True, intercept_scaling=25, solver=liblinear 


[CV]  C=57.6650390625, fit_intercept=False, intercept_scaling=25, solver=saga, score=0.5811773528308961, total=   0.4s
[CV] C=57.6650390625, fit_intercept=False, intercept_scaling=25, solver=saga 
[CV]  C=57.6650390625, fit_intercept=False, intercept_scaling=25, solver=saga, score=0.5870991936996062, total=   0.4s
[CV] C=57.6650390625, fit_intercept=False, intercept_scaling=25, solver=saga 
[CV]  C=57.6650390625, fit_intercept=False, intercept_scaling=25, solver=saga, score=0.5779111194449653, total=   0.4s
[CV] C=57.6650390625, fit_intercept=False, intercept_scaling=50, solver=liblinear 
[CV]  C=57.6650390625, fit_intercept=False, intercept_scaling=50, solver=liblinear, score=0.5811773528308961, total=   0.1s
[CV] C=57.6650390625, fit_intercept=False, intercept_scaling=50, solver=liblinear 
[CV]  C=57.6650390625, fit_intercept=False, intercept_scaling=50, solver=liblinear, score=0.5869116819801238, total=   0.1s
[CV] C=57.6650390625, fit_intercept=False, intercept_scaling=50, solver=l

[CV]  C=129.746337890625, fit_intercept=True, intercept_scaling=50, solver=liblinear, score=0.5889743108944309, total=   0.2s
[CV] C=129.746337890625, fit_intercept=True, intercept_scaling=50, solver=saga 
[CV]  C=129.746337890625, fit_intercept=True, intercept_scaling=50, solver=saga, score=0.5905511811023622, total=   0.4s
[CV] C=129.746337890625, fit_intercept=True, intercept_scaling=50, solver=saga 
[CV]  C=129.746337890625, fit_intercept=True, intercept_scaling=50, solver=saga, score=0.59422463903994, total=   0.4s
[CV] C=129.746337890625, fit_intercept=True, intercept_scaling=50, solver=saga 
[CV]  C=129.746337890625, fit_intercept=True, intercept_scaling=50, solver=saga, score=0.5925370335645977, total=   0.4s
[CV] C=129.746337890625, fit_intercept=True, intercept_scaling=100, solver=liblinear 
[CV]  C=129.746337890625, fit_intercept=True, intercept_scaling=100, solver=liblinear, score=0.5883014623172104, total=   0.2s
[CV] C=129.746337890625, fit_intercept=True, intercept_scali

[CV]  C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=liblinear, score=0.5835364710294393, total=   0.2s
[CV] C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=liblinear 
[CV]  C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=liblinear, score=0.573785861616351, total=   0.2s
[CV] C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=saga 
[CV]  C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=saga, score=0.5793025871766029, total=   0.4s
[CV] C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=saga 
[CV]  C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=saga, score=0.5839114944684043, total=   0.4s
[CV] C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=saga 
[CV]  C=129.746337890625, fit_intercept=False, intercept_scaling=100, solver=saga, score=0.5764110256891055, total=   0.4s
[CV] C=291.92926025390625, fit_intercept=Tr

[CV]  C=291.92926025390625, fit_intercept=True, intercept_scaling=100, solver=saga, score=0.5929120570035628, total=   0.5s
[CV] C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=liblinear 
[CV]  C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=liblinear, score=0.5744281964754405, total=   0.2s
[CV] C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=liblinear 
[CV]  C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=liblinear, score=0.5779111194449653, total=   0.2s
[CV] C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=liblinear 
[CV]  C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=liblinear, score=0.5707856741046315, total=   0.3s
[CV] C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=saga 
[CV]  C=291.92926025390625, fit_intercept=False, intercept_scaling=1, solver=saga, score=0.577615298087739, total=   0.5s
[CV] C=291.92926025390625, 

[CV]  C=656.8408355712891, fit_intercept=True, intercept_scaling=1, solver=liblinear, score=0.5812863303956497, total=   0.5s
[CV] C=656.8408355712891, fit_intercept=True, intercept_scaling=1, solver=saga 
[CV]  C=656.8408355712891, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.5871766029246345, total=   0.7s
[CV] C=656.8408355712891, fit_intercept=True, intercept_scaling=1, solver=saga 
[CV]  C=656.8408355712891, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.5923495218451154, total=   0.5s
[CV] C=656.8408355712891, fit_intercept=True, intercept_scaling=1, solver=saga 
[CV]  C=656.8408355712891, fit_intercept=True, intercept_scaling=1, solver=saga, score=0.591411963247703, total=   0.4s
[CV] C=656.8408355712891, fit_intercept=True, intercept_scaling=5, solver=liblinear 
[CV]  C=656.8408355712891, fit_intercept=True, intercept_scaling=5, solver=liblinear, score=0.5770528683914511, total=   0.6s
[CV] C=656.8408355712891, fit_intercept=True, intercept_scali

[CV]  C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=liblinear, score=0.5652418447694039, total=   0.3s
[CV] C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=liblinear 
[CV]  C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=liblinear, score=0.5782861428839302, total=   0.3s
[CV] C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=liblinear 
[CV]  C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=liblinear, score=0.5655353459591225, total=   0.3s
[CV] C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=saga 
[CV]  C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.5761154855643045, total=   0.5s
[CV] C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=saga 
[CV]  C=656.8408355712891, fit_intercept=False, intercept_scaling=5, solver=saga, score=0.5814738421151322, total=   0.5s
[CV] C=656.8408355712891, fit_intercept=F

[CV]  C=1477.8918800354004, fit_intercept=True, intercept_scaling=5, solver=saga, score=0.5919744984061504, total=   0.4s
[CV] C=1477.8918800354004, fit_intercept=True, intercept_scaling=5, solver=saga 
[CV]  C=1477.8918800354004, fit_intercept=True, intercept_scaling=5, solver=saga, score=0.5906619163697732, total=   0.4s
[CV] C=1477.8918800354004, fit_intercept=True, intercept_scaling=10, solver=liblinear 
[CV]  C=1477.8918800354004, fit_intercept=True, intercept_scaling=10, solver=liblinear, score=0.577615298087739, total=   0.5s
[CV] C=1477.8918800354004, fit_intercept=True, intercept_scaling=10, solver=liblinear 
[CV]  C=1477.8918800354004, fit_intercept=True, intercept_scaling=10, solver=liblinear, score=0.5855990999437465, total=   0.4s
[CV] C=1477.8918800354004, fit_intercept=True, intercept_scaling=10, solver=liblinear 
[CV]  C=1477.8918800354004, fit_intercept=True, intercept_scaling=10, solver=liblinear, score=0.5756609788111757, total=   0.5s
[CV] C=1477.8918800354004, fit_

[CV]  C=1477.8918800354004, fit_intercept=False, intercept_scaling=10, solver=liblinear, score=0.5585974123382711, total=   0.4s
[CV] C=1477.8918800354004, fit_intercept=False, intercept_scaling=10, solver=saga 
[CV]  C=1477.8918800354004, fit_intercept=False, intercept_scaling=10, solver=saga, score=0.5757405324334458, total=   0.4s
[CV] C=1477.8918800354004, fit_intercept=False, intercept_scaling=10, solver=saga 
[CV]  C=1477.8918800354004, fit_intercept=False, intercept_scaling=10, solver=saga, score=0.5784736546034127, total=   0.4s
[CV] C=1477.8918800354004, fit_intercept=False, intercept_scaling=10, solver=saga 
[CV]  C=1477.8918800354004, fit_intercept=False, intercept_scaling=10, solver=saga, score=0.5732233264579036, total=   0.4s
[CV] C=1477.8918800354004, fit_intercept=False, intercept_scaling=25, solver=liblinear 
[CV]  C=1477.8918800354004, fit_intercept=False, intercept_scaling=25, solver=liblinear, score=0.567116610423697, total=   0.4s
[CV] C=1477.8918800354004, fit_int

[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed:  3.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 2.25, 5.0625, 11.390625, 25.62890625, 57.6650390625, 129.746337890625, 291.92926025390625, 656.8408355712891, 1477.8918800354004], 'fit_intercept': [True, False], 'intercept_scaling': [1, 5, 10, 25, 50, 100], 'solver': ['liblinear', 'saga']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [54]:
# best_score_ is the mean cross-validated score of the best candidate, computed
# on folds of the training data. It is not an accuracy on the held-out test
# split, so it is labelled as a CV score.
print(f"Mean CV Accuracy: {log_grid_search.best_score_*100}")
print(f"Optimal Parameters: {log_grid_search.best_params_}")

Testing Accuracy: 61.38125
Optimal Parameters: {'C': 2.25, 'fit_intercept': True, 'intercept_scaling': 1, 'solver': 'liblinear'}


In [55]:
# The search was created with the default refit=True, so best_estimator_ has
# already been refitted on the full training split with the winning
# parameters. Fitting it again here would only repeat that work.
lr_class = log_grid_search.best_estimator_

lr_train_preds = lr_class.predict(tf_idf_data_train)
lr_test_preds = lr_class.predict(tf_idf_data_test)

# Compare accuracy on the fitted data against the held-out split.
lr_train_score = accuracy_score(y_train, lr_train_preds)
lr_test_score = accuracy_score(y_test, lr_test_preds)

print('Logistic Regression with GridSearch')
print(f"Training Accuracy: {lr_train_score} \t\t Testing Accuracy: {lr_test_score}")

Logistic Regression with GridSearch
Training Accuracy: 0.8189375 		 Testing Accuracy: 0.63275


## Deep Learning and Keras

In [13]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

Using TensorFlow backend.


In [15]:
# One-hot encode the city labels for the Keras model.
# Encode from main_df.City (the column y was originally taken from) rather
# than from y itself: `y = f(y)` is not idempotent, so re-running the cell
# would try to encode the already-encoded array.
y = pd.get_dummies(main_df.City).values

In [None]:
# NOTE(review): disabled variant of the tokenisation step in the next cell
# (20000-word vocabulary, fitted on main_df.tweet instead of X). Presumably
# superseded by that cell -- remove once confirmed.
# tokenizer = text.Tokenizer(num_words=20000)
# tokenizer.fit_on_texts(list(main_df.tweet))
# list_tokenized_headlines = tokenizer.texts_to_sequences(main_df.tweet)
# X_t = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)

In [33]:
# Tokenisation settings, named so they are visible in one place. They have to
# agree with the network definition: NUM_WORDS is the Embedding input
# dimension and MAX_TWEET_LEN is the length of the Input layer.
NUM_WORDS = 1000
MAX_TWEET_LEN = 100

# Build the word index from the cleaned tweets; num_words limits which word
# indices are kept when texts are converted to sequences.
tokenizer = text.Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(X)

# Integer-encode every tweet, then pad/truncate each sequence to a fixed
# length so the result is a rectangular array the network can consume.
list_tokenized_tweets = tokenizer.texts_to_sequences(X)
X_t = sequence.pad_sequences(list_tokenized_tweets, maxlen=MAX_TWEET_LEN)

In [40]:
embedding_size = 500

# Take the sequence length and vocabulary size from the objects produced by
# the tokenisation cell instead of repeating the literals, so the network
# cannot drift out of sync with the padded input.
sequence_length = X_t.shape[1]
vocabulary_size = tokenizer.num_words

input_ = Input(shape=(sequence_length,))
x = Embedding(vocabulary_size, embedding_size)(input_)
# return_sequences=True keeps one LSTM output per time step so that the
# global max-pool below can reduce over the whole sequence.
x = LSTM(50, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
# The targets are one-hot rows over 2 mutually exclusive classes, so the
# output layer has 2 units with softmax: the two outputs form a single
# probability distribution instead of two independent sigmoid scores.
x = Dense(2, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [41]:
# The targets are one-hot rows with exactly one active class per sample, so
# train with categorical cross-entropy. With binary_crossentropy Keras
# resolves 'accuracy' to per-output binary accuracy, which scores the two
# output units independently instead of scoring the predicted class.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
# Print the layer-by-layer summary (output shapes and parameter counts) to
# sanity-check the architecture before training.
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 100, 500)          500000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100, 50)           110200    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 50)                0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_10 (Dropout)         (None, 50)                0   

In [43]:
# Keras' validation_split takes the *last* fraction of the rows, before any
# shuffling. If the tweets are stored grouped by city (not visible from this
# cell), that hold-out would be dominated by one class and the reported
# validation accuracy would be meaningless. Build a shuffled, stratified
# hold-out explicitly instead, with a fixed seed for reproducibility.
X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(
    X_t,
    y,
    test_size=0.2,
    stratify=y.argmax(axis=1),  # class index of each one-hot row
    random_state=42,
)

# Keep the History object so the per-epoch loss/accuracy can be inspected or
# plotted afterwards instead of being discarded.
history = model.fit(
    X_nn_train,
    y_nn_train,
    epochs=10,
    batch_size=2000,
    validation_data=(X_nn_val, y_nn_val),
)

Train on 8006 samples, validate on 2002 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a3d9eeac8>