In [13]:
# Usual imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

# Python's OS Package
import os

# sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import stop_words
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# nltk imports
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

ModuleNotFoundError: No module named 'scikitplot'

# Importing Data

In [2]:
df = pd.read_csv('nov2.1_df.csv')
df.drop(columns='Unnamed: 0', inplace=True)

In [3]:
df.head()

Unnamed: 0,target,text,time,sia_positive,sia_negative,sia_neutral,sia_compound
0,0,RT @JulianCastro: My grandmother was a domesti...,10/29/2019 22:56,0.0,0.0,1.0,0.0
1,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,10/29/2019 22:56,0.0,0.0,1.0,0.0
2,1,Getty Fire Ignited by Power Line in Sepulveda ...,10/29/2019 22:56,0.0,0.211,0.789,-0.34
3,1,"RT @latimes: In an ominous new warning, the Na...",10/29/2019 22:56,0.0,0.202,0.798,-0.5859
4,1,Arson investigators from the Los Angeles Fire ...,10/29/2019 22:56,0.066,0.122,0.812,-0.2732


In [4]:
df['mention'] = np.where(df['text'].str.contains('@'), 1, 0)
df['https'] = np.where(df['text'].str.contains('http'), 1,0)
df['RT'] = np.where(df['text'].str.contains('RT'), 1,0)
df.head()

Unnamed: 0,target,text,time,sia_positive,sia_negative,sia_neutral,sia_compound,mention,https,RT
0,0,RT @JulianCastro: My grandmother was a domesti...,10/29/2019 22:56,0.0,0.0,1.0,0.0,1,0,1
1,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,10/29/2019 22:56,0.0,0.0,1.0,0.0,1,0,1
2,1,Getty Fire Ignited by Power Line in Sepulveda ...,10/29/2019 22:56,0.0,0.211,0.789,-0.34,0,1,0
3,1,"RT @latimes: In an ominous new warning, the Na...",10/29/2019 22:56,0.0,0.202,0.798,-0.5859,1,0,1
4,1,Arson investigators from the Los Angeles Fire ...,10/29/2019 22:56,0.066,0.122,0.812,-0.2732,0,1,0


In [5]:
df['target'].value_counts(normalize = True)

0    0.766858
1    0.233142
Name: target, dtype: float64

## X/Y Values

In [6]:
X = df['text']
y = df['target']

## Train Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=248)

In [8]:
custom_stop = list(ENGLISH_STOP_WORDS)
custom_stop.extend(["b'RT",
                    "000",
                    "x80",
                    "x82"
                    "x8f",
                    "x99",
                    "x94",
                    "x98",
                    "x99m", 
                    "x99s",
                    "x9d",
                    "x9f",
                    "xa5",
                    "xa6",
                    "xa6'RT",
                    "xa6'b'RT",
                    "xa6'b'",
                    "xb8",
                    "xe2",
                    "xef",
                    "xf0",
                    "amp",
                    'angele',
                    'angeles',
                    "b'",
                    "Center",
                    'center',
                    'com',
                    "Getty",
                    'getty',
                    'gettyfire',
                    "Getty Center",
                    "GettyFire",
                    "instagram"
                    "htpps",
                    "http",
                    "nhttp", 
                    "nhttps",
                    'los',
                    "Los Angeles",
                    "Los Angele",
                    "Los", "Angele",
                    "outfit",
                    'rt',
                    "taco",
                    "truck",
                    "taco truck",
                    "www"
                    
                    
            ])

## Model

In [9]:
pipe_1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

pipe_2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

pipe_3 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('bt', BaggingClassifier())
])

pipe_4 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

pipe_ngram = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

In [10]:
pipe_1_params = {
    'tfidf__stop_words': [custom_stop],
    'tfidf__max_features': [300, 500, 1000],
    'tfidf__min_df': [10, 25, 50],
    'tfidf__max_df': [0.7, 0.8, 0.9],
    'lr__C': [0.001, 0.01, 1]
}

pipe_2_params = {
    'tfidf__stop_words': [custom_stop],
    'tfidf__max_features': [300, 500, 1000],
    'tfidf__min_df': [10, 25, 50],
    'tfidf__max_df': [0.7, 0.8, 0.9],
    'rf__max_depth' : [None, 1, 2],
#     'rf__min_samples_split' : [0.2, 0.4, 1],
    
}

pipe_3_params = {
    'tfidf__stop_words': [custom_stop],
    'tfidf__max_features': [1500, 1700, 2000],
    'tfidf__min_df': [35, 37, 42],
    'tfidf__max_df': [0.85, 0.9],
    'bt__n_estimators' : [1, 2, 3],
    'bt__max_samples' : [1, 2, 3],
    'bt__max_features' : [1, 2, 3],
}

pipe_4_params = {
    'tfidf__stop_words': [custom_stop],
    'tfidf__max_features': [1500, 1700, 2000],
    'tfidf__min_df': [35, 37, 42],
    'tfidf__max_df': [0.85, 0.9],
    'knn__n_neighbors': [3, 5, 7],
    'knn__leaf_size': [20, 30, 50],
#     'knn__p': [2, 3, 4],
}

param_ngram = {
    'cvec__stop_words': [custom_stop],
    'cvec__ngram_range': [(2,3), (3,3)]
}

In [11]:
gs_1 = GridSearchCV(pipe_1,
                   pipe_1_params,
                   cv = 5,
                   verbose = 1,
                   n_jobs=-1)
print('gs_1 completed')

gs_2 = GridSearchCV(pipe_2,
                   pipe_2_params,
                   cv = 5,
                   verbose = 1,
                   n_jobs=-1)
print('gs_2 completed')

gs_3 = GridSearchCV(pipe_3,
                   pipe_3_params,
                   cv = 5,
                   verbose = 1,
                   n_jobs=-1)
print('gs_3 completed')

gs_4 = GridSearchCV(pipe_4,
                   pipe_4_params,
                   cv = 5,
                   verbose = 1,
                   n_jobs=-1)
print('gs_4 completed')

gs_n_gram = GridSearchCV(pipe_ngram,
                        param_ngram,
                        cv = 5,
                        verbose = 1,
                        n_jobs=-1)
print('n_gram completed')

gs_1 completed
gs_2 completed
gs_3 completed
gs_4 completed
n_gram completed


In [12]:
gs_1.fit(X_train, y_train)
print(f'gs_1 best train score: {gs_1.best_score_}. gs_1 best test score: {gs_1.best_estimator_.score(X_test, y_test)}. The best train model from this grid search is {gs_1.best_estimator_}.')

gs_2.fit(X_train, y_train)
print(f'gs_2 best train score: {gs_2.best_score_}. gs_2 best test score: {gs_2.best_estimator_.score(X_test, y_test)}. The best model from this grid search is {gs_2.best_estimator_}')

gs_3.fit(X_train, y_train)
print(f'gs_3 best train score: {gs_3.best_score_}. gs_3 best test score: {gs_3.best_estimator_.score(X_test, y_test)}. The best model from this grid search is {gs_3.best_estimator_}')

gs_4.fit(X_train, y_train)
print(f'gs_4 best train score: {gs_4.best_score_}. gs_4 best test score: {gs_4.best_estimator_.score(X_test, y_test)}. The best model from this grid search is {gs_4.best_estimator_}')

gs_n_gram.fit(X_train, y_train)
print(f'gs_n_gram best train score: {gs_n_gram.best_score_}. gs_n_gram best test score {gs_n_gram.best_estimator_.score(X_test, y_test)}. The best model from this grid search is {gs_n_gram.best_estimator_}')

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.2min finished


gs_1 best train score: 0.9140765827279183. gs_1 best test score: 0.9116827789611667. The best train model from this grid search is Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.7, max_features=1000,
                                 min_df=10, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['cant', 'name', 'whereby',
                                             'everywhere', 'whenever', 'or',
                                             'to...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
   

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.9min finished


gs_2 best train score: 0.9155514284153602. gs_2 best test score: 0.9229886940848763. The best model from this grid search is Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.9, max_features=1000,
                                 min_df=10, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['cant', 'name', 'whereby',
                                             'everywhere', 'whenever', 'or',
                                             'to...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed: 21.8min finished


gs_3 best train score: 0.7697055771016551. gs_3 best test score: 0.766835982303785. The best model from this grid search is Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.85, max_features=1700,
                                 min_df=35, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['cant', 'name', 'whereby',
                                             'everywhere', 'whenever', 'or',
                                             't...
                                             'see', 'six', ...],
                                 strip_accents=None, sublinear_tf=False,
             

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 16.7min finished


gs_4 best train score: 0.8821762167476921. gs_4 best test score: 0.8761264951663117. The best model from this grid search is Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.85, max_features=1500,
                                 min_df=42, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['cant', 'name', 'whereby',
                                             'everywhere', 'whenever', 'or',
                                             't...
                                             'anyway', 'than', 'fifty', 'had',
                                             'since', 'ten', 'please', 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   17.0s finished


gs_n_gram best train score: 0.8668268968154258. gs_n_gram best test score 0.8700639029985253. The best model from this grid search is Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 3), preprocessor=None,
                                 stop_words=['cant', 'name', 'whereby',
                                             'everywhere', 'whenever', 'or',
                                             'to', 'here', 'her', 'almost',
                                             'inter...
                                             'see', 'six', ...],
                                 strip_accents=None,
         

## Confusion Matrix

## ROC AUC Cuve