In [21]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier

import time

In [22]:
thunder_3y = pd.read_csv('../datasets/thunder/tc_3yr_1com.csv')
thunder_2y = pd.read_csv('../datasets/thunder/tc_2yr_1com.csv')
thunder_1y = pd.read_csv('../datasets/thunder/tc_1yr_1com.csv')

In [23]:
bulls_3y = pd.read_csv('../datasets/bulls/bc_3yrs_1com.csv')
bulls_2y = pd.read_csv('../datasets/bulls/bc_2yrs_1com.csv')
bulls_1y = pd.read_csv('../datasets/bulls/bc_1yrs_1com.csv')

In [24]:
thunder_df = pd.concat([thunder_1y,thunder_2y,thunder_3y])

In [25]:
thunder_df.shape

(17290, 6)

In [26]:
bulls_df = pd.concat([bulls_3y,bulls_2y,bulls_1y])

In [27]:
bulls_df.shape

(14720, 6)

In [28]:
bulls_df.head();

In [29]:
thunder_df['is_thunder'] = 1
bulls_df['is_thunder'] = 0

In [30]:
thunder_df.head();

In [46]:
thunder_df['body+title'];

In [32]:
#code taken from Heather Robbins lesson

thunder_df['body+title']=thunder_df['body+title'].replace('http\S+', '', regex=True).replace('www\S+', '', regex=True).replace('\n\n\S+', '', regex=True).replace('\n\S+','',regex=True)




In [47]:
thunder_df['body+title'];

In [34]:

bulls_df['body+title']=bulls_df['body+title'].replace('http\S+', '', regex=True).replace('www\S+', '', regex=True).replace('\n\n\S+', '', regex=True).replace('\n\S+','',regex=True)

In [35]:
combined_df = pd.concat([thunder_df,bulls_df])

In [36]:
combined_df.shape

(32010, 7)

In [37]:
combined_df.duplicated().sum()

0

In [38]:
X = combined_df['body+title']
y = combined_df['is_thunder']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.4,random_state=42)

In [40]:
#adding additional words to stopwords
#code from
#https://stackoverflow.com/questions/24386489/adding-words-to-scikit-learns-countvectorizers-stop-list

from sklearn.feature_extraction import text 

stop_words = text.ENGLISH_STOP_WORDS.union(['deleted','removed'])





# Pipelines with Logistic Regression

In [45]:
#pipelines with logistic regression predictor

In [46]:
cv_pipe_log = Pipeline(
    [
        ('cvec',CountVectorizer()),
        ('logreg',LogisticRegression())
    ]
)

tv_pipe_log = Pipeline(
    [
        ('tvec',TfidfVectorizer()),
        ('logreg',LogisticRegression())
    ]
)

In [47]:
cv_pipe_log_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',
                        stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'logreg__penalty': ['l1','l2'],
    'logreg__n_jobs':[-1],
    'logreg__C':[.01,.5,1]
}

tv_pipe_log_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'logreg__penalty': ['l1','l2'],
    'logreg__n_jobs':[-1],
    'logreg__C':[.01,.5,1]
}

In [48]:
t0=time.time()
gs = GridSearchCV(cv_pipe_log, param_grid=cv_pipe_log_params, cv=3)
gs.fit(X_train, y_train)
print(time.time()-t0)
print(gs.best_score_)
gs.best_params_
#0.909

  " = {}.".format(effective_n_jobs(self.n_jobs)))


314.9114422798157
0.9091429761532854


{'cvec__max_features': 15000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'logreg__C': 0.5,
 'logreg__n_jobs': -1,
 'logreg__penalty': 'l2'}

In [49]:
t0=time.time()
gs = GridSearchCV(tv_pipe_log, param_grid=tv_pipe_log_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.9117983963344788
242.40239214897156


{'logreg__C': 1,
 'logreg__n_jobs': -1,
 'logreg__penalty': 'l2',
 'tvec__max_features': 15000,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
    

# Pipelines for Naive Bayes models
### Multinomial

In [50]:
# pipelines in MNB model
cv_pipe_mnb = Pipeline(
    [
        ('cvec',CountVectorizer()),

        ('mnb',MultinomialNB())
    ]
)

tv_pipe_mnb = Pipeline(
    [
        ('tvec',TfidfVectorizer()),

        ('mnb',MultinomialNB())
    ]
)

In [51]:
cv_pipe_mnb_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'mnb__alpha':[0.01,0.5,1],
}

tv_pipe_mnb_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'mnb__alpha':[0.01,0.5,1]
}

In [52]:
t0=time.time()
gs = GridSearchCV(cv_pipe_mnb,param_grid=cv_pipe_mnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.9058106841611996
117.24808502197266


{'cvec__max_features': 15000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'mnb__alpha': 1}

In [53]:
t0=time.time()
gs = GridSearchCV(tv_pipe_mnb,param_grid=tv_pipe_mnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.9025304592314901
117.97853803634644


{'mnb__alpha': 1,
 'tvec__max_features': 5000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

### Binomial Naive Bayes

In [54]:
cv_pipe_bnb = Pipeline(
    [
        ('cvec',CountVectorizer()),

        ('bnb',BernoulliNB())
    ]
)

tv_pipe_bnb = Pipeline(
    [
        ('tvec',TfidfVectorizer()),

        ('bnb',BernoulliNB())
    ]
)

In [55]:
cv_pipe_bnb_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'bnb__alpha':[0.01,0.5,1],
}

tv_pipe_bnb_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'bnb__alpha':[0.01,0.5,1]
}

In [56]:
t0=time.time()
gs = GridSearchCV(cv_pipe_bnb,param_grid=cv_pipe_bnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.8962303446839529
117.47308397293091


{'bnb__alpha': 0.01,
 'cvec__max_features': 5000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
            'becoming',
            'been',
      

In [57]:
t0=time.time()
gs = GridSearchCV(tv_pipe_bnb,param_grid=tv_pipe_bnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

0.8962303446839529
119.99419903755188


{'bnb__alpha': 0.01,
 'tvec__max_features': 5000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
            'becoming',
            'been',
      

# Random Forest

### Due to computational cost these models will not be run for the other data sets

In [41]:
cv_pipe_rf = Pipeline(
    [
        ('cvec',CountVectorizer()),

        ('rf',RandomForestClassifier(random_state=13,n_jobs=-1))
    ]
)

tv_pipe_rf = Pipeline(
    [
        ('tvec',TfidfVectorizer()),

        ('rf',RandomForestClassifier(random_state=13,n_jobs=-1))
    ]
)

In [42]:
cv_pipe_rf_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'rf__max_depth':[3,5,10],
    'rf__min_samples_leaf':[1,5],
    'rf__n_estimators':[10,100]
}

tv_pipe_rf_params ={
    'tvec__max_features':[5000,10000,15000
                          ],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'rf__max_depth':[3,5,10],
    'rf__min_samples_leaf':[1,5],
    'rf__n_estimators':[10,100]
}

In [44]:
t0=time.time()
gs = GridSearchCV(cv_pipe_rf,param_grid=cv_pipe_rf_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

0.8212537748620223
595.181755065918


{'cvec__max_features': 5000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rf__max_depth': 10,
 'rf__min_samples_leaf': 1,
 'rf__n_estimators': 100}

In [45]:
t0=time.time()
gs = GridSearchCV(tv_pipe_rf,param_grid=tv_pipe_rf_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

0.823336457357076
596.2778990268707


{'rf__max_depth': 10,
 'rf__min_samples_leaf': 1,
 'rf__n_estimators': 100,
 'tvec__max_features': 5000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}