In [20]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB,BernoulliNB
import time

In [21]:
thunder_3y = pd.read_csv('datasets/thunder/tc_3yr_10com.csv')
thunder_2y = pd.read_csv('datasets/thunder/tc_2yr_10com.csv')
thunder_1y = pd.read_csv('datasets/thunder/tc_1yr_10com.csv')

In [22]:
bulls_3y = pd.read_csv('datasets/bulls/bc_3yrs_10com.csv')
bulls_2y = pd.read_csv('datasets/bulls/bc_2yrs_10com.csv')
bulls_1y = pd.read_csv('datasets/bulls/bc_1yrs_10com.csv')

In [23]:
thunder_df = pd.concat([thunder_1y,thunder_2y,thunder_3y])

In [24]:
thunder_df.shape

(7309, 6)

In [25]:
bulls_df = pd.concat([bulls_3y,bulls_2y,bulls_1y])

In [26]:
bulls_df.shape

(8869, 6)

In [27]:
thunder_df['is_thunder'] = 1
bulls_df['is_thunder'] = 0

In [28]:
#code taken from Heather Robbins lesson

thunder_df['body+title']=thunder_df['body+title'].replace('http\S+', '', regex=True).replace('www\S+', '', regex=True).replace('\n\n\S+', '', regex=True).replace('\n\S+','',regex=True)

In [29]:

bulls_df['body+title']=bulls_df['body+title'].replace('http\S+', '', regex=True).replace('www\S+', '', regex=True).replace('\n\n\S+', '', regex=True).replace('\n\S+','',regex=True)

In [30]:
combined_df = pd.concat([thunder_df,bulls_df])

In [31]:
X = combined_df['body+title']
y = combined_df['is_thunder']

In [32]:
combined_df['is_thunder'].value_counts(normalize=True)

0    0.548214
1    0.451786
Name: is_thunder, dtype: float64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=.4,
                                                    random_state=42)

In [126]:
from sklearn.feature_extraction import text 

stop_words = text.ENGLISH_STOP_WORDS.union(['removed','deleted'])


In [127]:
stop_words;

# Pipelines with Logistic Regression

In [128]:
cv_pipe_log = Pipeline(
    [
        ('cvec',CountVectorizer()),
        ('logreg',LogisticRegression())
    ]
)

tv_pipe_log = Pipeline(
    [
        ('tvec',TfidfVectorizer()),
        ('logreg',LogisticRegression())
    ]
)

In [129]:
cv_pipe_log_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',
                        stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'logreg__penalty': ['l1','l2'],
    'logreg__n_jobs':[-1],
    'logreg__C':[.01,.5,1]
}

tv_pipe_log_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'logreg__penalty': ['l1','l2'],
    'logreg__n_jobs':[-1],
    'logreg__C':[.01,.5,1]
}

In [130]:
t0=time.time()
gs = GridSearchCV(cv_pipe_log, param_grid=cv_pipe_log_params, cv=3)
gs.fit(X_train, y_train)
print(time.time()-t0)
print(gs.best_score_)
gs.best_params_

  " = {}.".format(effective_n_jobs(self.n_jobs)))


203.4642448425293
0.9118071296105502


{'cvec__max_features': 15000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
            'becoming',
            'been',
            'before',
    

In [131]:
t0=time.time()
gs = GridSearchCV(tv_pipe_log, param_grid=tv_pipe_log_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.9160313208324747
185.87433290481567


{'logreg__C': 1,
 'logreg__n_jobs': -1,
 'logreg__penalty': 'l2',
 'tvec__max_features': 15000,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': 'english'}

In [132]:
combined_df.shape

(16178, 7)

# Pipelines with Naive Bayes Models

In [133]:
cv_pipe_mnb = Pipeline(
    [
        ('cvec',CountVectorizer()),

        ('mnb',MultinomialNB())
    ]
)

tv_pipe_mnb = Pipeline(
    [
        ('tvec',TfidfVectorizer()),

        ('mnb',MultinomialNB())
    ]
)

In [141]:
cv_pipe_mnb_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'mnb__alpha':[0.01,0.5,1],
}

tv_pipe_mnb_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'mnb__alpha':[0.01,0.5,1]
}

In [142]:
t0=time.time()
gs = GridSearchCV(cv_pipe_mnb,param_grid=cv_pipe_mnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

0.9060375025757264
72.72567319869995


{'cvec__max_features': 15000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'mnb__alpha': 1}

In [143]:
t0=time.time()
gs = GridSearchCV(tv_pipe_mnb,param_grid=tv_pipe_mnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.910261693797651
74.26057291030884


{'mnb__alpha': 1,
 'tvec__max_features': 15000,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': 'english'}

In [144]:
cv_pipe_bnb = Pipeline(
    [
        ('cvec',CountVectorizer()),

        ('bnb',BernoulliNB())
    ]
)

tv_pipe_bnb = Pipeline(
    [
        ('tvec',TfidfVectorizer()),

        ('bnb',BernoulliNB())
    ]
)

In [145]:
cv_pipe_bnb_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'bnb__alpha':[0.01,0.5,1],
}

tv_pipe_bnb_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'bnb__alpha':[0.01,0.5,1]
}

In [146]:
t0=time.time()
gs = GridSearchCV(cv_pipe_bnb,param_grid=cv_pipe_bnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

0.8948073356686586
73.96291494369507


{'bnb__alpha': 1,
 'cvec__max_features': 15000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
            'becoming',
            'been',
        

In [147]:
t0=time.time()
gs = GridSearchCV(tv_pipe_bnb,param_grid=tv_pipe_bnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.8948073356686586
76.70315432548523


{'bnb__alpha': 1,
 'tvec__max_features': 15000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
            'becoming',
            'been',
        