In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB,BernoulliNB
import time

In [2]:
thunder_3y = pd.read_csv('../datasets/thunder/tc_3yr_4com.csv')
thunder_2y = pd.read_csv('../datasets/thunder/tc_2yr_4com.csv')
thunder_1y = pd.read_csv('../datasets/thunder/tc_1yr_4com.csv')

In [3]:
bulls_3y = pd.read_csv('../datasets/bulls/bc_3yrs_4com.csv')
bulls_2y = pd.read_csv('../datasets/bulls/bc_2yrs_4com.csv')
bulls_1y = pd.read_csv('../datasets/bulls/bc_1yrs_4com.csv')

In [4]:
thunder_df = pd.concat([thunder_1y,thunder_2y,thunder_3y])

In [5]:
thunder_df.shape

(12707, 6)

In [6]:
bulls_df = pd.concat([bulls_3y,bulls_2y,bulls_1y])

In [7]:
bulls_df.shape

(11891, 6)

In [8]:
thunder_df['is_thunder'] = 1
bulls_df['is_thunder'] = 0

In [9]:
#code taken from heather robbins lesson

thunder_df['body+title']=thunder_df['body+title'].replace('http\S+', '', regex=True).replace('www\S+', '', regex=True).replace('\n\n\S+', '', regex=True).replace('\n\S+','',regex=True)

In [10]:

bulls_df['body+title']=bulls_df['body+title'].replace('http\S+', '', regex=True).replace('www\S+', '', regex=True).replace('\n\n\S+', '', regex=True).replace('\n\S+','',regex=True)

In [11]:
thunder_df.shape

(12707, 7)

In [12]:
bulls_df.shape

(11891, 7)

In [13]:
combined_df = pd.concat([thunder_df,bulls_df])

In [14]:
X = combined_df['body+title']
y = combined_df['is_thunder']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=42)

In [16]:

from sklearn.feature_extraction import text 

stop_words = text.ENGLISH_STOP_WORDS.union(['deleted','removed'])

In [17]:
stop_words;

In [18]:
#pipelines with logistic regression predictor

# Pipeline with Logistic Regression

In [19]:
cv_pipe_log = Pipeline(
    [
        ('cvec',CountVectorizer()),
        ('logreg',LogisticRegression())
    ]
)

tv_pipe_log = Pipeline(
    [
        ('tvec',TfidfVectorizer()),
        ('logreg',LogisticRegression())
    ]
)

In [20]:
cv_pipe_log_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'logreg__penalty': ['l1','l2'],
    'logreg__n_jobs':[-1],
    'logreg__C':[.01,.5,1]
}

tv_pipe_log_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'logreg__penalty': ['l1','l2'],
    'logreg__n_jobs':[-1],
    'logreg__C':[.01,.5,1]
}

In [21]:
X_train.shape,y_train.shape

((14758,), (14758,))

In [74]:
t0=time.time()
gs = GridSearchCV(cv_pipe_log, param_grid=cv_pipe_log_params, cv=3)
gs.fit(X_train, y_train)
print(time.time()-t0)
print(gs.best_score_)
gs.best_params_
#0.9108

  " = {}.".format(effective_n_jobs(self.n_jobs)))


255.73002696037292
0.910828025477707


{'cvec__max_features': 15000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
            'becoming',
            'been',
            'before',
    

In [75]:
t0=time.time()
gs = GridSearchCV(tv_pipe_log, param_grid=tv_pipe_log_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.9136061796991463
209.40227627754211


{'logreg__C': 1,
 'logreg__n_jobs': -1,
 'logreg__penalty': 'l2',
 'tvec__max_features': 15000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

# Pipelines with Naive Bayes models

In [77]:
# pipelines in MNB model
cv_pipe_mnb = Pipeline(
    [
        ('cvec',CountVectorizer()),
        ('mnb',MultinomialNB())
    ]
)

tv_pipe_mnb = Pipeline(
    [
        ('tvec',TfidfVectorizer()),
        ('mnb',MultinomialNB())
    ]
)

In [78]:
cv_pipe_mnb_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'mnb__alpha':[0.01,0.5,1],
}

tv_pipe_mnb_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'mnb__alpha':[0.01,0.5,1]
}

In [79]:
t0=time.time()
gs = GridSearchCV(cv_pipe_mnb,param_grid=cv_pipe_mnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.9100826670280526
104.763356924057


{'cvec__max_features': 15000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'mnb__alpha': 1}

In [80]:
t0=time.time()
gs = GridSearchCV(tv_pipe_mnb,param_grid=tv_pipe_mnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.9054072367529475
104.89422297477722


{'mnb__alpha': 0.5,
 'tvec__max_features': 15000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

In [81]:
cv_pipe_bnb = Pipeline(
    [
        ('cvec',CountVectorizer()),
        ('bnb',BernoulliNB())
    ]
)

tv_pipe_bnb = Pipeline(
    [
        ('tvec',TfidfVectorizer()),
        ('bnb',BernoulliNB())
    ]
)

In [82]:
cv_pipe_bnb_params ={
    'cvec__max_features':[5000,10000,15000],
    'cvec__stop_words':['english',stop_words],
    'cvec__ngram_range':[(1,1),(1,2)],
    'bnb__alpha':[0.01,0.5,1],
}

tv_pipe_bnb_params ={
    'tvec__max_features':[5000,10000,15000],
    'tvec__stop_words':['english',stop_words],
    'tvec__ngram_range':[(1,1),(1,2)],
    'bnb__alpha':[0.01,0.5,1]
}

In [83]:
t0=time.time()
gs = GridSearchCV(cv_pipe_bnb,param_grid=cv_pipe_bnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.8930749424041198
104.69995093345642


{'bnb__alpha': 0.01,
 'cvec__max_features': 5000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [84]:
t0=time.time()
gs = GridSearchCV(tv_pipe_bnb,param_grid=tv_pipe_bnb_params,cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_


0.8930749424041198
104.54622983932495


{'bnb__alpha': 0.01,
 'tvec__max_features': 5000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

In [85]:
combined_df.shape

(24598, 7)