In [62]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
%matplotlib inline

from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import  MultinomialNB

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


### Reading in the Data

In [2]:
df = pd.read_csv('dataset/final_data.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

### Setting x and y variables

In [3]:
X = df['combined_title_&_comment']
y = df['subreddit']

### Train Test Split 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

## 1. TFIDF and Logistic Regression 

In [5]:
pipe_1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr', LogisticRegression())])

pipe_1_params = {
    'tf__max_features': [2500, 3000, 3500],
    'tf__min_df': [2, 3],
    'tf__max_df': [0.9, 0.95],
    'tf__ngram_range': [(1,1), (1,2)],
    'lr__penalty' : ['l1', 'l2'],
    'lr__C': np.logspace(-4, 1, 4, 20),
    'lr__random_state': [40, 100] }

In [6]:
gs_1 = GridSearchCV(pipe_1, pipe_1_params, cv =3, verbose =1  )

In [7]:
gs_1.fit(X_train, y_train)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1152 out of 1152 | elapsed:  5.8min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tf__max_features': [2500, 3000, 3500], 'tf__min_df': [2, 3], 'tf__max_df': [0.9, 0.95], 'tf__ngram_range': [(1, 1), (1, 2)], 'lr__penalty': ['l1', 'l2'], 'lr__C': array([1.00000e-04, 4.64159e-03, 2.15443e-01, 1.00000e+01]), 'lr__random_state': [40, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [30]:
gs_1.best_score_

0.6986666666666667

In [73]:
gs_1.best_params_

{'lr__C': 10.0,
 'lr__penalty': 'l2',
 'lr__random_state': 40,
 'tf__max_df': 0.9,
 'tf__max_features': 2500,
 'tf__min_df': 2,
 'tf__ngram_range': (1, 1)}

## 2. CV and Logistic Regression

In [31]:
pipe_2 = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())])

pipe_2_params = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [0.9, 0.95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'lr__penalty' : ['l1', 'l2'],
    'lr__C': np.logspace(-4, 1, 4, 20),
    'lr__random_state': [40, 100] }

In [32]:
gs_2 = GridSearchCV(pipe_2, pipe_2_params, cv = 3, verbose =1 )

In [33]:
gs_2.fit(X_train, y_train)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1152 out of 1152 | elapsed:  5.8min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__max_features': [2500, 3000, 3500], 'cvec__min_df': [2, 3], 'cvec__max_df': [0.9, 0.95], 'cvec__ngram_range': [(1, 1), (1, 2)], 'lr__penalty': ['l1', 'l2'], 'lr__C': array([1.00000e-04, 4.64159e-03, 2.15443e-01, 1.00000e+01]), 'lr__random_state': [40, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [34]:
gs_2.best_score_

0.688

## 3. TFIDF and RandomForest 

In [9]:
pipe_3 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
    ])

pipe_3_params = {
    'tf__max_features': [2500, 3000, 3500],
    'tf__min_df': [2,3],
    'tf__max_df': [0.9, 0.95], 
    'tf__ngram_range': [(1,1), (1,2)],
    'rf__bootstrap' : [True, False],
    'rf__max_depth' : [10, 50, 100],
    'rf__min_samples_leaf': [1,2,4],
    'rf__min_samples_split': [2, 10], 
    'rf__n_estimators': [200, 500, 1000]
}

In [15]:
rs_3 = RandomizedSearchCV(pipe_3, pipe_3_params, cv =5, verbose =1, n_jobs=-1  )

In [16]:
rs_3.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   23.7s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'tf__max_features': [2500, 3000, 3500], 'tf__min_df': [2, 3], 'tf__max_df': [0.9, 0.95], 'tf__ngram_range': [(1, 1), (1, 2)], 'rf__bootstrap': [True, False], 'rf__max_depth': [10, 50, 100], 'rf__min_samples_leaf': [1, 2, 4], 'rf__min_samples_split': [2, 10], 'rf__n_estimators': [200, 500, 1000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='war

In [14]:
rs_3.best_score_

0.662

# 4. CV and Random Forest 

In [None]:
pipe_4 = Pipeline([
    ('cvec', CountVectorizer()),
     ('rf', RandomForestClassifier())
    ])

## 5. TFIDF and Extratrees 

In [19]:
pipe_5 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('et', ExtraTreesClassifier())
])

pipe_5_params = {'tf__max_features': [2500, 3000, 3500],
    'tf__min_df': [2,3],
    'tf__max_df': [0.9, 0.95], 
    'tf__ngram_range': [(1,1), (1,2)],
    'et__max_depth' : [10, 50, 100],
    'et__min_samples_leaf': [1,2,4],
    'et__min_samples_split': [2, 5, 10], 
    'et__n_estimators': [200, 500, 1000]}
    


In [20]:
gs_5 = RandomizedSearchCV(pipe_5, pipe_5_params, cv = 3, verbose =1, n_jobs = -1)

In [21]:
gs_5.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   10.8s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...s='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'tf__max_features': [2500, 3000, 3500], 'tf__min_df': [2, 3], 'tf__max_df': [0.9, 0.95], 'tf__ngram_range': [(1, 1), (1, 2)], 'et__max_depth': [10, 50, 100], 'et__min_samples_leaf': [1, 2, 4], 'et__min_samples_split': [2, 5, 10], 'et__n_estimators': [200, 500, 1000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [22]:
gs_5.best_score_

0.6646666666666666

## 6. TFIDF and Bagging 

In [25]:
pipe_6 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('bag', ExtraTreesClassifier())
])

pipe_6_params = {
    'tf__min_df': [2,3],
    'tf__max_df': [0.9, 0.95], 
    'tf__ngram_range': [(1,1), (1,2)],
    'bag__max_features' : [100, 200],
    'bag__bootstrap': [True, False]}


In [26]:
gs_6 = RandomizedSearchCV(pipe_6, pipe_6_params, cv =3, verbose =1, n_jobs= -1)

In [27]:
gs_6.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.3s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...s='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'tf__min_df': [2, 3], 'tf__max_df': [0.9, 0.95], 'tf__ngram_range': [(1, 1), (1, 2)], 'bag__max_features': [100, 200], 'bag__bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [28]:
gs_6.best_score_

0.6546666666666666

## 7. TFIDF and SVC 

In [35]:
pipe_7 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('svc', SVC())])
    
pipe_7_params = {
    'tf__min_df': [2,3],
    'tf__max_df': [0.9, 0.95], 
    'tf__ngram_range': [(1,1), (1,2)],
    'svc__C' : [50,100],
    'svc__kernel': ['linear', 'rbf', 'sigmoid'],
    'svc__gamma': ['scale']
}

In [36]:
gs_7 = RandomizedSearchCV(pipe_7, pipe_7_params, cv =3, verbose =1, n_jobs= -1)

In [37]:
gs_7.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    8.4s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'tf__min_df': [2, 3], 'tf__max_df': [0.9, 0.95], 'tf__ngram_range': [(1, 1), (1, 2)], 'svc__C': [50, 100], 'svc__kernel': ['linear', 'rbf', 'sigmoid'], 'svc__gamma': ['scale']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [38]:
gs_7.best_score_

0.7113333333333334

In [61]:
gs_7.best_params_

{'tf__ngram_range': (1, 2),
 'tf__min_df': 2,
 'tf__max_df': 0.9,
 'svc__kernel': 'rbf',
 'svc__gamma': 'scale',
 'svc__C': 50}

## 8. TFIDF and AdaBoostClassifer 

In [46]:
pipe_8 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('ada', AdaBoostClassifier())])
pipe_8_params = {'tf__min_df': [2,3],
    'tf__max_df': [0.9, 0.95], 
    'tf__ngram_range': [(1,1), (1,2)],
    'ada__n_estimators': [50,100],    
    'ada__learning_rate': [.9, 1.]}

In [47]:
gs_8 = RandomizedSearchCV(pipe_8, pipe_8_params, cv =3, verbose =1, n_jobs= -1)

In [48]:
gs_8.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.6s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...m='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'tf__min_df': [2, 3], 'tf__max_df': [0.9, 0.95], 'tf__ngram_range': [(1, 1), (1, 2)], 'ada__n_estimators': [50, 100], 'ada__learning_rate': [0.9, 1.0]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [49]:
gs_8.best_score_

0.6326666666666667

## 9. TFIDF and GradientBoosting Classifer 

In [52]:
pipe_9 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('gboost', GradientBoostingClassifier())])

pipe_9_params = {'tf__min_df': [2,3],
    'tf__max_df': [0.9, 0.95], 
    'tf__ngram_range': [(1,1), (1,2)],
    'gboost__max_depth' : [2,3,4],
    'gboost__n_estimators': [100,125, 150], 
     'gboost__learning_rate': [0.08, .1, .12]   }

In [53]:
gs_9 = RandomizedSearchCV(pipe_9, pipe_9_params, cv =3, verbose =1, n_jobs= -1)

In [54]:
gs_9.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   16.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('tf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'tf__min_df': [2, 3], 'tf__max_df': [0.9, 0.95], 'tf__ngram_range': [(1, 1), (1, 2)], 'gboost__max_depth': [2, 3, 4], 'gboost__n_estimators': [100, 125, 150], 'gboost__learning_rate': [0.08, 0.1, 0.12]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [55]:
gs_9.best_score_

0.6693333333333333

## 10 TFIDF and Multinomial NB

In [69]:
pipe_10 = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb' , MultinomialNB())])

pipe_10_params = {
    'tvec__stop_words' : ['english', None],
    'tvec__max_features' : [2500, 3000, 3500],
    'tvec__min_df': [1,2],
    'tvec__max_df': [200, 300],
    'tvec__ngram_range': ([1,1], (1,2), (1,3)),
    "nb__fit_prior": [None, True]}

In [70]:
gs_10 = RandomizedSearchCV(pipe_10, pipe_10_params, cv =3, verbose =1, n_jobs= -1)

In [71]:
gs_10.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.4s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('tvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'tvec__stop_words': ['english', None], 'tvec__max_features': [2500, 3000, 3500], 'tvec__min_df': [1, 2], 'tvec__max_df': [200, 300], 'tvec__ngram_range': ([1, 1], (1, 2), (1, 3)), 'nb__fit_prior': [None, True]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [72]:
gs_10.best_score_

0.6833333333333333