# Project 3. Web APIs and NLP, Reddit Modelling

This notebook takes the data which was cleaned and explored in the previous notebook, then models it in various ways.

In [1]:
# Import Several required libraries
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.metrics import confusion_matrix, recall_score

In [2]:
# Read the cleaned Reddit posts (the data prepared in the previous notebook).
clean_csv_path = './data/reddit_clean.csv'
data = pd.read_csv(clean_csv_path)

In [3]:
# Encode the target as 1 for posts from AskCulinary and 0 for every other post.
data['is_ac'] = data['subreddit'].eq('AskCulinary').astype(int)

In [4]:
# Show the first row only, as a reminder of the columns available.
data.head(n=1)

Unnamed: 0,author,link_flair_text,num_comments,selftext,subreddit,title,created_utc,title_word_count,text_word_count,words,is_ac
0,ESK777,Equipment Question,30,I just bought a Le Creuset enameled cast iron ...,AskCulinary,Le Creuset enameled cast iron for bread baking,1606798265,8,68,Le Creuset enameled cast iron for bread baking...,1


## Transform Data

Create a stemmed features column, a lemmatized features column, and the train/test split. The text will be further transformed into word vectors, but it is important that the vectors are fit on the training set only, to prevent data leakage. In the EDA notebook this step wasn't taken, so the numbers there reflect the entire dataset.

In [6]:
# Create features with the porter stemmer library
# Each document in `words` is tokenized, every token is stemmed, and the stems
# are joined back into one space-separated string per row.
# The column is built in a single pass and assigned once. Writing through
# `data['stem'].iloc[i] = ...` is chained assignment: it triggers
# SettingWithCopyWarning and does not update `data` at all when pandas
# copy-on-write is enabled.
# This one takes a few seconds to run
stm = PorterStemmer()
data['stem'] = [
    ' '.join(stm.stem(word) for word in word_tokenize(doc))
    for doc in data['words']
]

In [7]:
# Create the lemmatized features column
# Each document in `words` is tokenized, every token is lemmatized, and the
# lemmas are joined back into one space-separated string per row.
# The column is built in a single pass and assigned once. Writing through
# `data['lem'].iloc[i] = ...` is chained assignment: it triggers
# SettingWithCopyWarning and does not update `data` at all when pandas
# copy-on-write is enabled.
lem = WordNetLemmatizer()
data['lem'] = [
    ' '.join(lem.lemmatize(word) for word in word_tokenize(doc))
    for doc in data['words']
]

In [8]:
# Separate the binary target from the remaining columns, then hold out 20% of
# the rows as a test set (fixed seed so the split is repeatable).
y = data['is_ac']
X = data.drop(columns='is_ac')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=101,
)

## Baseline Accuracy Score
This score is the simplest possible case where the prediction is simply the majority class every time. Any model should outperform this score.

In [9]:
# Baseline accuracy = share of training rows in the most common class, i.e. the
# accuracy obtained by always predicting the majority class.
# .max() is used instead of indexing label 1 so the figure stays correct even
# when class 0 happens to be the majority.
baseline_accuracy = y_train.value_counts(normalize=True).max()
print(f'The baseline accuracy score is {baseline_accuracy}')

The baseline accuracy score is 0.5055882950619792


## Create Models
The models I create are logistic regression, KNN, random forest, bagging, and support vector classifiers, each with a few different transformers applied to the text data. A summary of accuracy scores will be produced later in this notebook.

### Logistic Regression

In [10]:
# Names of the text columns to model, so every grid search can loop over them.
word_sets = [
    'words',
    'lem',
    'stem',
]

In [11]:
# This pipeline is for: Logistic Regression
# Pipeline adapted from NLP lab review. Thanks for pointing this out Gabe!
# Steps: token counts -> optional IDF weighting -> logistic regression.
log_pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('lr', LogisticRegression(solver='liblinear')),
])

# Vectorizer / tf-idf settings explored by the grid search.
log_params = dict(
    cv__max_features=[100, 500],
    cv__ngram_range=[(1, 1), (1, 2)],
    cv__stop_words=['english', None],
    # use_idf=True gives TF-IDF weights; use_idf=False skips the IDF re-weighting
    tf__use_idf=[True, False],
)

In [12]:
# Run one grid search per token column and keep every fitted search in a dict
# keyed by the column it was trained on.
log_models = {}
for col in word_sets:
    grid = GridSearchCV(
        estimator=log_pipe,
        param_grid=log_params,
        cv=5,
        n_jobs=6,
    )
    grid.fit(X_train[col], y_train)

    log_models[col] = grid
    print(f'Score for {col}:   {grid.best_score_}')

Score for words:   0.7146919235689818
Score for lem:   0.7248501919029343
Score for stem:   0.7205837563451777


### K Nearest Neighbors

In [13]:
# This pipeline is for: K Nearest Neighbors Classifier
# Steps: token counts -> optional IDF weighting -> k-nearest-neighbours vote.
knn_pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('knn', KNeighborsClassifier()),
])

# Vectorizer / tf-idf settings plus the neighbourhood sizes to try.
knn_params = dict(
    cv__max_features=[100, 500],
    cv__ngram_range=[(1, 1), (1, 2)],
    cv__stop_words=['english', None],
    # use_idf=True gives TF-IDF weights; use_idf=False skips the IDF re-weighting
    tf__use_idf=[True, False],
    knn__n_neighbors=[15, 25, 30],
)

In [14]:
# Run one grid search per token column and keep every fitted search in a dict
# keyed by the column it was trained on.
knn_models = {}
for col in word_sets:
    grid = GridSearchCV(
        estimator=knn_pipe,
        param_grid=knn_params,
        cv=5,
        n_jobs=6,
    )
    grid.fit(X_train[col], y_train)

    knn_models[col] = grid
    print(f'Score for {col}:   {grid.best_score_}')

Score for words:   0.6852302835211093
Score for lem:   0.6884806652635055
Score for stem:   0.6890916594445132


### Random Forest

In [15]:
# This pipeline is for: Random Forest Classifier
# Steps: token counts -> optional IDF weighting -> random forest.
# random_state is pinned so that re-running the notebook grows the same trees
# and reports the same grid-search scores; an unseeded forest gives slightly
# different numbers on every run.
rf_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('rf', RandomForestClassifier(random_state=101))
])

# Vectorizer / tf-idf settings plus the forest depth and size to try.
rf_params = {
    'cv__max_features': [100, 500],
    'cv__ngram_range' : [(1,1), (1,2)],
    'cv__stop_words'  : ['english', None],
    'tf__use_idf': [True, False], # True gives TF-IDF weights; False skips the IDF re-weighting
    'rf__max_depth' : [None, 5],
    'rf__n_estimators' : [50, 100]
}

In [16]:
# Run one grid search per token column and keep every fitted search in a dict
# keyed by the column it was trained on.
rf_models = {}
for col in word_sets:
    grid = GridSearchCV(
        estimator=rf_pipe,
        param_grid=rf_params,
        cv=5,
        n_jobs=6,
    )
    grid.fit(X_train[col], y_train)

    rf_models[col] = grid
    print(f'Score for {col}:   {grid.best_score_}')

Score for words:   0.7096087656308034
Score for lem:   0.7096102100614915
Score for stem:   0.7085939498988898


### Bagging Classifier

In [17]:
# This pipeline is for: Bagging Classifier
# Steps: token counts -> optional IDF weighting -> bagged ensemble.
# random_state is pinned so the bootstrap samples, and therefore the reported
# grid-search scores, are the same on every run of the notebook.
bag_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('bag',  BaggingClassifier(random_state=101))
])

# scikit-learn 1.2 renamed BaggingClassifier's `base_estimator` argument to
# `estimator`, and 1.4 removed the old name. Pick whichever name the installed
# version exposes so the grid search works on both old and new releases.
bag_estimator_param = (
    'bag__estimator'
    if 'estimator' in BaggingClassifier().get_params(deep=False)
    else 'bag__base_estimator'
)

# Vectorizer / tf-idf settings plus the model being bagged: None keeps the
# classifier's default base model, the alternative bags logistic regressions.
bag_params = {
    'cv__max_features': [100, 500],
    'cv__ngram_range' : [(1,1), (1,2)],
    'cv__stop_words'  : ['english', None],
    'tf__use_idf': [True, False], # True gives TF-IDF weights; False skips the IDF re-weighting
    bag_estimator_param : [None, LogisticRegression(max_iter=2000)],
    'bag__n_estimators' : [20]
}

In [18]:
# Run one grid search per token column and keep every fitted search in a dict
# keyed by the column it was trained on.
bag_models = {}
for col in word_sets:
    grid = GridSearchCV(
        estimator=bag_pipe,
        param_grid=bag_params,
        cv=5,
        n_jobs=6,
    )
    grid.fit(X_train[col], y_train)

    bag_models[col] = grid
    print(f'Score for {col}:   {grid.best_score_}')

Score for words:   0.7167229994634972
Score for lem:   0.7207868020304569
Score for stem:   0.7185504106310099


### Support Vector Classifier

In [19]:
# This pipeline is for: SVC (support vector classifier with selectable kernel)
# Steps: token counts -> optional IDF weighting -> support vector classifier.
svc_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('svc',  SVC())
])

# The vectorizer grid is kept small here because the kernel and C values
# already multiply the number of fits.
svc_params = {
    'cv__max_features': [500],
    'cv__ngram_range' : [(1,2)],
    'cv__stop_words'  : ['english', None],
    'tf__use_idf': [True, False], # True gives TF-IDF weights; False skips the IDF re-weighting
    'svc__C' : np.logspace(-3,1,3),
    # SVC names the polynomial kernel 'poly'; 'polynomial' is rejected at fit
    # time, so that kernel was never actually evaluated by the grid search.
    'svc__kernel' : ['linear','rbf','poly','sigmoid']
}

In [20]:
# Run one grid search per token column and keep every fitted search in a dict
# keyed by the column it was trained on.
svc_models = {}
for col in word_sets:
    grid = GridSearchCV(
        estimator=svc_pipe,
        param_grid=svc_params,
        cv=5,
        n_jobs=6,
    )
    grid.fit(X_train[col], y_train)

    svc_models[col] = grid
    print(f'Score for {col}:   {grid.best_score_}')

Score for words:   0.7075791341669763
Score for lem:   0.7104244562750195
Score for stem:   0.7130690025174363


## Examine Specificity and Sensitivity

Here I will check the sensitivity and specificity.

In [21]:
# Predict the held-out test set with the best performing model: the
# logistic-regression grid search fitted on the lemmatized text.
best_tokens = 'lem'
prediction = log_models[best_tokens].predict(X_test[best_tokens])

In [22]:
# Build the confusion matrix once, unpack its four cells for the metrics
# computed below, and print the matrix itself.
conf_mat = confusion_matrix(y_test, prediction)
tn, fp, fn, tp = conf_mat.ravel()

print(conf_mat)

[[405 223]
 [133 470]]


In [23]:
# Sensitivity (recall of the positive class): share of actual AskCulinary
# posts that the model labelled as AskCulinary.
recall_score(y_true=y_test, y_pred=prediction)

0.7794361525704809

In [24]:
# Specificity: share of actual negatives (tn + fp) that were predicted negative.
actual_negatives = tn + fp
specificity = tn / actual_negatives
specificity

0.6449044585987261

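I see that the false positives (223) outnumber the false negatives (133), so the model is more sensitive (0.78) than specific (0.64): it leans towards predicting AskCulinary. An imbalance like this might be a sign that the data has some sort of additional relationship worth investigating.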