## Preprocessing and Modelling

In [1]:
# Standard Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling Imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

# NLP Imports
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB



#### Read in Data

In [2]:
df = pd.read_csv('clean_reddit.csv')

In [3]:
# Check the first 5 rows
df.head()

Unnamed: 0,title,selftext,subreddit,alltexts,alltexts_word_count,alltexts_length
0,Score in S10E09,Anybody knows where to find the score of every...,1,Score in S10E09 Anybody knows where to find th...,25.0,153.0
1,Amc early access,When does early access for the episode go live...,1,Amc early access When does early access for th...,24.0,118.0
2,Why does TWD MTG cards logo features a shootin...,If you look at the new MTG TWD cards the logo ...,1,Why does TWD MTG cards logo features a shootin...,133.0,912.0
3,Is fear the walking dead worth watching?,I got into walking dead around 2012. All it to...,1,Is fear the walking dead worth watching? I got...,83.0,452.0
4,So idk if its just me hearing this but in S6 E...,lol,1,So idk if its just me hearing this but in S6 E...,34.0,156.0


In [12]:
df.shape

(2964, 6)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964 entries, 0 to 2963
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                2964 non-null   object 
 1   selftext             2964 non-null   object 
 2   subreddit            2964 non-null   int64  
 3   alltexts             2964 non-null   object 
 4   alltexts_word_count  2964 non-null   float64
 5   alltexts_length      2964 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 139.1+ KB


At this point, there are no missing values. So we will go ahead and set up X and y for modelling.

#### Set up X and y

In [14]:
# We would be using the alltext since its a combination of the title and the selftext.
X = df['alltexts']
y = df['subreddit']

In [15]:
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [16]:
X_train.shape

(2223,)

In [17]:
X_test.shape

(741,)

In [18]:
y_train.shape

(2223,)

In [19]:
y_test.shape

(741,)

In [20]:
# Set up function to calculate and display the classification metrics
# Courtesy of Eboni Lee
def class_metrics(model, X, y):
    # Generate the prediction
    preds = model.predict(X)
    # Get the confusion matrix and ravel
    tn, fp, fn, tp = confusion_matrix(y, preds).ravel()
    # Accuracy
    print(f' The Accuracy is :{round((tp + tn)/len(y), 3)}')
    # Sensitivity
    print(f' The Sensitivity is :{round(tp/(tp+fn), 3)}')
    # Specificity
    print(f' The Specificity is :{round(tn/(tn+fp), 3)}')
    # Precision
    print(f' The Precision is :{round(tp/(tp+fp), 3)}')


In [30]:
# Analyzers so that we can stick them in the pipelines
# Courtesy of stackoverflow

# PorterStemmer - CVEC
stemmer = PorterStemmer()
cvec_analyzer = CountVectorizer().build_analyzer()

def porter_cvec_words(doc):
    return (stemmer.stem(w) for w in cvec_analyzer(doc))

# PorterStemmer - TFIDF
tfidf_analyzer = TfidfVectorizer().build_analyzer()

def porter_tfidf_words(doc):
    return (stemmer.stem(w) for w in tfidf_analyzer(doc))

# WordNetLemmatizer - CVEC
lemm = WordNetLemmatizer()

def lemm_cvec_words(doc):
    return (lemm.lemmatize(w) for w in cvec_analyzer(doc))

# WordNetLemmatizer - TFIDF
def lemm_tfidf_words(doc):
    return (lemm.lemmatize(w) for w in tfidf_analyzer(doc))

## Baseline Model

In [22]:
from sklearn.dummy import DummyRegressor 
dr = DummyRegressor()
dr.fit(X_test, y_test)
dr.score(X_test, y_test)

0.0

## Logistic Regression Model

#### 1. Logistic Regression with CountVectorizer

In [27]:
# Set up the pipeline
c_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

# Pipe parameters
c_pipe_params = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1, 1), (1, 2)],
    'cvec__analyzer': ['word', porter_cvec_words, lemm_cvec_words],
    'lr__C': [0.1, 1, 1e9],
    'lr__penalty': ['l1', 'l2']
}

# Instantiate a GridSearchCV
c_gs = GridSearchCV(c_pipe,
                   c_pipe_params,
                   cv=5,
                   n_jobs = -1)

# Fit
c_gs.fit(X_train, y_train)

# Show metrics and best parameters
print(c_gs.best_params_)
class_metrics(c_gs, X_test, y_test)

{'cvec__analyzer': 'word', 'cvec__max_features': 500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'lr__C': 0.1, 'lr__penalty': 'l2'}
 The Accuracy is :0.942
 The Sensitivity is :0.967
 The Specificity is :0.915
 The Precision is :0.926


In [28]:
class_metrics(c_gs, X_train, y_train)

 The Accuracy is :0.975
 The Sensitivity is :0.991
 The Specificity is :0.956
 The Precision is :0.962


These results seem to be very good, the four classification metrics were all greater than 0.92 which is strong. The Sensitivity and Specificity are both similar so we are correctly classifying both subreddits at a similar accuracy. The difference in the training and testing score is 0.033, so there isn't significant overfitting. 

#### 2. Logistic Regression with TfidfVectorizer

In [31]:
# Using the same steps as above
lr_tf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer=porter_tfidf_words)),
    ('lr', LogisticRegression())
])

# Pipe parameters
lr_tf_pipe_params = {
    'tfidf__max_features':[100, 500],
    'tfidf__stop_words':[None, 'english'],
    'tfidf__ngram_range':[(1,1), (1,2)],
    'lr__C':[0.1, 1, 1e9]
}

# Instantiate GridSearch for the tfidf
lr_tf_gs = GridSearchCV(lr_tf_pipe,
                        lr_tf_pipe_params,
                        cv=5,
                        n_jobs=-1)

# Fit the GridSearch
lr_tf_gs.fit(X_train, y_train);

# Display the metrics and best parameters
print(lr_tf_gs.best_params_)
class_metrics(lr_tf_gs, X_test, y_test)

{'lr__C': 1, 'tfidf__max_features': 500, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': None}
 The Accuracy is :0.946
 The Sensitivity is :0.949
 The Specificity is :0.943
 The Precision is :0.949


The LogisticRegression using TfidfVectorizer gave very similar results with the CountVectorizer using same model. And just like the CountVectorizer it did overfit slightly. However the only similarities in the parameters were the ngram_range and the max_features which were (1,1) and 500 respectively.

## Naive Bayes Model

#### 1. Multinomial Naive Bayes with CountVectorizer

In [34]:
# Set up the pipeline
mc_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB())
])

# Pipe parameters
mc_pipe_params = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1, 1), (1, 2)],
    'cvec__analyzer': ['word', porter_cvec_words, lemm_cvec_words],
    'mnb__alpha':[0.1, 0.2, 0.5, 1]
}

# Instantiate a GridSearchCV
mc_gs = GridSearchCV(mc_pipe,
                   mc_pipe_params,
                   cv=5,
                   n_jobs = -1)

# Fit
mc_gs.fit(X_train, y_train)

# Show metrics and best parameters
print(mc_gs.best_params_)
class_metrics(mc_gs, X_test, y_test)

{'cvec__analyzer': 'word', 'cvec__max_features': 500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'mnb__alpha': 0.1}
 The Accuracy is :0.957
 The Sensitivity is :0.974
 The Specificity is :0.938
 The Precision is :0.945


In [35]:
class_metrics(mc_gs, X_train, y_train)

 The Accuracy is :0.969
 The Sensitivity is :0.981
 The Specificity is :0.955
 The Precision is :0.96


#### 2. Multinomial Bayes Model with TfidfVectorizer

In [36]:
# Using the same steps as above
mnb_tf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer=porter_tfidf_words)),
    ('mnb', MultinomialNB())
])

# Pipe parameters
mnb_tf_pipe_params = {
    'tfidf__max_features':[100, 500],
    'tfidf__stop_words':[None, 'english'],
    'tfidf__ngram_range':[(1,1), (1,2)],
    'mnb__alpha':[0.1, 0.2, 0.5, 1]
}

# Instantiate GridSearch for the tfidf
mnb_tf_gs = GridSearchCV(mnb_tf_pipe,
                        mnb_tf_pipe_params,
                        cv=5,
                        n_jobs=-1)

# Fit the GridSearch
mnb_tf_gs.fit(X_train, y_train);

# Display the metrics and best parameters
print(mnb_tf_gs.best_params_)
class_metrics(mnb_tf_gs, X_test, y_test)

{'mnb__alpha': 0.1, 'tfidf__max_features': 500, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': None}
 The Accuracy is :0.962
 The Sensitivity is :0.974
 The Specificity is :0.949
 The Precision is :0.955


In [37]:
class_metrics(mnb_tf_gs, X_train, y_train)

 The Accuracy is :0.97
 The Sensitivity is :0.981
 The Specificity is :0.958
 The Precision is :0.963
