# Separate branch where Title and Post Text are combined for analysis and classification

In [1]:
# Data cleaning/handling
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# NLP specific libraries
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import stop_words
import re
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Modeling Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, auc, roc_curve

### Bring in Dataset of Posts scraped from Reddit

In [2]:
df = pd.read_csv('./reddit_posts.csv')

In [3]:
df['subreddit'].value_counts()

geocaching      1000
IWantToLearn    1000
Name: subreddit, dtype: int64

## Text Cleaning Tools

In [4]:
# using regular expressions to remove punctuation in a function
def no_punct(string):
    return re.sub("[.,😯?😊!’\";^+`:*'()-@”“=>_$&<~%|{}\[\]]", " ", string)

In [5]:
# Create a function to clean any column or to feed into a word vectorizer as an analyzer parameter
def clean_func(column):
    
    #remove puntuation with punctuation removal function
    column = no_punct(column)
    
    #lowercase
    column = column.lower()
    
    return column

In [6]:
# Function that utilizes lemmatizing and a general Regex to remove punctuation

def preprocess(text):
    # instantiate lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # lowercase
    text = text.lower()
    
    # words only Regex, removes punctuation
    text = re.sub("[^A-Za-z]", " ", text)
    
    # lemmatize
    text = lemmatizer.lemmatize(text)
    
    return text

## Data cleaning and Exploratory Data Analysis

In [7]:
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

In [8]:
df.shape

(1995, 3)

In [9]:
df.describe().T

Unnamed: 0,count,unique,top,freq
text,1364,1364,I've always procrastinated. I think this is be...,1
title,1995,1989,IWTL how to sleep on my back,2
subreddit,1995,2,geocaching,999


In [10]:
df.groupby('subreddit').describe().T

Unnamed: 0,subreddit,IWantToLearn,geocaching
text,count,890,474
text,unique,890,474
text,top,I've always procrastinated. I think this is be...,There is a missing Geocache near my house (GC...
text,freq,1,1
title,count,996,999
title,unique,990,999
title,top,I want to learn to speak intellectually.,Favorite Geocaching social media accounts to f...
title,freq,2,1


#### Examining duplicate Titles

In [11]:
df['title'].value_counts().sort_values(ascending=False).head(7)

IWTL how to sleep on my back                                    2
IWTL how to meditate                                            2
I want to learn how to sing                                     2
IWTL how to play the piano                                      2
I want to learn to speak intellectually.                        2
IWTL how to improve my logical thinking and problem solving.    2
IWTL about the electric bikes/cars and how they work?           1
Name: title, dtype: int64

In [12]:
df.drop(1979, axis='index', inplace=True)
df.drop(1049, axis='index', inplace=True)

df.reset_index(drop=True, inplace=True)

In [13]:
# Check to see that the two duplicates were removed.
df.shape

(1993, 3)

With the removal of duplicates, the new dataframe is 1993 posts long.

In [14]:
# Create boolean masks to examine geocaching and IWantToLearn subreddits separately.
geocaching = df['subreddit'] == 'geocaching'
iwtl = df['subreddit'] == 'IWantToLearn'

#### Filling null values

In [15]:
# Fill empty text posts with 'NA'
df.fillna('NA', inplace=True)

#### Since I want to predict a binary variable - subreddit `0` for geocaching and `1` for IWantToLearn.

In [16]:
y = df['subreddit'].map({'geocaching': 0, 'IWantToLearn': 1})

# Natural Language Processing

### Master stopword list

In [17]:
# Create a master stopword list from both stopword lists
custom_stopwords = list(set(stopwords.words('english') + list(stop_words.ENGLISH_STOP_WORDS)))

# Add 'na' because it indicates an empty text post
custom_stopwords.extend(['na'])

In [18]:
# Create a stopword list that only takes out 'na'
no_na = ['na']

# Create Corpus

In [19]:
corpus = df['title'] + ' ' + df['text']

## Train-Test Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(corpus, y, random_state=42)

## TF-IDF

In [21]:
pipeline = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('model', MultinomialNB())
])

In [22]:
params = {
    'tvec__stop_words': ['english', custom_stopwords],
    'tvec__analyzer': ['word', preprocess],
    'tvec__max_df': [250, 500, 750],
    'tvec__min_df': [1, 2, 3],
    'tvec__ngram_range': [(1, 1), (1, 3)]
}

In [23]:
gs = GridSearchCV(pipeline, param_grid=params, cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...e,
        vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tvec__stop_words': ['english', ['at', 'everywhere', 'whereby', 'latter', 'otherwise', 'anything', "hasn't", 'nowhere', 'detail', 'sixty', 'who', 'not', 'which', 'mightn', 'noone', 'through', 'hers', 'full', 'whereupon', 'he', "didn't", 'un', 'former', 'won', 'nobody', 'whenever', 'per',..., 'tvec__max_df': [250, 500, 750], 'tvec__min_df': [1, 2, 3], 'tvec__ngram_range': [(1, 1), (1, 3)]},
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [24]:
gs.best_score_

0.9812583668005355

In [25]:
gs.best_params_

{'tvec__analyzer': 'word',
 'tvec__max_df': 500,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

In [28]:
best_tvec = TfidfVectorizer(analyzer='word', max_df=500, ngram_range=(1,1), min_df= 1, stop_words='english')

In [29]:
# Fit and transform the training set
best_tv = best_tvec.fit_transform(X_train)

# Make into dataframe for modeling
X_train_t = pd.DataFrame(best_tv.todense(), columns=best_tvec.vocabulary_)

In [30]:
# Transform the Testing Text
best_tv2 = best_tvec.transform(X_test)

# Make into dataframe for modeling
X_test_t = pd.DataFrame(best_tv2.todense(), columns=best_tvec.vocabulary_)

## Count Vectorizer

In [32]:
pipeline = Pipeline([
    ('cvec', CountVectorizer()),
    ('model', MultinomialNB())
])

In [33]:
params = {
    'cvec__stop_words': ['english', custom_stopwords],
    'cvec__analyzer': ['word', preprocess],
    'cvec__max_df': [250, 500, 750],
    'cvec__min_df': [1, 2, 3],
    'cvec__ngram_range': [(1, 1), (1, 3)]
}

In [34]:
gs2 = GridSearchCV(pipeline, param_grid=params, cv=5)
gs2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor...zer=None, vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cvec__stop_words': ['english', ['at', 'everywhere', 'whereby', 'latter', 'otherwise', 'anything', "hasn't", 'nowhere', 'detail', 'sixty', 'who', 'not', 'which', 'mightn', 'noone', 'through', 'hers', 'full', 'whereupon', 'he', "didn't", 'un', 'former', 'won', 'nobody', 'whenever', 'per',..., 'cvec__max_df': [250, 500, 750], 'cvec__min_df': [1, 2, 3], 'cvec__ngram_range': [(1, 1), (1, 3)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, 

In [35]:
gs2.best_score_

0.9886211512717537

In [36]:
gs2.best_params_

{'cvec__analyzer': 'word',
 'cvec__max_df': 500,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [37]:
best_cvec = CountVectorizer(analyzer='word', max_df=500, ngram_range=(1,1), min_df= 3, stop_words='english')

In [38]:
# Fit and transform the training set
best_c = best_cvec.fit_transform(X_train)

# Make into dataframe for modeling
X_train_c = pd.DataFrame(best_c.todense(), columns=best_cvec.vocabulary_)

In [39]:
# Transform the Testing Text
best_c2 = best_cvec.transform(X_test)

# Make into dataframe for modeling
X_test_c = pd.DataFrame(best_c2.todense(), columns=best_cvec.vocabulary_)

# Model Pipeline

### Dictionary of Hyperparameters

In [51]:
estimators = {

    # Random Forest
    'rf': {
        'estimator': RandomForestClassifier(),
        'params': {
            'rf__random_state': [42],
            'rf__n_estimators': [9, 10],
            #'rf__max_features': [2500, 5000],
            #'rf__criterion': ['gini', 'entropy']
        }
    },

    # Logistic Regression
    'lr': {
        'estimator': LogisticRegression(),
        'params': {
            'lr__random_state': [42],
            #'lr__C': [0.8, 0.9, 1.0],
        }
    },

    # Multinomial Naive-Bayes
    'multinb': {
        'estimator': MultinomialNB(),
        'params': {
            #'multinb__alpha': [0.01, 0.1, 0.5]
        }
    }
}

In [52]:
# Make an empty dataframe of fitted models 
fitted_cvec_models = {}

# Loop through models in pipelines, tuning each one with its parameter dictionary
for step, config in estimators.items():
    pipe = Pipeline(
        steps = [
            (step, config['estimator'])
        ]
    )
    # Create GridSearch object for each model
    model = GridSearchCV(pipe, param_grid=config['params'], cv=5, n_jobs=-1, verbose=1)
    print('Running GrideSearch for Estimator ', step)
    
    # Fit each model and store it in the fitted models dataframe along with score
    fitted_cvec_models[step] = model.fit(X_train_c, y_train)
    
    # Indicators on progress
    print('Done fitting: ', step)
    print('--------------------')

print('GridSearch complete')

Running GrideSearch for Estimator  rf
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.1s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.1s finished


Done fitting:  rf
--------------------
Running GrideSearch for Estimator  lr
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.4s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


Done fitting:  lr
--------------------
Running GrideSearch for Estimator  multinb
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.6s remaining:    2.4s


Done fitting:  multinb
--------------------
GridSearch complete


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished


In [53]:
for model in fitted_cvec_models:
    print(model)
    print('Training data score: ', fitted_cvec_models[model].best_score_)
    best_model = fitted_cvec_models[model].best_estimator_
    print('Testing data score:  ', best_model.score(X_test_c, y_test))
    print('Best Hyperparameters:', fitted_cvec_models[model].best_params_)
    print('')

rf
Training data score:  0.9163319946452476
Testing data score:   0.9158316633266533
Best Hyperparameters: {'rf__n_estimators': 10, 'rf__random_state': 42}

lr
Training data score:  0.9404283801874164
Testing data score:   0.935871743486974
Best Hyperparameters: {'lr__random_state': 42}

multinb
Training data score:  0.9571619812583668
Testing data score:   0.9739478957915831
Best Hyperparameters: {}



### GridSearch on TF-IDF corpus

In [54]:
# Make an empty dataframe of fitted models 
fitted_tvec_models = {}

# Loop through models in pipelines, tuning each one with its parameter dictionary
for step, config in estimators.items():
    pipe = Pipeline(
        steps = [
            (step, config['estimator'])
        ]
    )
    # Create GridSearch object for each model
    model = GridSearchCV(pipe, param_grid=config['params'], cv=5, n_jobs=-1, verbose=1)
    print('Running GrideSearch for Estimator ', step)
    
    # Fit each model and store it in the fitted models dataframe along with score
    fitted_tvec_models[step] = model.fit(X_train_t, y_train)
    
    # Indicators on progress
    print('Done fitting: ', step)
    print('--------------------')

print('GridSearch complete')

Running GrideSearch for Estimator  rf
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.5s finished


Done fitting:  rf
--------------------
Running GrideSearch for Estimator  lr
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.6s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.1s finished


Done fitting:  lr
--------------------
Running GrideSearch for Estimator  multinb
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.5s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


Done fitting:  multinb
--------------------
GridSearch complete


In [55]:
for model in fitted_tvec_models:
    print(model)
    print('Training data score: ', fitted_tvec_models[model].best_score_)
    best_model = fitted_tvec_models[model].best_estimator_
    print('Testing data score:  ', best_model.score(X_test_t, y_test))
    print('Best Hyperparameters:', fitted_tvec_models[model].best_params_)
    print('')

rf
Training data score:  0.9103078982597055
Testing data score:   0.9178356713426854
Best Hyperparameters: {'rf__n_estimators': 10, 'rf__random_state': 42}

lr
Training data score:  0.9471218206157965
Testing data score:   0.9378757515030061
Best Hyperparameters: {'lr__random_state': 42}

multinb
Training data score:  0.963186077643909
Testing data score:   0.9719438877755511
Best Hyperparameters: {}

