# Part 3: Modelling, Evaluation and Conclusion
***

### Import packages

In [1]:
import pandas as pd
import regex as re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time

import collections
from collections import Counter

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

### Combine Datasets

In [2]:
# Load dataframes
anxiety_cleaned = pd.read_csv('./data/anxiety_cleaned.csv')
depression_cleaned = pd.read_csv('./data/depression_cleaned.csv')
both = pd.read_csv('./data/raw_cleaned.csv') # combined both raw data collected and cleaned data at each stage for both subreddits

In [3]:
# View top rows of separate cleaned datasets
display(anxiety_cleaned.head())
display(depression_cleaned.head())

Unnamed: 0,subreddit,post,post_cleaned,post_tokenised,post_no_stop,post_lemmatised,post_string
0,Anxiety,Coping We’re moving out of state and away from...,Coping We’re moving out of state and away from...,"['coping', 'we', 're', 'moving', 'out', 'of', ...","['coping', 'moving', 'state', 'away', 'everyon...",coping moving state away everyone know sunday ...,coping moving state away everyone know sunday ...
1,Anxiety,"Starting a new job anxiety As the title says, ...",Starting a new job anxiety As the title says I...,"['starting', 'a', 'new', 'job', 'anxiety', 'as...","['starting', 'new', 'job', 'anxiety', 'title',...",starting new job anxiety title say starting ne...,starting new job anxiety title say starting ne...
2,Anxiety,anxiety affecting studies Lately I've been fee...,anxiety affecting studies Lately Ive been feel...,"['anxiety', 'affecting', 'studies', 'lately', ...","['anxiety', 'affecting', 'studies', 'lately', ...",anxiety affecting study lately feeling anxious...,anxiety affecting study lately feeling anxious...
3,Anxiety,This is my recovery story since falling for th...,This is my recovery story since falling for th...,"['this', 'is', 'my', 'recovery', 'story', 'sin...","['recovery', 'story', 'since', 'falling', 'pin...",recovery story since falling pin code scam cam...,recovery story since falling pin code scam cam...
4,Anxiety,After years wearing braces my teeth still look...,After years wearing braces my teeth still look...,"['after', 'years', 'wearing', 'braces', 'my', ...","['years', 'wearing', 'braces', 'teeth', 'still...",year wearing brace teeth still look fucked dup...,year wearing brace teeth still look fucked dup...


Unnamed: 0,subreddit,post,post_cleaned,post_tokenised,post_no_stop,post_lemmatised,post_string
0,depression,Lack of cooperation from people: being left ou...,Lack of cooperation from people being left out...,"['lack', 'of', 'cooperation', 'from', 'people'...","['lack', 'cooperation', 'people', 'left', 'peo...",lack cooperation people left people want get k...,lack cooperation people left people want get k...
1,depression,"I hate to sleep I can’t sleep, and I haven’t b...",I hate to sleep I can’t sleep and I haven’t be...,"['i', 'hate', 'to', 'sleep', 'i', 'can', 't', ...","['hate', 'sleep', 'sleep', 'sleeping', 'long',...",hate sleep sleep sleeping long time night trie...,hate sleep sleep sleeping long time night trie...
2,depression,Just another day folks! Let’s fucking wrap it ...,Just another day folks Let’s fucking wrap it u...,"['just', 'another', 'day', 'folks', 'let', 's'...","['another', 'day', 'folks', 'let', 'fucking', ...",another day folk let fucking wrap hour fuck,another day folk let fucking wrap hour fuck
3,depression,My house is a sad place It’s been made for bus...,My house is a sad place It’s been made for bus...,"['my', 'house', 'is', 'a', 'sad', 'place', 'it...","['house', 'sad', 'place', 'made', 'business', ...",house sad place made business efficiency leavi...,house sad place made business efficiency leavi...
4,depression,I just want to die I just hate my life. I need...,I just want to die I just hate my life I need ...,"['i', 'just', 'want', 'to', 'die', 'i', 'just'...","['want', 'die', 'hate', 'life', 'need', 'numb'...",want die hate life need numb act constantly ke...,want die hate life need numb act constantly ke...


In [4]:
# Merge the 'anxiety_cleaned' and 'depression_cleaned' dataset
combined_cleaned = pd.concat([anxiety_cleaned, depression_cleaned])

In [5]:
# Check the number of rows and columns in the cleaned dataset
print("\"combined_cleaned\" dataset:")
print(f"(Rows, Column): {combined_cleaned.shape}")
print("")
# Check the number of rows and columns in the raw dataset
print("\"both\" dataset:")
print(f"(Rows, Column): {both.shape}")

"combined_cleaned" dataset:
(Rows, Column): (5337, 7)

"both" dataset:
(Rows, Column): (5337, 6)


The total number of rows is 5337. As the newly merged dataframe with cleaned datasets ('combined_cleaned') contain same total number of rows as the merged dataframe with raw datasets ('raw_cleaned'), this means that there has been no loss of data from the cleaning and pre-processing phases.

In [6]:
# View sample rows of the merged cleaned dataset
display(combined_cleaned.head())
display(combined_cleaned.tail())

Unnamed: 0,subreddit,post,post_cleaned,post_tokenised,post_no_stop,post_lemmatised,post_string
0,Anxiety,Coping We’re moving out of state and away from...,Coping We’re moving out of state and away from...,"['coping', 'we', 're', 'moving', 'out', 'of', ...","['coping', 'moving', 'state', 'away', 'everyon...",coping moving state away everyone know sunday ...,coping moving state away everyone know sunday ...
1,Anxiety,"Starting a new job anxiety As the title says, ...",Starting a new job anxiety As the title says I...,"['starting', 'a', 'new', 'job', 'anxiety', 'as...","['starting', 'new', 'job', 'anxiety', 'title',...",starting new job anxiety title say starting ne...,starting new job anxiety title say starting ne...
2,Anxiety,anxiety affecting studies Lately I've been fee...,anxiety affecting studies Lately Ive been feel...,"['anxiety', 'affecting', 'studies', 'lately', ...","['anxiety', 'affecting', 'studies', 'lately', ...",anxiety affecting study lately feeling anxious...,anxiety affecting study lately feeling anxious...
3,Anxiety,This is my recovery story since falling for th...,This is my recovery story since falling for th...,"['this', 'is', 'my', 'recovery', 'story', 'sin...","['recovery', 'story', 'since', 'falling', 'pin...",recovery story since falling pin code scam cam...,recovery story since falling pin code scam cam...
4,Anxiety,After years wearing braces my teeth still look...,After years wearing braces my teeth still look...,"['after', 'years', 'wearing', 'braces', 'my', ...","['years', 'wearing', 'braces', 'teeth', 'still...",year wearing brace teeth still look fucked dup...,year wearing brace teeth still look fucked dup...


Unnamed: 0,subreddit,post,post_cleaned,post_tokenised,post_no_stop,post_lemmatised,post_string
2469,depression,"20,M. My “best years” are being wasted and I b...",M My “best years” are being wasted and I blame...,"['m', 'my', 'best', 'years', 'are', 'being', '...","['best', 'years', 'wasted', 'blame', 'depressi...",best year wasted blame depression time see cha...,best year wasted blame depression time see cha...
2470,depression,"I want to die I just want to die, I dont even ...",I want to die I just want to die I dont even w...,"['i', 'want', 'to', 'die', 'i', 'just', 'want'...","['want', 'die', 'want', 'die', 'dont', 'want',...",want die want die dont want talk anymore want ...,want die want die dont want talk anymore want ...
2471,depression,Can someone please just reassure me that it'll...,Can someone please just reassure me that itll ...,"['can', 'someone', 'please', 'just', 'reassure...","['someone', 'please', 'reassure', 'itll', 'ok'...",someone please reassure itll ok british girl g...,someone please reassure itll ok british girl g...
2472,depression,Tired I remember what it feels like to be happ...,Tired I remember what it feels like to be happ...,"['tired', 'i', 'remember', 'what', 'it', 'feel...","['tired', 'remember', 'feels', 'happy', 'actua...",tired remember feel happy actually look forwar...,tired remember feel happy actually look forwar...
2473,depression,Corona has ruined everything I’m almost positi...,Corona has ruined everything I’m almost positi...,"['corona', 'has', 'ruined', 'everything', 'i',...","['corona', 'ruined', 'everything', 'almost', '...",corona ruined everything almost positive lost ...,corona ruined everything almost positive lost ...


### Convert Subreddit Labels to Binary

In [7]:
# Assigns number value to subreddit name (0 for anxiety, 1 for depression)
combined_cleaned['subreddit'] = combined_cleaned['subreddit'].map({'Anxiety': 0, 'depression': 1})

In [8]:
display(combined_cleaned.head())
display(combined_cleaned.tail())

Unnamed: 0,subreddit,post,post_cleaned,post_tokenised,post_no_stop,post_lemmatised,post_string
0,0,Coping We’re moving out of state and away from...,Coping We’re moving out of state and away from...,"['coping', 'we', 're', 'moving', 'out', 'of', ...","['coping', 'moving', 'state', 'away', 'everyon...",coping moving state away everyone know sunday ...,coping moving state away everyone know sunday ...
1,0,"Starting a new job anxiety As the title says, ...",Starting a new job anxiety As the title says I...,"['starting', 'a', 'new', 'job', 'anxiety', 'as...","['starting', 'new', 'job', 'anxiety', 'title',...",starting new job anxiety title say starting ne...,starting new job anxiety title say starting ne...
2,0,anxiety affecting studies Lately I've been fee...,anxiety affecting studies Lately Ive been feel...,"['anxiety', 'affecting', 'studies', 'lately', ...","['anxiety', 'affecting', 'studies', 'lately', ...",anxiety affecting study lately feeling anxious...,anxiety affecting study lately feeling anxious...
3,0,This is my recovery story since falling for th...,This is my recovery story since falling for th...,"['this', 'is', 'my', 'recovery', 'story', 'sin...","['recovery', 'story', 'since', 'falling', 'pin...",recovery story since falling pin code scam cam...,recovery story since falling pin code scam cam...
4,0,After years wearing braces my teeth still look...,After years wearing braces my teeth still look...,"['after', 'years', 'wearing', 'braces', 'my', ...","['years', 'wearing', 'braces', 'teeth', 'still...",year wearing brace teeth still look fucked dup...,year wearing brace teeth still look fucked dup...


Unnamed: 0,subreddit,post,post_cleaned,post_tokenised,post_no_stop,post_lemmatised,post_string
2469,1,"20,M. My “best years” are being wasted and I b...",M My “best years” are being wasted and I blame...,"['m', 'my', 'best', 'years', 'are', 'being', '...","['best', 'years', 'wasted', 'blame', 'depressi...",best year wasted blame depression time see cha...,best year wasted blame depression time see cha...
2470,1,"I want to die I just want to die, I dont even ...",I want to die I just want to die I dont even w...,"['i', 'want', 'to', 'die', 'i', 'just', 'want'...","['want', 'die', 'want', 'die', 'dont', 'want',...",want die want die dont want talk anymore want ...,want die want die dont want talk anymore want ...
2471,1,Can someone please just reassure me that it'll...,Can someone please just reassure me that itll ...,"['can', 'someone', 'please', 'just', 'reassure...","['someone', 'please', 'reassure', 'itll', 'ok'...",someone please reassure itll ok british girl g...,someone please reassure itll ok british girl g...
2472,1,Tired I remember what it feels like to be happ...,Tired I remember what it feels like to be happ...,"['tired', 'i', 'remember', 'what', 'it', 'feel...","['tired', 'remember', 'feels', 'happy', 'actua...",tired remember feel happy actually look forwar...,tired remember feel happy actually look forwar...
2473,1,Corona has ruined everything I’m almost positi...,Corona has ruined everything I’m almost positi...,"['corona', 'has', 'ruined', 'everything', 'i',...","['corona', 'ruined', 'everything', 'almost', '...",corona ruined everything almost positive lost ...,corona ruined everything almost positive lost ...


### Define X and y

In [9]:
# Assign x and y values
X = combined_cleaned['post_cleaned']
y = combined_cleaned['subreddit']

### Baseline score

In [10]:
# Ensures the class is balanced
y.value_counts(normalize=True)

0    0.536444
1    0.463556
Name: subreddit, dtype: float64

In [11]:
baseline_score = y.value_counts(normalize = True)
print(f'Baseline Score: {baseline_score[0]}')

Baseline Score: 0.5364436949597152


### Do train-test split

In [12]:
# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify=y, 
                                                    test_size=0.25,
                                                    random_state=42)

In [13]:
# View how many rows are in train set and test set respectively
print('X_train', X_train.shape)
print('X_test', X_test.shape)

X_train (4002,)
X_test (1335,)


In [14]:
train = pd.merge(X_train, y_train, left_index = True, right_index = True)

test = pd.merge(X_test, y_test, left_index = True, right_index = True)

train.to_csv("./data/train.csv", index = False)
test.to_csv("./data/test.csv", index = False)

### Vectorise words with CountVectoriser and TF-IDF Vectoriser

In [15]:
cvec = CountVectorizer()

X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

print('X_train_cvec', X_train_cvec.shape)
print('X_test_cvec', X_test_cvec.shape)

tvec = TfidfVectorizer()

X_train_tvec = tvec.fit_transform(X_train)
X_test_tvec = tvec.transform(X_test)

print('X_train_tvec', X_train_tvec.shape)
print('X_test_tvec', X_test_tvec.shape)

X_train_cvec (4002, 17245)
X_test_cvec (1335, 17245)
X_train_tvec (4002, 17245)
X_test_tvec (1335, 17245)


### Baseline models

In [16]:
nb = MultinomialNB()
print('Train', cross_val_score(nb, X_train_cvec, y_train, cv=5).mean())

Train 0.8528202247191011


In [17]:
lr = LogisticRegression(max_iter=200)
print('Train', cross_val_score(lr, X_train_cvec, y_train, cv=5).mean())

Train 0.859322097378277


### Modeling

In [18]:
# vectorizer hyperparameters involving ngram, stop word removal, and feature numbers
vec_params_features = {
    "vec__ngram_range": [(1, 1), (1, 2), (1, 3)], 
    "vec__stop_words": [None, "english"], 
    "vec__max_features": [100, 300, 500, 700, 900]}

# vectorizer hyperparameters involving ngram, stop word removal, min document appearance, and max document appearance
vec_params_dfs = {
    "vec__ngram_range": [(1, 1), (1, 2), (1, 3)], 
    "vec__stop_words": [None, "english"], 
    "vec__min_df": [0.1, 0.2, 0.3], 
    "vec__max_df": [0.6, 0.7, 0.8, 0.9]
}

# vectorizer hyperparameters involving all hyperparameters as above, though with more limited options 
vec_params_all = {
    "vec__ngram_range": [(1, 1), (1, 2)], 
    "vec__stop_words": [None, "english"], 
    "vec__min_df": [0.1, 0.2], 
    "vec__max_df": [0.7, 0.8], 
    "vec__max_features": [300, 500, 700]
}

In [19]:
# dictionary to store model metrics in 
# will be transformed to DataFrame at end for easy visualization of performance differences
model_outcomes = {"Transformer": [], 
                  "Estimator": [], 
                  "Parameters": [],
                  "Best Parameters": [], 
                  "Best Score": [], 
                  "Training Score": [], 
                  "Test Score": [], 
                  "Discrepancy": [], 
                  "Runtime": []}

In [20]:
param_dict = {"df_params": vec_params_dfs, "features_params": vec_params_features, 
              "limited_all_params": vec_params_all}

In [21]:
def run(start, end): 
    long = end - start 
    minutes = int(long // 60 )
    seconds = int(round(long - 60 * minutes))
    return f"{minutes}m {seconds}s"

In [22]:
def gridsearch_batch(vectorizer, classifier, parameter_dict, outcomes_dict): 
    parameter_names = list(param_dict.keys())
    
    cycle = 0
    
    time_total = 0 
    for i in param_dict:
        pipe = Pipeline([
            ("vec", vectorizer), 
            ("class", classifier)
        ])
        
        grid = GridSearchCV(pipe, parameter_dict[i], cv = 5)
        
        start = time.time() 
        grid.fit(X_train, y_train)
        end = time.time()
        
        train = grid.score(X_train, y_train)
        test = grid.score(X_test, y_test)
        
        print(f"Model with {parameter_names[cycle]} took {run(start, end)} to run.")
        print(f"Best parameters: \n{grid.best_params_}")
        print(f"Best score: {grid.best_score_}")
        print(f"Training score: {train}")
        print(f"Test score: {test}")
        
        fill = [f"{vectorizer}", f"{classifier}", parameter_names[cycle], grid.best_params_, grid.best_score_, 
                train, test, (train - test), run(start, end)]
        
        count = 0
        for field in outcomes_dict: 
            outcomes_dict[field].append(fill[count])
            count += 1
        
        print("----------")
        
        cycle += 1
        time_total += (end - start)
    print(f"This entire process took {run(0, time_total)}")

In [None]:
### Count Vectoriser with Logistic Regression
gridsearch_batch(CountVectorizer(), LogisticRegression(solver='liblinear', class_weight='balanced'), param_dict, model_outcomes)

In [None]:
### Count Vectoriser with Multinomial Naive-Bayes
gridsearch_batch(CountVectorizer(), MultinomialNB(), param_dict, model_outcomes)

In [None]:
### TF-IDF Vectoriser with Logistic Regression

gridsearch_batch(TfidfVectorizer(), LogisticRegression(solver='liblinear', class_weight='balanced'), param_dict, model_outcomes)

In [None]:
### TF-IDF Vectoriser with Multinomial Naive-Bayes
gridsearch_batch(TfidfVectorizer(), MultinomialNB(), param_dict, model_outcomes)

### Evaluate Models

In [None]:
outcomes = pd.DataFrame(model_outcomes)

outcomes.sort_values(by = "Best Score", ascending = False, inplace = True)

outcomes.reset_index(inplace = True)

outcomes.drop(columns = ["index"], inplace = True)

outcomes

In [None]:
# pulling best parameters for the model that yielded a high accuracy score (91%) with a smaller discrepancy (2.2%)

outcomes["Best Parameters"][3]

### Run Finalised Model

In [None]:
pipe = Pipeline([
                ("vec", TfidfVectorizer(max_features = 900, 
                                        ngram_range = (1, 3), 
                                        stop_words = "english")),
                ("lr", LogisticRegression(solver = "liblinear"))
            ])

pipe.fit(X_train, y_train)

In [None]:
pipe.score(X_train, y_train)

In [None]:
pipe.score(X_test, y_test)

In [None]:
#build pipeline
  ##count vec params unigrams, bigrams, min/max df
  ## model params
#gridsearch cv (what kind of regularisation i want to do)
  #adjust regularisation param



In [None]:
# run models

In [None]:
# metrics
# simplest is accuracy--> only for data we have (TP+TN/all)
# recall -minimise errors TP/(TP+FN) if FN is 0 then recall =1
# take note which are the important tokens

In [None]:
#aoc .5 -1
#worst =.5

# Evaluation and Conceptual Understanding

- Does the student accurately identify and explain the baseline score?
- Does the student select and use metrics relevant to the problem objective?
- Does the student interpret the results of their model for purposes of inference?
- Is domain knowledge demonstrated when interpreting results?
- Does the student provide appropriate interpretation with regards to descriptive and inferential statistics?

# Conclusion and Recommendations

- Does the student provide appropriate context to connect individual steps back to the overall project?
- Is it clear how the final recommendations were reached?
- Are the conclusions/recommendations clearly stated?
- Does the conclusion answer the original problem statement?
- Does the student address how findings of this research can be applied for the benefit of stakeholders?
- Are future steps to move the project forward identified?