In [1031]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict

In [1032]:
# load data
data = pd.read_csv('reddit_data.csv')
data.head()

Unnamed: 0,title,body,class
0,[AITA] I wrote an explanation in TIL and came ...,[Here is the post in question](http://www.redd...,True
1,[AITA] Threw my parent's donuts away,"My parents are diabetic, morbidly obese, and a...",True
2,I told a goth girl she looked like a clown.,I was four.,False
3,AITA Had a disagreement about Les Miserables w...,I love the musical *Les Miserables*. A coworke...,True
4,"[AITA] I 'hacked' our house router, changed th...",Backstory: I'm a semi-professional competitive...,True


For the naive bayes classifier, we will aggregate text from title and body of submission.

In [1033]:
text = [title + ' ' + body for title, body in zip( data['title'], data['body'] )]
data['text'] = text
data = data.drop(['title', 'body'], axis=1)
data.head()

Unnamed: 0,class,text
0,True,[AITA] I wrote an explanation in TIL and came ...
1,True,[AITA] Threw my parent's donuts away My parent...
2,False,I told a goth girl she looked like a clown. I ...
3,True,AITA Had a disagreement about Les Miserables w...
4,True,"[AITA] I 'hacked' our house router, changed th..."


We also want to drop the shortest records from the dataset. Since less than 10% of all records are less than 500 characters long, we'll drop those.

In [1034]:
print(data.shape)
data['text_len'] = [len(text) for text in data['text']]
print(data['text_len'].mean())
data = data[data['text_len'] > 500]
print(data['text_len'].mean())
data = data.drop(['text_len'], axis=1)
data.shape

(2431, 2)
2007.0255039078568
2098.5321180555557


(2304, 2)

## Preprocessing
Using NLTK modules, we can remove stopwords (very common words that contribute little semantic content), remove punctuation, remove grammatical endings, and turn every string into a vector of tokens.

In [1035]:
stop_words = set(stopwords.words('english'))
stop_words.add('aita') # common tag found in title
lemmatizer = WordNetLemmatizer() # turns words into their base form, e.g. ran -> run
tokenizer = RegexpTokenizer(r"\w+") # remove punctuation and tokenize

def preprocess(string):
    str_lower = string.lower()
    str_no_underscore = str_lower.replace('_', ' ')
    # remove numbers
    str_no_num = re.sub(r'\d+', '', str_no_underscore)
    # tokenize and remove stopwords
    tokens = tokenizer.tokenize(str_no_num)
    tokens = [w for w in tokens if w not in stop_words]
    # stem words
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens

data['text'] = [preprocess(text) for text in data['text']]
data.head()

Unnamed: 0,class,text
0,True,"[wrote, explanation, til, came, condescending,..."
1,True,"[threw, parent, donut, away, parent, diabetic,..."
3,True,"[disagreement, le, miserables, coworker, love,..."
4,True,"[hacked, house, router, changed, password, tur..."
5,True,"[atia, permanently, give, customer, decaf, nam..."


## Removing Hyper-Uncommon Words

For our model to function best, we should remove any words that only appear in a few records. Most words appear in less than 10 documents, and so are considered noise. We'll also make a "sparse" dataset that only includes words that appear in at least 100 documents.

In [1036]:
all_word_counts = defaultdict(list)

for index, row in data.iterrows():
    text = row['text']
    for word in text:
        all_word_counts[word].append(index)

uncommon = {word: count for word, count in all_word_counts.items() if len(count) <= 10}
sparse_uncommon = {word: count for word, count in all_word_counts.items() if len(count) <= 100}
print(len(all_word_counts))
print('uncommon:', len(uncommon))
print('sparse_uncommon:', len(uncommon))

16741
uncommon: 13121
sparse_uncommon: 13121


Now we can go back to the records where these words appeared and remove them. Fortunately, it looks like no records were rendered empty by this.

In [1037]:
sparse_data = data.copy()
# pandas doesn't copy recursively
sparse_data['text'] = [text.copy() for text in data['text']]

for word, indices in uncommon.items():
    for index in indices:
        data.at[index, 'text'].remove(word)
print('data before:', data.shape)
data = data[[bool(x) for x in data['text']]]
print('data after:', data.shape)


for word, indices in sparse_uncommon.items():
    for index in indices:
        sparse_data.at[index, 'text'].remove(word)
print('sparse data before:',sparse_data.shape)
data = data[[bool(x) for x in data['text']]]
print('sparse data after:',sparse_data.shape)

data before: (2304, 2)
data after: (2304, 2)
sparse data before: (2304, 2)
sparse data after: (2304, 2)


## Vectorizing Word Counts
In order to perform Naive Bayes, we need to turn each record's text into a vector of word counts.

In [1038]:
# turn each list of tokens into a string to match input CountVectorizer expects
data['text'] = [' '.join(sentence) for sentence in data['text']]
sparse_data['text'] = [' '.join(sentence) for sentence in sparse_data['text']]

count_vec = CountVectorizer()
text_counts = count_vec.fit_transform(data['text'])
sparse_text_counts = count_vec.fit_transform(sparse_data['text'])
text_counts.shape

(2304, 3603)

We then can turn the raw counts into counts of Term Frequency time inverse document frequency.
This divides each word count by the total words in the document, and adjusts weights for words
that occur in all documents.

In [1039]:
tfidf_transformer = TfidfTransformer()
transformed_counts = tfidf_transformer.fit_transform(text_counts)
sparse_transformed_counts = tfidf_transformer.fit_transform(sparse_text_counts)
transformed_counts.shape

(2304, 3603)

## Balancing with SMOTE

Since we see a large class imbalance, we can run SMOTE to oversample the True class.

In [1040]:
y = data['class']
x = transformed_counts

y_sparse = sparse_data['class']
x_sparse = sparse_transformed_counts

x_unbalanced, y_unbalanced = shuffle(x, y)
x_sparse_unbalanced, y_sparse_unbalanced = shuffle(x_sparse, y_sparse)

print(transformed_counts.shape)
print(list(y_unbalanced).count(True))
sm = SMOTE('minority')
x_balanced, y_balanced = sm.fit_resample(transformed_counts, y)
x_sparse_balanced, y_sparse_balanced = sm.fit_resample(sparse_transformed_counts, y_sparse)
print(x_balanced.shape)
print(list(y_balanced).count(True))

(2304, 3603)
630
(3348, 3603)
1674


# Creating the Model

First we will try a Multinomial Naïve Bayes model.

In [1041]:
mnb = MultinomialNB()
param_grid = {
    'alpha': [i/10 for i in range(5,12,2)],
    'fit_prior': (False, True),
}
# balanced data
gridcv = GridSearchCV(mnb, param_grid)
gridcv.fit(x_balanced, y_balanced)
print(gridcv.best_params_)
print(gridcv.best_score_)

{'alpha': 0.5, 'fit_prior': False}
0.7195988666532807


It's accuracy and recall scores are fair.

In [1044]:
# balanced data
y_pred = cross_val_predict(gridcv, x_balanced, y_balanced, cv=10)
report = classification_report(y_balanced, y_pred)
print(report)

precision    recall  f1-score   support

       False       0.78      0.66      0.71      1674
        True       0.70      0.81      0.75      1674

    accuracy                           0.74      3348
   macro avg       0.74      0.74      0.73      3348
weighted avg       0.74      0.74      0.73      3348



Running the same model on unbalanced data gives us much worse results. Since the False class represents 73% of the data, the model vastly over predicts the 'False' label, giving it a fair overall accuracy but almost 0% recall of the True class.

In [1045]:
# unbalanced data
gridcv.fit(x_unbalanced, y_unbalanced)
print(gridcv.best_params_)
print(gridcv.best_score_)
y_pred = cross_val_predict(gridcv, x_unbalanced, y_unbalanced, cv=10)
report = classification_report(y_unbalanced, y_pred)
print(report)

{'alpha': 0.5, 'fit_prior': True}
0.726562293690465
              precision    recall  f1-score   support

       False       0.73      1.00      0.84      1674
        True       0.00      0.00      0.00       630

    accuracy                           0.73      2304
   macro avg       0.36      0.50      0.42      2304
weighted avg       0.53      0.73      0.61      2304



The sparse data set gives us similar results.

In [1046]:
# sparse balanced data
gridcv.fit(x_sparse_balanced, y_sparse_balanced)
print(gridcv.best_params_)
print(gridcv.best_score_)
y_pred = cross_val_predict(gridcv, x_balanced, y_balanced, cv=10)
report = classification_report(y_balanced, y_pred)
print(report)

{'alpha': 0.7, 'fit_prior': False}
0.613855386743413
              precision    recall  f1-score   support

       False       0.78      0.66      0.71      1674
        True       0.70      0.81      0.75      1674

    accuracy                           0.74      3348
   macro avg       0.74      0.74      0.73      3348
weighted avg       0.74      0.74      0.73      3348



In [1047]:
# sparse unbalanced data
gridcv = GridSearchCV(mnb, param_grid)
gridcv.fit(x_sparse_unbalanced, y_sparse_unbalanced)
print(gridcv.best_params_)
print(gridcv.best_score_)
y_pred = cross_val_predict(gridcv, x_sparse_unbalanced, y_sparse_unbalanced, cv=10)
report = classification_report(y_sparse_unbalanced, y_pred)
print(report)

{'alpha': 0.5, 'fit_prior': True}
0.726562293690465
              precision    recall  f1-score   support

       False       0.73      1.00      0.84      1674
        True       0.00      0.00      0.00       630

    accuracy                           0.73      2304
   macro avg       0.36      0.50      0.42      2304
weighted avg       0.53      0.73      0.61      2304



We also trained a Complement Naïve Bayes model. Scikitlearn documentation indicates the Complement Naïve Bayes models tend to outperform Multinomial models in text classification tasks, and in particular are better at handling unbalanced data.
However, it performed about on par with the Multinomial Naïve Bayes with balanced and unbalanced data.

In [1048]:
cnb = ComplementNB()
param_grid = {
    'alpha': [i/10 for i in range(5,12,2)],
    'fit_prior': (False, True),
    'norm': (False, True)
}
gridcv = GridSearchCV(cnb, param_grid)
# with balanced data
gridcv.fit(x_balanced, y_balanced)
print(gridcv.best_params_)
print(gridcv.best_score_)
y_pred = cross_val_predict(gridcv, x_balanced, y_balanced, cv=10)
report = classification_report(y_balanced, y_pred)
print(report)

{'alpha': 0.5, 'fit_prior': False, 'norm': False}
0.7195988666532807
              precision    recall  f1-score   support

       False       0.78      0.66      0.71      1674
        True       0.70      0.81      0.75      1674

    accuracy                           0.74      3348
   macro avg       0.74      0.74      0.73      3348
weighted avg       0.74      0.74      0.73      3348



In [1049]:
# with unbalanced data
gridcv.fit(x_unbalanced, y_unbalanced)
print(gridcv.best_params_)
print(gridcv.best_score_)
y_pred = cross_val_predict(gridcv, x_unbalanced, y_unbalanced, cv=10)
report = classification_report(y_unbalanced, y_pred)
print(report)

{'alpha': 0.9, 'fit_prior': False, 'norm': True}
0.7261256248231632
              precision    recall  f1-score   support

       False       0.73      0.99      0.84      1674
        True       0.25      0.00      0.01       630

    accuracy                           0.72      2304
   macro avg       0.49      0.50      0.42      2304
weighted avg       0.60      0.72      0.61      2304



Interestingly, though the CNB's performance on sparse balanced data is a little worse than that of MNB, it has a non-zero recall of 'False' on sparse *unbalanced* data.

In [1050]:
# sparse balanced data
gridcv.fit(x_sparse_balanced, y_sparse_balanced)
print(gridcv.best_params_)
print(gridcv.best_score_)
y_pred = cross_val_predict(gridcv, x_sparse_balanced, y_sparse_balanced, cv=10)
report = classification_report(y_sparse_balanced, y_pred)
print(report)

{'alpha': 0.7, 'fit_prior': False, 'norm': False}
0.613855386743413
              precision    recall  f1-score   support

       False       0.63      0.57      0.60      1674
        True       0.61      0.66      0.63      1674

    accuracy                           0.62      3348
   macro avg       0.62      0.62      0.62      3348
weighted avg       0.62      0.62      0.62      3348



In [1051]:
# sparse unbalanced data
gridcv.fit(x_sparse_unbalanced, y_sparse_unbalanced)
print(gridcv.best_params_)
print(gridcv.best_score_)
y_pred = cross_val_predict(gridcv, x_sparse_unbalanced, y_sparse_unbalanced, cv=10)
report = classification_report(y_sparse_unbalanced, y_pred)
print(report)

{'alpha': 1.1, 'fit_prior': False, 'norm': True}
0.6436659436008677
              precision    recall  f1-score   support

       False       0.75      0.74      0.74      1674
        True       0.33      0.35      0.34       630

    accuracy                           0.63      2304
   macro avg       0.54      0.54      0.54      2304
weighted avg       0.64      0.63      0.63      2304



This is the only model that does more than predict 'False' overwhelmingly when trained on unbalanced data.

Overall, our scores could have likely been improved with more data. Text classification is typically very data intensive, and we had to throw out the vast majority of unique words in order to achieve any results.