In [2]:
import numpy as np
import pandas as pd

In [3]:
import re

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Shared English Snowball stemmer used by the tokenizer below and by the
# stopword preparation.
snow = SnowballStemmer("english")


def snowball_tokens2(text):
    """Return the Snowball stems of the alphabetic words in ``text``.

    Every character outside A-Z / a-z is replaced with a space first, so
    digits and punctuation act as word separators; the remaining
    whitespace-separated pieces are stemmed one by one.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        One stem per word, in the order the words occur in ``text``.
    """
    letters_only = re.sub(r'[^A-Za-z]', ' ', text)
    return [snow.stem(piece) for piece in letters_only.split()]

In [6]:
# Load the scraped subreddit comments. Forward slashes are used because the
# previous '..\data\...' literal relied on the invalid escape sequences '\d'
# and '\c' (a warning on newer Python versions) and only resolved on Windows;
# forward slashes work on every platform, Windows included.
canada_df = pd.read_csv('../data/canada_subreddit_comments.csv')

In [7]:
# Start from NLTK's English stopwords plus a few corpus-specific words, then
# stem them so they are comparable with the stems produced by the tokenizer.
custom_stopwords = stopwords.words('english')
custom_stopwords.extend(['people', 'like', 'canada'])
custom_stopwords = [snow.stem(word) for word in custom_stopwords]

# I'm cheating and adding a few more stopwords here that I identified as highly
# shared between both subreddits; I didn't identify them until after doing
# some more analysis on top tokens. These are appended as written (they are
# not passed through the stemmer).
extra_stopwords = ['get', 'would', 'gt', 'one', 'go', 'make',
                   'actual', 'also', 'back', 'us', 'use', 'could', 'say', 'said', 'see', 'come',
                   'canadian', 'look']

# Merge both lists and drop repeated entries while keeping first-seen order
# ('back' used to be listed twice, and stemming can map several stopwords to
# the same stem). The result is still a plain list.
custom_stopwords = list(dict.fromkeys(custom_stopwords + extra_stopwords))

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Conv1D
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [106]:
# Model inputs: the 'body_processed' text column is the feature and the
# 'subreddit_bin' column is the target.
# NOTE(review): 'subreddit_bin' is presumably a 0/1 subreddit label -- confirm.
X = canada_df['body_processed']
y = canada_df['subreddit_bin']

In [107]:
# A list membership test scans the whole stopword list for every token; a set
# built once gives constant-time lookups and drops exactly the same tokens.
stopword_lookup = set(custom_stopwords)

# Stem each comment, discard stopword stems, and re-join the surviving stems
# into one space-separated string per comment (same order as X).
comments = [
    " ".join(tok for tok in snowball_tokens2(comment) if tok not in stopword_lookup)
    for comment in X
]

In [108]:
# 80/20 split of the cleaned comments, stratified on the label; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [12]:
# Show how many comments landed in the train and test splits.
print("{} {}".format(len(X_train), len(X_test)))

16081 4021


In [177]:
# Text-to-sequence hyperparameters shared by the tokenizer, the padding step
# and the embedding layer.
vocab_size = 4000       # vocabulary cap (Tokenizer num_words / Embedding input size)
embedding_dim = 32      # dimension of each learned word vector
max_length = 50         # every sequence is padded or truncated to this many tokens
trunc_type = "post"     # sequences longer than max_length lose their trailing tokens
padding_type = "post"   # shorter sequences are padded at the end
# NOTE(review): the token is spelled with two zeros ('00V'), not the letters
# 'OOV'; kept as-is because the literal is part of the tokenizer's word index.
oov_tok = "<00V>"

In [178]:
# Fit the word index on the training split only, so the vocabulary is not
# informed by the held-out comments; oov_tok stands in for unknown words.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)
tokenizer.fit_on_texts(X_train)

In [179]:
# Encode the training comments as integer sequences, then pad/truncate them
# to a common length of max_length.
train_sequence = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(
    train_sequence,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [180]:
# Encode the test comments with the tokenizer fitted on the training split
# and pad/truncate them with the same settings as the training sequences.
test_sequence = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(
    test_sequence,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [196]:
# Small binary text classifier: learned embeddings -> a 4-unit LSTM with an
# L2 activity penalty -> a single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(4, activity_regularizer=regularizers.l2(5)),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_48"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 50, 32)            128000    
_________________________________________________________________
lstm_51 (LSTM)               (None, 4)                 592       
_________________________________________________________________
dense_46 (Dense)             (None, 1)                 5         
Total params: 128,597
Trainable params: 128,597
Non-trainable params: 0
_________________________________________________________________


In [198]:
# Adam optimizer with binary cross-entropy loss; accuracy is tracked under
# the name 'acc', which is the metric name the callbacks monitor as 'val_acc'.
opt = optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [67]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import optimizers

In [134]:
# Stop once validation accuracy has not improved for 10 epochs and roll the
# model back to its best weights.
early_stop = EarlyStopping(
    monitor='val_acc',
    patience=10,
    restore_best_weights=True,
)
# After 5 epochs without a validation-accuracy improvement, multiply the
# learning rate by 0.25.
reduce_lr = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.25,
    patience=5,
)

In [199]:

# Validate on a slice of the *training* data instead of the test split.
# Early stopping, best-weight restoration and the LR schedule are all driven
# by val_acc, so validating on (test_padded, y_test) tuned the model on the
# test set and made any later test score optimistic. validation_split holds
# out the last 20% of the training arrays (already shuffled by
# train_test_split) and leaves the test split untouched for final evaluation.
# The labels are converted to a NumPy array because validation_split only
# accepts array/tensor inputs.
model.fit(
    train_padded,
    np.asarray(y_train),
    validation_split=0.2,
    batch_size=512,
    epochs=1000,
    callbacks=[early_stop, reduce_lr],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000


<tensorflow.python.keras.callbacks.History at 0x212f57845e0>