In [1]:
# !pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.7.0-cp39-cp39-macosx_10_11_x86_64.whl (207.1 MB)
Collecting h5py>=2.9.0
  Using cached h5py-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl (3.1 MB)
Collecting termcolor>=1.1.0
  Using cached termcolor-1.1.0-py3-none-any.whl
Collecting libclang>=9.0.1
  Using cached libclang-12.0.0-py2.py3-none-macosx_10_9_x86_64.whl (12.2 MB)
Collecting tensorflow-estimator<2.8,~=2.7.0rc0
  Using cached tensorflow_estimator-2.7.0-py2.py3-none-any.whl (463 kB)
Collecting wheel<1.0,>=0.32.0
  Using cached wheel-0.37.0-py2.py3-none-any.whl (35 kB)
Collecting keras-preprocessing>=1.1.1
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting typing-extensions>=3.6.6
  Using cached typing_extensions-4.0.1-py3-none-any.whl (22 kB)
Collecting flatbuffers<3.0,>=1.12
  Using cached flatbuffers-2.0-py2.py3-none-any.whl (26 kB)
Collecting absl-py>=0.4.0
  Using cached absl_py-1.0.0-py3-none-any.whl (126 kB)
Collecting opt-einsum>=2.3.2
  Using cach

In [71]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

tf.random.set_seed(1234)

import string
table = str.maketrans('', '', string.punctuation)

In [72]:
twitter_df = pd.read_csv("data/clean/git_twitter.csv", index_col = "Unnamed: 0")
reddit_df = pd.read_csv("data/clean/reddit.csv", index_col = "Unnamed: 0")
reddit_df = reddit_df.dropna()

In [73]:
twitter_df['Data'] = twitter_df['Data'].str.lower()
reddit_df['Data'] = reddit_df['Data'].str.lower()
twitter_df['Data'] = twitter_df['Data'].apply(lambda x: ' '.join([word.translate(table) for word in x.split()]))
twitter_df['Data'] = twitter_df['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
twitter_df['Data'] = twitter_df['Data'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))
reddit_df['Data'] = reddit_df['Data'].apply(lambda x: ' '.join([word.translate(table) for word in x.split()]))
reddit_df['Data'] = reddit_df['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
reddit_df['Data'] = reddit_df['Data'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))

In [42]:
(train, test) = train_test_split(reddit_df, test_size=0.2, random_state=42, shuffle=True)
(train, val) = train_test_split(train, test_size=0.2, random_state=42, shuffle=True)

In [45]:
train_sentences = train['Data'].to_numpy()
test_sentences = test['Data'].to_numpy()
val_sentences = val['Data'].to_numpy()

train_labels = train['Label'].to_numpy()
test_labels = test['Label'].to_numpy()
val_labels = val['Label'].to_numpy()

In [48]:
vocab_size = 10000
oov_token = "<oov>"

tokeniser = Tokenizer(num_words = vocab_size,oov_token = oov_token)
tokeniser.fit_on_texts(train_sentences)
word_index = tokeniser.word_index
sequences = tokeniser.texts_to_sequences(train_sentences)
padding = pad_sequences(sequences,maxlen=120,truncating='post')

val_sequences = tokeniser.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences,maxlen=120,truncating='post')

testing_sequences = tokeniser.texts_to_sequences(test_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=120,truncating='post')

In [74]:
simple_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size,16,input_length=120),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units = 10,activation="relu"),
    tf.keras.layers.Dense(units = 1, activation="sigmoid")
])

In [75]:
simple_model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
simple_model.fit(padding,train_labels,epochs = 10,validation_data=(val_padded,val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12d234ca0>

In [76]:
output = simple_model.evaluate(testing_padded,  test_labels, verbose=2)
print("The simple model gives us an accuracy of: ", output[1])

107/107 - 0s - loss: 0.6538 - accuracy: 0.8525 - 146ms/epoch - 1ms/step


[0.6538158059120178, 0.8525169491767883]

In [81]:
rnn_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size,16,input_length=120),
    tf.keras.layers.SimpleRNN(units = 6, dropout=0.3, activation="tanh"),
    tf.keras.layers.Dense(units = 1, activation="sigmoid")
])

In [82]:
rnn_model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
rnn_model.fit(padding,train_labels,epochs = 10,validation_data=(val_padded,val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12d777730>

In [83]:
output = rnn_model.evaluate(testing_padded,  test_labels, verbose=2)
print("The Baseline RNN model gives us an accuracy of: ", output[1])

107/107 - 1s - loss: 0.4302 - accuracy: 0.8728 - 583ms/epoch - 5ms/step


[0.43023544549942017, 0.8728289604187012]

In [77]:
lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size,16,input_length=120),
    tf.keras.layers.LSTM(units = 6, dropout=0.3, activation="tanh"),
    tf.keras.layers.Dense(units = 1, activation="sigmoid")
])

In [78]:
lstm_model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
lstm_model.fit(padding,train_labels,epochs = 10,validation_data=(val_padded,val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12c9f2c10>

In [79]:
output = lstm_model.evaluate(testing_padded,  test_labels, verbose=2)
print("The LSTM model gives us an accuracy of: ", output[1])

107/107 - 1s - loss: 0.4122 - accuracy: 0.8752 - 914ms/epoch - 9ms/step


[0.41222134232521057, 0.8751839995384216]

In [84]:
bi_lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size,16,input_length=120),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units = 6, dropout=0.3, activation="tanh")),
    tf.keras.layers.Dense(units = 1, activation="sigmoid")
])

In [85]:
bi_lstm_model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
bi_lstm_model.fit(padding,train_labels,epochs = 10,validation_data=(val_padded,val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12dbc5250>

In [86]:
output = bi_lstm_model.evaluate(testing_padded,  test_labels, verbose=2)
print("The BI-LSTM model gives us an accuracy of: ", output[1])

107/107 - 1s - loss: 0.3872 - accuracy: 0.8893 - 1s/epoch - 10ms/step
The BI-LSTM model gives us an accuracy of:  0.8893141150474548


In [90]:
gru_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size,16,input_length=120),
    tf.keras.layers.GRU(units = 6, dropout=0.3, activation="tanh"),
    tf.keras.layers.Dense(units = 1, activation="sigmoid")
])

In [91]:
gru_model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
gru_model.fit(padding,train_labels,epochs = 10,validation_data=(val_padded,val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12e8d3910>

In [92]:
output = gru_model.evaluate(testing_padded,  test_labels, verbose=2)
print("The GRU model gives us an accuracy of: ", output[1])

107/107 - 1s - loss: 0.3840 - accuracy: 0.8722 - 1s/epoch - 11ms/step
The GRU model gives us an accuracy of:  0.8722401857376099
