In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV
# splits read later in this notebook live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gzip
import tarfile
import glob
import os
import sklearn
import numpy
import pandas as pd
from sklearn.utils import resample
import keras
!pip install keras_metrics
import keras_metrics
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Embedding, Bidirectional, Dropout, Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, classification_report

Collecting keras_metrics
  Downloading https://files.pythonhosted.org/packages/32/c9/a87420da8e73de944e63a8e9cdcfb1f03ca31a7c4cdcdbd45d2cdf13275a/keras_metrics-1.1.0-py2.py3-none-any.whl
Installing collected packages: keras-metrics
Successfully installed keras-metrics-1.1.0


In [None]:
def _read_split(csv_path):
  """Read one CSV split and discard rows with missing values and duplicate rows."""
  raw_frame = pd.read_csv(csv_path)
  without_missing = raw_frame.dropna()
  return without_missing.drop_duplicates()

# The three splits are stored as separate CSV files in the project's Drive folder.
train_data = _read_split('/content/drive/MyDrive/CIS 530 Project/Milestone 3/train_data.csv')
val_data = _read_split('/content/drive/MyDrive/CIS 530 Project/Milestone 3/val_data.csv')
test_data = _read_split('/content/drive/MyDrive/CIS 530 Project/Milestone 3/test_data.csv')

# NOTE: this list holds references to the frames as they exist right now;
# rebinding one of the names above later does not update the list.
datasets = [train_data, val_data, test_data]

In [None]:
# Downsample non-controversial to balance classes.
# A fixed seed makes both the downsampling and the shuffle reproducible, so a
# Restart & Run All trains on the same rows every time.
RANDOM_STATE = 42

train_non_data = train_data[train_data['contro'] == 0]
train_con_data = train_data[train_data['contro'] == 1]

# Draw (without replacement) as many non-controversial rows as there are
# controversial rows.
train_non_balanced = resample(
    train_non_data,
    replace=False,
    n_samples=len(train_con_data),
    random_state=RANDOM_STATE,
)

# Recombine the two classes and shuffle so they are interleaved.
combined = pd.concat([train_con_data, train_non_balanced])
train_data = combined.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# 'Unnamed: 0' is a leftover column from the CSV; errors='ignore' keeps this
# cell re-runnable after the column has already been dropped.
train_data = train_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK resources this notebook asks for, in the same order as before.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
  nltk.download(resource_name)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Single shared lemmatizer instance, created once and reused for every word.
lemmatizer = WordNetLemmatizer()

def lemmatize_word(word):
  """Return `word` as lemmatized by the shared WordNet lemmatizer (default arguments)."""
  lemma = lemmatizer.lemmatize(word)
  return lemma

# English stopwords from NLTK, stored as a set for O(1) membership checks.
words_to_ignore = set(stopwords.words('english'))
# Splits text into runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

def clean_str(s):
  """Tokenize `s`, remove English stopwords, lemmatize, and re-join with spaces.

  Args:
    s: Raw comment text.

  Returns:
    A single string of the surviving, lemmatized tokens separated by one space.
    The original casing of the kept tokens is preserved.
  """
  tokenized_words = tokenizer.tokenize(s)
  # The stopword list is lowercase, so compare case-insensitively; otherwise
  # capitalised stopwords such as "He", "To", "But" or "I" slip through.
  kept_words = [
      lemmatize_word(word)
      for word in tokenized_words
      if word.lower() not in words_to_ignore
  ]
  return ' '.join(kept_words)

In [None]:
# Rebuild the list from the *current* frames before cleaning. `datasets` was
# created when the CSVs were loaded, but `train_data` has since been rebound to
# the balanced frame; iterating the old list would clean the stale, unbalanced
# frame and leave the text that is actually tokenized for training uncleaned
# (while validation and test text is cleaned).
datasets = [train_data, val_data, test_data]
for dataset in datasets:
  dataset['comment_text'] = dataset['comment_text'].apply(clean_str)

In [None]:
# Vocabulary cap handed to the Keras tokenizer (and reused as the embedding input size).
max_features = 80000
keras_tokenizer = Tokenizer(num_words=max_features)

# The vocabulary is fit on the training text only; validation and test text is
# merely converted with that vocabulary.
train_texts = train_data['comment_text'].to_list()
keras_tokenizer.fit_on_texts(train_texts)

tokenized_training_data = keras_tokenizer.texts_to_sequences(train_texts)
tokenized_validation_data = keras_tokenizer.texts_to_sequences(val_data['comment_text'].to_list())
tokenized_testing_data = keras_tokenizer.texts_to_sequences(test_data['comment_text'].to_list())

# Labels come from the 'contro' column of each split, as numpy arrays.
train_labels, val_labels, test_labels = (
    numpy.array(frame['contro'].to_list())
    for frame in (train_data, val_data, test_data)
)

In [None]:
# Padding: bring every token sequence to exactly `max_len` entries so the
# splits can be fed to the model as fixed-width arrays.
max_len = 300
pad_train, pad_val, pad_test = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (
        tokenized_training_data,
        tokenized_validation_data,
        tokenized_testing_data,
    )
)

In [None]:
# The model: embedding -> two stacked bidirectional LSTMs -> max-pool over the
# sequence axis -> single sigmoid unit for the binary contro / non-contro label.
embedding = 256
model = Sequential([
    Embedding(max_features, embedding, input_length=max_len),
    Dropout(.15),
    # Both LSTMs return full sequences so the pooling layer sees every timestep.
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(.15),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()],
)

In [None]:
# Train on the padded training split and evaluate on the validation split
# after each epoch.
epochs = 2
model.fit(
    x=pad_train,
    y=train_labels,
    epochs=epochs,
    validation_data=(pad_val, val_labels),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f9f48d7c350>

In [None]:
# Run inference once and derive the hard labels from the probabilities.
# `Sequential.predict_classes` is deprecated/removed in newer TensorFlow; for a
# single sigmoid output its documented replacement is thresholding at 0.5.
predictions_confidence = model.predict(pad_test, batch_size=128)
predictions = (predictions_confidence > 0.5).astype('int32')

# The model output has one column per sample; unwrap each row to a scalar.
predictions = [x for [x] in predictions]
predictions_confidence = [x for [x] in predictions_confidence]

# Show only a short preview instead of dumping every test prediction.
preview_size = 20
print(predictions_confidence[:preview_size])
print(predictions[:preview_size])

target_names = ['non-contro', 'contro']
print(classification_report(test_labels, predictions, target_names=target_names))



[0.0025650703, 0.00422428, 0.0010008095, 0.0010549406, 0.010414558, 0.0027232273, 0.0015118382, 0.00096770015, 0.002232127, 0.99970585, 0.0020380332, 0.0069253114, 0.0016067299, 0.99851876, 0.002222361, 0.0026033497, 0.024194773, 0.032977104, 0.015129676, 0.01338973, 0.0044947616, 0.0010680172, 0.047987167, 0.01106406, 0.00092266797, 0.0022683719, 0.0038308473, 0.023315499, 0.0028266003, 0.13637388, 0.005609531, 0.0013820739, 0.0050406386, 0.052069105, 0.0027315302, 0.0024779886, 0.11364103, 0.03507637, 0.999572, 0.0035381431, 0.0011668974, 0.0032932095, 0.0015761994, 0.0041468893, 0.42429417, 0.0017907575, 0.0064267474, 0.14874241, 0.0031383396, 0.3412825, 0.0023242603, 0.0016585989, 0.04298273, 0.018979497, 0.022080734, 0.4564203, 0.12574333, 0.004082907, 0.015914366, 0.0038184626, 0.00145746, 0.6560963, 0.0056460593, 0.9855289, 0.0049628033, 0.9985637, 0.46081474, 0.0024493167, 0.008669127, 0.0012579975, 0.4218112, 0.33182856, 0.005291238, 0.2936266, 0.010262487, 0.0013735016, 0.335

In [None]:
def _first_mismatch_indices(predicted, expected, predicted_label, expected_label, limit):
  """Return positions where the prediction and the true label form the given pair.

  Args:
    predicted: Sequence of predicted labels.
    expected: Sequence of true labels, aligned with `predicted`.
    predicted_label: Predicted value to look for.
    expected_label: True value to look for.
    limit: Stop after this many positions have been collected.

  Returns:
    A list of at most `limit` positions, in ascending order.
  """
  found = []
  for position, (guess, truth) in enumerate(zip(predicted, expected)):
    if len(found) == limit:
      break
    if guess == predicted_label and truth == expected_label:
      found.append(position)
  return found


actual = test_labels

# Collect up to 100 misclassified positions per error type, then show up to
# 5 example comments for each.
max_collected = 100
num_examples = 5

# indices of FP (predicted controversial, labeled non-controversial)
fp_indices = _first_mismatch_indices(predictions, actual, 1, 0, max_collected)
fp_count = len(fp_indices)

# indices of FN (predicted non-controversial, labeled controversial)
fn_indices = _first_mismatch_indices(predictions, actual, 0, 1, max_collected)
fn_count = len(fn_indices)

# Slicing (rather than indexing 0..4) avoids an IndexError when fewer than
# `num_examples` errors of either kind exist. `iloc` is positional, matching
# the positions of `test_labels`, which were built from `test_data` in order.
fp_comments = [test_data.iloc[i]['comment_text'] for i in fp_indices[:num_examples]]
fn_comments = [test_data.iloc[i]['comment_text'] for i in fn_indices[:num_examples]]

print('Comments Identified as Controversial that were Actually Labeled as Non-Controversial:')
for num, comment in enumerate(fp_comments):
  print(num, comment)
print()
print('Comments Identified as Non-Controversial that were Actually Labeled as Controversial:')
for num, comment in enumerate(fn_comments):
  print(num, comment)

print(len(fp_indices))
print(len(fn_indices))

Comments Identified as Controversial that were Actually Labeled as Non-Controversial:
0 He appear bit fantasist To fair carved nice little career fool mug buy book believe say But like many people contributing page I come background whenever name mentioned response always Dave No one ever heard book started coming
1 Please unblock delete page I request unblock talk page immediately permanently delete content You free permanently block contributing Wikipedia I intention case If would like reason block prove I violated rule need preserve talk page Here another deliberate intentionally provocative violation rule old school personal insult You loathsome sanctimonious hypocritical ignorant twit If state uncontrolled combustion I would take trouble extinguish flame urinating I hope satisfactory
2 I really know sure I talked Wikipedia IRC He say violate WP FUC I sure
3 Abuse admin power Being discriminatory unnaceptable
4 Homosexuality Hi everyone I would like come admit homosexuality Cheers 