### Michael Li (ml5803) and Kaixuan Zhou (kz1005) <br/>
### Neural Network
### Text Classification: Toxic, Information, Sports, Religious, and Advertisement

# Initialization and Imports


In [0]:
# Silence every Python warning raised from here on.
import warnings
warnings.simplefilter(action="ignore")

In [0]:
# Authenticate the current user through the Colab helper (this cell only
# works inside Google Colab, since it imports `google.colab`).
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

# gspread client authorised with the application-default credentials;
# `gc` is the handle used to open the spreadsheet that holds the dataset.
# NOTE(review): `oauth2client` is deprecated upstream — confirm the installed
# gspread version still accepts these credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [3]:
#import libraries

import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Upper bound used when slicing the sheet rows below.
records = 30000

# Pull every row of the first worksheet of 'output_copy' as lists of strings.
worksheet = gc.open('output_copy').sheet1
rows = worksheet.get_all_values()

# Row 0 holds the column names; the rows after it (up to index `records`) are the samples.
header, data = rows[0], rows[1:records]

print(header)

['Handle', 'TweetID', 'Tweet', 'Toxic', 'Information', 'Sports', 'Religious', 'Advertisement', 'Classification']


In [0]:
#Uncomment if you want the dataset to be balanced

# balanced_data = []
# count = [0, 0, 0, 0, 0]
# for tweet in data:
#     count[int(tweet[-1])] += 1

# new_count = [0, 0, 0, 0, 0]
# min_count = min(count)
# for tweet in data:
#     if new_count[int(tweet[-1])] < min_count:
#         balanced_data.append(tweet)
#         new_count[int(tweet[-1])] += 1

# data = balanced_data
# print(len(data))

In [5]:
# Build the DataFrame, shuffle it reproducibly and split it 2/3 train, 1/3 test.

# Fixed seed: without it every run produces a different shuffle, hence a
# different split and different (non-comparable) accuracy numbers.
SEED = 42

df = pd.DataFrame(data, columns=header)
# Shuffle all rows and renumber the index from 0.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

np_arr = np.array(df)

# First two thirds of the shuffled rows go to training, the rest to testing.
# Integer arithmetic gives the same cut-off as int(len(data) // 1.5).
train_ind = len(df) * 2 // 3
df_train = df[:train_ind]
df_test = df[train_ind:]
print(df_train.shape, df_test.shape)

(14076, 9) (7039, 9)


# Setup and helper functions

In [6]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
import tensorflow.keras.backend as K
# Reset Keras' global state so the models built below start from a clean session.
K.clear_session()

import re, string

# Matches a single punctuation character: every ASCII punctuation mark plus a
# few typographic symbols. The ASCII set must go through re.escape because it
# is placed inside a character class: unescaped, the sequence `\]` is read as
# an escaped `]`, which silently drops the backslash from the set.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split text into word and punctuation tokens.

    Each punctuation character matched by ``re_tok`` is surrounded with
    spaces so that it becomes a token of its own, then the text is split on
    whitespace.

    Args:
        s: The text to tokenize.

    Returns:
        A list of string tokens; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

# Code for Neural Network

In [0]:
# Map each tweet to a TF-IDF vector over unigrams and bigrams.
# The boolean options are passed as real booleans: recent scikit-learn
# releases validate parameter types and reject integers such as 1.

vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True, stop_words='english')
# Fit on the training tweets only, then apply the fitted vectorizer to the test tweets.
trn_term_doc = vec.fit_transform(df_train["Tweet"])
test_term_doc = vec.transform(df_test["Tweet"])

In [8]:
# Build the network: TF-IDF features -> sigmoid hidden layer -> softmax output.

nin = trn_term_doc.shape[1]  # input width = number of columns of the term-document matrix
nh = 100                     # units in the hidden layer
nout = 5                     # units in the output layer
model = Sequential([
    Dense(units=nh, input_shape=(nin,), activation='sigmoid', name='hidden'),
    Dense(units=nout, activation='softmax', name='output'),
])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [9]:
# Print the layers, their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden (Dense)               (None, 100)               1704500   
_________________________________________________________________
output (Dense)               (None, 5)                 505       
Total params: 1,705,005
Trainable params: 1,705,005
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras import optimizers

# Adam optimizer with a 0.001 step size. `learning_rate` is the supported
# keyword; `lr` is a deprecated alias that recent Keras releases no longer accept.
opt = optimizers.Adam(learning_rate=0.001) # beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# Sparse categorical cross-entropy: targets are integer class ids, not one-hot vectors.
model.compile(optimizer=opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

### Individual Neural Networks for classifying into topics - No Dropout.



In [0]:
# labels = ['Toxic', 'Information', 'Sports', 'Religious', 'Advertisement']
# info = {}

# vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
#               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
#               smooth_idf=1, sublinear_tf=1, stop_words='english')
# trn_term_doc = vec.fit_transform(df_train["Tweet"])
# test_term_doc = vec.transform(df_test["Tweet"])

# hists = {}
# for label in labels:
#   #reset model and create model per label
#   K.clear_session()
#   nin = trn_term_doc.shape[1] # dimension of input data
#   nh = 100  # number of hidden units
#   nout = 2 #either 1 or 0 for a label
#   model = Sequential()
#   model.add(Dense(units=nh, input_shape=(nin,), activation='sigmoid', name='hidden'))
#   model.add(Dense(units=nout, activation='softmax', name='output'))
#   opt = optimizers.Adam(lr=0.001) # beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
#   model.compile(optimizer=opt,
#                 loss='sparse_categorical_crossentropy',
#                 metrics=['accuracy'])

#   ytr = df_train[label]
#   yts = df_test[label]
#   hist = model.fit(trn_term_doc, ytr, epochs=30, batch_size=100, validation_data=(test_term_doc,yts))
#   hists[label] = hist
#   final_acc = hist.history['val_acc'][-1]
#   info[label] = final_acc


### Neural Network with Dropout

In [11]:
# Train one binary classifier (with dropout) per topic label and record the
# validation accuracy reached after the last epoch.
labels = ['Toxic', 'Information', 'Sports', 'Religious', 'Advertisement']
info = {}   # label -> final validation accuracy

# TF-IDF features over unigrams and bigrams. Boolean options are passed as real
# booleans because recent scikit-learn releases reject integers such as 1.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
              min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
              smooth_idf=True, sublinear_tf=True, stop_words='english')
# Fit on the training tweets only, then apply the fitted vectorizer to the test tweets.
trn_term_doc = vec.fit_transform(df_train["Tweet"])
test_term_doc = vec.transform(df_test["Tweet"])

hists = {}  # label -> Keras History object returned by fit()
for label in labels:
    # Reset Keras state and build a fresh model for every label.
    K.clear_session()
    nin = trn_term_doc.shape[1] # dimension of input data
    nh = 100  # number of hidden units
    nout = 2 #either 1 or 0 for a label
    model = Sequential()
    model.add(Dense(units=nh, input_shape=(nin,), activation='sigmoid', name='hidden'))
    model.add(Dropout(0.5))
    model.add(Dense(units=nout, activation='softmax', name='output'))
    # `learning_rate` replaces the deprecated `lr` alias, which recent Keras
    # releases no longer accept.
    opt = optimizers.Adam(learning_rate=0.001) # beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=opt,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

    ytr = df_train[label]
    yts = df_test[label]
    hist = model.fit(trn_term_doc, ytr, epochs=30, batch_size=100, validation_data=(test_term_doc,yts))
    hists[label] = hist
    # The validation-accuracy entry is named 'val_acc' by TF 1.x and
    # 'val_accuracy' by TF 2.x; accept either so the lookup cannot KeyError.
    val_acc_key = 'val_accuracy' if 'val_accuracy' in hist.history else 'val_acc'
    final_acc = hist.history[val_acc_key][-1]
    info[label] = final_acc

Train on 14076 samples, validate on 7039 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train on 14076 samples, validate on 7039 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train on 14076 samples, validate on 7039 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoc

In [12]:
# Report the final validation accuracy stored for each label.

for elem, acc in info.items():
    print(elem, ":", acc)

Toxic : 0.993465
Information : 0.9239949
Sports : 0.9678932
Religious : 0.94217926
Advertisement : 0.95354456
