### Michael Li (ml5803) and Kaixuan Zhou (kz1005)
#### Text Classification : Toxic, Information, Sports, Religious, and Advertisment	


# Initialization and imports

In [0]:
#Let's ignore the warnings...
import warnings
warnings.simplefilter("ignore")

In [0]:
#Authenticate through Google Collab
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

In [0]:
#import libraries

import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import re, string

from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Tweet parsing functions

In [0]:
def get_hashtags(text):
    sentences = text.split(".")
    words = [word for sentence in sentences for word in sentence.split()]
    all_hashtags = [word[1:] for word in words if word.startswith('#')]
    for tag in all_hashtags:
        text = text.replace("#" + tag, "")
    
    results = []
    for i in range(len(all_hashtags)):
        result = []
        for j in range(len(all_hashtags[i])):
            if not (all_hashtags[i][j] >= 'a' and all_hashtags[i][j] <= 'z') and not (all_hashtags[i][j] >= 'A' and all_hashtags[i][j] <= 'Z'):
                pass
            else:
                result.append(all_hashtags[i][j])
        results.append(''.join(result))
    return (';'.join(results), text)

def get_ats(text):
    sentences = text.split(".")
    words = [word for sentence in sentences for word in sentence.split()]
    all_hashtags = [word[1:] for word in words if word.startswith('@')]
    for tag in all_hashtags:
        text = text.replace("#" + tag, "")
    
    results = []
    for i in range(len(all_hashtags)):
        result = []
        for j in range(len(all_hashtags[i])):
            if not (all_hashtags[i][j] >= 'a' and all_hashtags[i][j] <= 'z') and not (all_hashtags[i][j] >= 'A' and all_hashtags[i][j] <= 'Z'):
                pass
            else:
                result.append(all_hashtags[i][j])
        results.append(''.join(result))
    return (';'.join(results), text)

def get_words(text):
    word_start = False
    start = 0
    result = []
    for index in range(len(text)):
	    char = text[index]
	    if not char.isalpha() and not word_start:
	        continue
	    else:
	        # we are reading a word
	        # update the start index
	        if not word_start:
	            word_start = True
	            start = index
	        # reach the end of a word
	        if not char.isalpha():
	            word_start = False
	            result.append(text[start:index])
    
    if word_start and start != len(text) - 1:
        result.append(text[start:len(text)])
    return " ".join(result)

def get_links(text):
    urls = re.findall(r'(https?://\S+)', text)
    for i in range(len(urls)):
        if urls[i][-1] == '.' or urls[i][-1] == '!' or urls[i][-1] == '?':
            urls[i] = urls[i][:len(urls[i]) - 1]
    for url in urls:
        text = text.replace(url, '')
    return (';'.join(urls), text)

In [0]:
def cleaned_text(data):
    persons, text = get_ats(data)
    hashtags, text = get_hashtags(text)
    hyper_links, text = get_links(text)
    cleaned_text = get_words(text)
    return cleaned_text


re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

# Code

Now that we have the libraries imported and helper functions written, we can start writing code.
<br />
Steps: <br />
1. Load data from Google Sheets
2. Read data into dataframe
3. Convert tweet text into TFID vector
4. Use TFID vector to create models
5. Cross Validation through KFolds

In [66]:
#Load data
worksheet = gc.open('output_copy').sheet1
records = 30000
rows = worksheet.get_all_values()
header = rows[0]
data = rows[1:records]
print(rows[0])

['Handle', 'TweetID', 'Tweet', 'Toxic', 'Information', 'Sports', 'Religious', 'Advertisment', 'Classification']


In [67]:
balanced_data = []
count = [0, 0, 0, 0, 0]
for tweet in data:
    count[int(tweet[-1])] += 1

new_count = [0, 0, 0, 0, 0]
min_count = min(count)
for tweet in data:
    if new_count[int(tweet[-1])] < min_count:
        balanced_data.append(tweet)
        new_count[int(tweet[-1])] += 1

data = balanced_data
print(len(data))

9610


In [68]:
#Read data into dataframe

df = pd.DataFrame(data, columns = header) 
#shuffle df
df = df.sample(frac= 1).reset_index(drop=True)
np_arr = np.array(df)

df

Unnamed: 0,Handle,TweetID,Tweet,Toxic,Information,Sports,Religious,Advertisment,Classification
0,ESPNCFB,1195828454316527616,RT @espn: HE SOARED FOR THIS INT 😲 https://t.c...,0,0,1,0,0,2
1,Forbes,1200826826043473920,Sometimes being selfish creates a win-win http...,0,1,0,0,0,1
2,museumofBible,1186427351313326080,"RT @KaeDeeDesigns: On November 7, an ancient J...",0,0,0,1,0,3
3,Nike,1179717929136345088,@inasuc The motor on those motorcycles doesn't...,0,0,0,0,1,4
4,Xbox,1199743850354479105,@TikaaniOfficial Can we say all of it is our f...,0,0,0,0,1,4
...,...,...,...,...,...,...,...,...,...
9605,NVIDIAGeForce,1191409282832424960,RT @theScoreesports: Congrats to the TEAM USA ...,0,0,0,0,1,4
9606,ReviewReligions,1187457542412210176,RT @alislam: New Release: Signs of the Living ...,0,0,0,1,0,3
9607,BR_NBA,1200592197390753797,Brandon Clarke 🤯\n\nWhat a finish off the lob....,0,0,1,0,0,2
9608,NFLonFOX,1198802138601639937,"""We don't flinch."" 🔒\n\n@ErinAndrews caught up...",0,0,1,0,0,2


In [69]:
df.describe()

Unnamed: 0,Handle,TweetID,Tweet,Toxic,Information,Sports,Religious,Advertisment,Classification
count,9610.0,9610.0,9610,9610,9610,9610,9610,9610,9610
unique,42.0,7690.0,9561,2,2,2,2,2,5
top,,,Here are some out-of-the box ideas to get stud...,0,0,0,0,0,4
freq,1021.0,1021.0,5,7688,7688,7688,7688,7688,1922


In [70]:
#data split into train and test
train_ind = int(len(data) // 1.5)
df_train = df[:train_ind]
df_test = df[train_ind:]
print(df_train.shape, df_test.shape)

(6406, 9) (3204, 9)


In [0]:
#Convert tweet to TFID vector
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, stop_words='english')
trn_term_doc = vec.fit_transform(df_train["Tweet"])
test_term_doc = vec.transform(df_test["Tweet"])

In [0]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)
    
x = trn_term_doc
test_x = test_term_doc

In [0]:
def get_mdl(y):
    y = y.values
    y = np.array([int(i) for i in y])
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [74]:
#Create models for each class
label_cols = ['Toxic', 'Information', 'Sports', 'Religious', 'Advertisment']
preds = np.zeros((len(df_test), len(label_cols)))
models = {}
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(df_train[j])
    models[j] = (m, r)
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit Toxic
fit Information
fit Sports
fit Religious
fit Advertisment


In [75]:
models 

{'Advertisment': (LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  matrix([[ 0.77240749, -1.40170333, -0.5812276 , ...,  0.33689385,
            0.54977302,  0.54977302]])),
 'Information': (LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  matrix([[-0.78893625, -1.51967956, -0.83353182, ...,  0.06241355,
            0.55860347,  0.55860347]])),
 'Religious': (LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=Tr

In [76]:
#split into parts for cross validation
accuracy_scores = {}
for label in label_cols:
    accuracy_scores[label] = []

kf = KFold(n_splits=5)
iteration = 1
for train_index, test_index in kf.split(df):
    print("Iteration:", iteration)
    iteration += 1
    for model in models.keys():
        print(model)
        x_train, x_test = df["Tweet"][train_index], df["Tweet"][test_index]
        y_train, y_test = df[model][train_index], df[model][test_index]
        logreg = LogisticRegression(C=12.0)
        # logreg = models[model][0]
        logreg.fit(vec.transform(x_train),y_train)
        y_pred = logreg.predict(vec.transform(x_test))

        #scores
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        precision = precision_score(y_test, y_pred, average='macro')
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores[model].append(accuracy)
        print("\t Recall:", recall)
        print("\t F1 Score:", f1)
        print("\t Precision:", precision)
        print("\t Accuracy:", accuracy)
  

Iteration: 1
Toxic
	 Recall: 0.9776674937965261
	 F1 Score: 0.9856336540250352
	 Precision: 0.9941444372153545
	 Accuracy: 0.9906347554630593
Information
	 Recall: 0.8642245204450816
	 F1 Score: 0.8961346310448217
	 Precision: 0.9409962145006117
	 Accuracy: 0.9396462018730489
Sports
	 Recall: 0.9126517157488676
	 F1 Score: 0.9351967362352069
	 Precision: 0.9630669391695199
	 Accuracy: 0.9604578563995838
Religious
	 Recall: 0.907047779037133
	 F1 Score: 0.9334174646772078
	 Precision: 0.9672754062620407
	 Accuracy: 0.9599375650364204
Advertisment
	 Recall: 0.9023534897634193
	 F1 Score: 0.9264071932341165
	 Precision: 0.956387057628323
	 Accuracy: 0.9573361082206036
Iteration: 2
Toxic
	 Recall: 0.9817415730337078
	 F1 Score: 0.9886342287756072
	 Precision: 0.9958834705509816
	 Accuracy: 0.9932362122788762
Information
	 Recall: 0.8975020896290105
	 F1 Score: 0.9138670692966053
	 Precision: 0.9333283685395621
	 Accuracy: 0.9443288241415192
Sports
	 Recall: 0.9268929367511842
	 F1 Score: 0

In [77]:
#Evaluation
for score in accuracy_scores:
    print(score, ":", np.average(accuracy_scores[score]))

Toxic : 0.9908428720083247
Information : 0.943808532778356
Sports : 0.9616024973985432
Religious : 0.9511966701352756
Advertisment : 0.9569198751300728


# User Testing

Test your own sentences

In [0]:
def test(tweet):
    test_data = vec.transform([tweet])
    json = {}
    #model predicting: 1 is in that category, 0 is not
    for model in models.keys():
        result = models[model][0].predict(test_data.multiply(models[model][1]))
        json[model] = result[0]
    return json

Some sentences have been provided.

In [79]:
test("The Knicks scored 3-1 in the most recent game")

{'Advertisment': 0, 'Information': 0, 'Religious': 0, 'Sports': 1, 'Toxic': 0}

In [80]:
test("Fuck you stupid bitch")

{'Advertisment': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 1}

In [81]:
test("fuck you")

{'Advertisment': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 1}

In [82]:
test("fuck, just chill. Everything is ok")

{'Advertisment': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 1}

In [83]:
test("we can win this game. Just follow me and throw the ball to that gate")

{'Advertisment': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 0}

In [84]:
test("mom, i want to eat ice cream")

{'Advertisment': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 0}

In [85]:
test("Theory says black holes like this shouldn’t get so big.")

{'Advertisment': 0, 'Information': 1, 'Religious': 0, 'Sports': 0, 'Toxic': 0}

In [86]:
test("Being a member of E-AHPBA brings many benefits including access to myHPB educational content covering over 50 key areas")

{'Advertisment': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 0}