### Michael Li (ml5803) and Kaixuan Zhou (kz1005)
### Logistic Regression
#### Text Classification : Toxic, Information, Sports, Religious, and Advertisement	


# Initialization and imports

In [0]:
#Let's ignore the warnings...
# NOTE(review): simplefilter("ignore") silences every warning category for the
# rest of the session, including deprecation and convergence warnings raised by
# the libraries used below.
import warnings
warnings.simplefilter("ignore")

In [0]:
#Authenticate through Google Colab
# authenticate_user() starts the Colab sign-in flow; this cell only works
# inside a Google Colab runtime.
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

# gspread client built from the application-default credentials; `gc` is used
# later to open the spreadsheet that holds the tweets.
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [0]:
#import libraries

import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import re, string

from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Tweet parsing functions

In [0]:
def get_hashtags(text):
    """Pull the hashtags out of ``text``.

    The text is split on '.' and then on whitespace; every resulting word that
    starts with '#' is treated as a hashtag (the word without its leading '#').

    Returns a tuple ``(tags, remaining_text)`` where ``tags`` is the hashtags
    joined by ';' with every character other than ASCII letters removed, and
    ``remaining_text`` is ``text`` with each '#'-prefixed hashtag deleted.
    """
    raw_tags = []
    for sentence in text.split("."):
        for token in sentence.split():
            if token.startswith('#'):
                raw_tags.append(token[1:])

    # Strip the raw hashtag (including any trailing punctuation) from the text.
    remaining = text
    for raw in raw_tags:
        remaining = remaining.replace("#" + raw, "")

    # Keep only the ASCII letters of each tag.
    letters_only = [
        ''.join(ch for ch in raw if ('a' <= ch <= 'z') or ('A' <= ch <= 'Z'))
        for raw in raw_tags
    ]
    return (';'.join(letters_only), remaining)

def get_ats(text):
    """Pull the @-mentions out of ``text``.

    The text is split on '.' and then on whitespace; every resulting word that
    starts with '@' is treated as a mention (the word without its leading '@').

    Returns a tuple ``(mentions, remaining_text)`` where ``mentions`` is the
    mentions joined by ';' with every character other than ASCII letters
    removed, and ``remaining_text`` is ``text`` with each '@'-prefixed mention
    deleted.
    """
    sentences = text.split(".")
    words = [word for sentence in sentences for word in sentence.split()]
    mentions = [word[1:] for word in words if word.startswith('@')]

    # Remove the mention itself. The prefix must be '@' here; using '#'
    # (as in get_hashtags) would leave every mention in the text.
    for mention in mentions:
        text = text.replace("@" + mention, "")

    # Keep only the ASCII letters of each mention.
    results = [
        ''.join(ch for ch in mention if ('a' <= ch <= 'z') or ('A' <= ch <= 'Z'))
        for mention in mentions
    ]
    return (';'.join(results), text)

def get_words(text):
    """Return the alphabetic words of ``text`` joined by single spaces.

    A word is a maximal run of characters for which ``str.isalpha()`` is true;
    every other character acts as a separator and is dropped.

    Unlike the earlier index-based scan, a one-letter word at the very end of
    the text is kept (e.g. ``"I am a"`` -> ``"I am a"``).
    """
    words = []
    current = []  # characters of the word currently being read
    for char in text:
        if char.isalpha():
            current.append(char)
        elif current:
            # A non-letter ends the word in progress.
            words.append(''.join(current))
            current = []

    # Flush a word that runs up to the end of the text.
    if current:
        words.append(''.join(current))
    return " ".join(words)

def get_links(text):
    """Extract http/https URLs from ``text``.

    A URL is ``http://`` or ``https://`` followed by a run of non-whitespace
    characters. A single trailing '.', '!' or '?' is trimmed from each match.

    Returns a tuple ``(urls, remaining_text)`` where ``urls`` is the trimmed
    URLs joined by ';' and ``remaining_text`` is ``text`` with every trimmed
    URL deleted.
    """
    matches = re.findall(r'(https?://\S+)', text)
    # Sentence punctuation glued to the end of a link is not part of the link.
    links = [
        match[:-1] if match.endswith(('.', '!', '?')) else match
        for match in matches
    ]
    remaining = text
    for link in links:
        remaining = remaining.replace(link, '')
    return (';'.join(links), remaining)

In [0]:
def cleaned_text(data):
    """Return only the plain words of a tweet.

    Passes ``data`` through get_ats, get_hashtags and get_links in that order,
    discarding the values they extract and keeping the text each one returns,
    then returns get_words() of what is left.
    """
    _, remaining = get_ats(data)
    _, remaining = get_hashtags(remaining)
    _, remaining = get_links(remaining)
    return get_words(remaining)


# Matches one punctuation character: ASCII punctuation plus a handful of
# typographic symbols. string.punctuation is passed through re.escape because
# it contains characters that are special inside a character class ('\\', ']',
# '^', '-', '['); unescaped, the backslash only escaped the ']' after it and was
# itself never treated as punctuation.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Split ``s`` into tokens, with each punctuation character as its own token.

    Every character matched by ``re_tok`` is padded with spaces and the result
    is split on whitespace.
    """
    return re_tok.sub(r' \1 ', s).split()

# Code

Now that we have the libraries imported and helper functions written, we can start writing code.
<br />
Steps: <br />
1. Load data from Google Sheets
2. Read data into dataframe
3. Convert tweet text into TF-IDF vectors
4. Use the TF-IDF vectors to fit one model per label
5. Cross-validate with K-fold splits

In [6]:
#Load data
# Read every cell of the first worksheet of the 'output_copy' spreadsheet
# through the authorised gspread client `gc`.
worksheet = gc.open('output_copy').sheet1
records = 30000
rows = worksheet.get_all_values()
# The first row holds the column names; the rest are tweets.
header = rows[0]
# NOTE(review): the slice stops before index `records`, so at most
# records - 1 data rows are kept — confirm whether `records` is meant to be a
# row count or an end index.
data = rows[1:records]
print(rows[0])

['Handle', 'TweetID', 'Tweet', 'Toxic', 'Information', 'Sports', 'Religious', 'Advertisement', 'Classification']


In [0]:
# shuffle data
# Shuffles the list `data` in place. No seed is set here, so the row order —
# and therefore the train/test split made from it — differs on every run.
from random import shuffle
shuffle(data)

In [0]:
# uncomment for balanced dataset
# balanced_data = []
# count = [0, 0, 0, 0, 0]
# for tweet in data:
#     count[int(tweet[-1])] += 1

# new_count = [0, 0, 0, 0, 0]
# min_count = min(count)
# unused_data = []
# for tweet in data:
#     if new_count[int(tweet[-1])] < min_count:
#         balanced_data.append(tweet)
#         new_count[int(tweet[-1])] += 1
#     else:
#         unused_data.append(tweet)

# data = balanced_data
# unused_df = pd.DataFrame(unused_data, columns = header)
# print(len(data))

In [9]:
#Read data into dataframe
# Every value comes straight from the sheet via get_all_values(); the label
# columns are converted with int() later, when the models are fitted.

df = pd.DataFrame(data, columns = header)

df

Unnamed: 0,Handle,TweetID,Tweet,Toxic,Information,Sports,Religious,Advertisement,Classification
0,alislam,1186673652462284801,Keeping company with the righteous is another ...,0,0,0,1,0,3
1,Forbes,1200100777903214592,Retailers were thrilled about this. Gimbel Bro...,0,1,0,0,0,1
2,BR_NBA,1198068762710171648,RT @BleacherReport: AD AND BRON ACTIVATE SHOWT...,0,0,1,0,0,2
3,ReviewReligions,1186235191678902274,"Various religions trend in Singapore, includin...",0,0,0,1,0,3
4,HinduismVideos,990444590262976512,Is this the highest threshold of stupidity? 😂 ...,0,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...
21110,Pontifex,1183706515800129538,"The Lord gives each of us a vocation, a challe...",0,0,0,1,0,3
21111,alislam,1182348148687032320,Press Release: Head of #Ahmadiyya Muslim Commu...,0,0,0,1,0,3
21112,marcjacobs,1197281212185939969,Chelsea Werner wearing THE Prom Dress from #TH...,0,0,0,0,1,4
21113,PositivePrayers,1193423103390560256,Are you living the abundant life?\n\nJesus sai...,0,0,0,1,0,3


In [10]:
df.describe()

Unnamed: 0,Handle,TweetID,Tweet,Toxic,Information,Sports,Religious,Advertisement,Classification
count,21115.0,21115.0,21115,21115,21115,21115,21115,21115,21115
unique,99.0,18795.0,20110,2,2,2,2,2,5
top,,,"RT @AllahGreatQuran: Trust Allah, He is the on...",0,0,0,0,0,3
freq,1023.0,1023.0,6,19193,16316,17715,14462,16374,6253


In [11]:
#data split into train and test
# len(data) // 1.5 puts the first two thirds of the (already shuffled) rows in
# the training set and the remaining third in the test set.
train_ind = int(len(data) // 1.5)
df_train = df[:train_ind]
df_test = df[train_ind:]
print(df_train.shape, df_test.shape)

(14076, 9) (7039, 9)


In [0]:
#Convert tweet to TF-IDF vector
# Unigrams and bigrams over the custom punctuation-aware tokenizer; terms seen
# in fewer than 3 tweets or in more than 90% of tweets are dropped.
# The on/off options are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject the integer 1.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True, stop_words='english')
# Learn the vocabulary / IDF weights on the training tweets only, then apply
# the same transformation to the test tweets.
trn_term_doc = vec.fit_transform(df_train["Tweet"])
test_term_doc = vec.transform(df_test["Tweet"])

In [0]:
def pr(y_i, y):
    """Add-one-smoothed per-feature total for the rows labelled ``y_i``.

    Reads the module-level matrix ``x`` (one row per training tweet). The
    column sums of the rows where ``y == y_i`` are incremented by 1 and divided
    by the number of such rows plus 1.

    ``y`` is expected to support element-wise ``==`` and boolean-mask indexing
    of ``x`` (get_mdl passes a NumPy integer array).
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)
    
# Short aliases for the TF-IDF matrices; pr() and get_mdl() read `x` as a
# module-level global.
x = trn_term_doc
test_x = test_term_doc

In [0]:
def get_mdl(y):
    """Fit a logistic regression for one label on ratio-scaled features.

    Parameters
    ----------
    y : pandas Series
        Label column for the training rows; each value must be convertible
        with ``int()`` (1 = in the category, 0 = not).

    Returns
    -------
    (m, r) : tuple
        ``m`` is the fitted LogisticRegression and ``r`` is the per-feature
        log ratio ``log(pr(1, y) / pr(0, y))`` that the training matrix was
        multiplied by. The same ``r`` must be applied to any matrix passed to
        ``m`` for prediction.

    Reads the module-level training matrix ``x``.
    """
    y = np.array([int(label) for label in y.values])
    r = np.log(pr(1, y) / pr(0, y))
    # dual=True is only implemented by the liblinear solver. Name it explicitly
    # so the call keeps working on scikit-learn versions whose default solver
    # is lbfgs (which raises on dual=True).
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [15]:
#Create models for each class
# One independent binary model per label column.
label_cols = ['Toxic', 'Information', 'Sports', 'Religious', 'Advertisement']
# preds[row, col] receives column 1 of predict_proba (the second entry of the
# model's classes_) for test tweet `row` and label `label_cols[col]`.
preds = np.zeros((len(df_test), len(label_cols)))
# label name -> (fitted model, log-ratio vector r used to scale its inputs)
models = {}
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(df_train[j])
    models[j] = (m, r)
    # The test matrix must be scaled by the same r the model was trained with.
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit Toxic
fit Information
fit Sports
fit Religious
fit Advertisement


In [16]:
models 

{'Advertisement': (LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  matrix([[ 1.03980435,  1.06448845, -1.25193752, ...,  1.86744853,
            0.28774837,  0.43056192]])),
 'Information': (LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  matrix([[-0.55518752, -1.57092602, -0.57854971, ...,  0.59271021,
            0.2966774 ,  0.43949095]])),
 'Religious': (LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=T

In [17]:
#split into parts for cross validation
# 5-fold cross-validation of a plain LogisticRegression (C=4.0) on TF-IDF
# features, run separately for every label column. Accuracy per fold is
# collected in accuracy_scores[label].
accuracy_scores = {label: [] for label in label_cols}

kf = KFold(n_splits=5)
for iteration, (train_index, test_index) in enumerate(kf.split(df), start=1):
    print("Iteration:", iteration)

    # KFold yields positional indices, so select rows with .iloc.
    x_train, x_test = df["Tweet"].iloc[train_index], df["Tweet"].iloc[test_index]

    # Vectorize once per fold (the text features do not depend on the label),
    # using a fresh vectorizer with the same settings as `vec`. Fitting a
    # fold-local copy leaves the global `vec` — whose vocabulary the fitted
    # per-label models rely on — untouched.
    fold_vec = TfidfVectorizer(**vec.get_params())
    x_train_doc = fold_vec.fit_transform(x_train)
    x_test_doc = fold_vec.transform(x_test)

    for label in label_cols:
        print(label)
        y_train, y_test = df[label].iloc[train_index], df[label].iloc[test_index]
        logreg = LogisticRegression(C=4.0)

        logreg.fit(x_train_doc, y_train)
        y_pred = logreg.predict(x_test_doc)

        #scores
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        precision = precision_score(y_test, y_pred, average='macro')
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores[label].append(accuracy)
        print("\t Recall:", recall)
        print("\t F1 Score:", f1)
        print("\t Precision:", precision)
        print("\t Accuracy:", accuracy)
  

Iteration: 1
Toxic
	 Recall: 0.9491094147582697
	 F1 Score: 0.9705929459280666
	 Precision: 0.9948320413436693
	 Accuracy: 0.990528060620412
Information
	 Recall: 0.845461810996369
	 F1 Score: 0.8697180994468114
	 Precision: 0.9036507487159879
	 Accuracy: 0.9135685531612597
Sports
	 Recall: 0.8816203125234593
	 F1 Score: 0.9166156580116497
	 Precision: 0.9637614795142619
	 Accuracy: 0.959033862183282
Religious
	 Recall: 0.9211544005857353
	 F1 Score: 0.9331769400884479
	 Precision: 0.9484680527050166
	 Accuracy: 0.9438787591759413
Advertisement
	 Recall: 0.8927706124680551
	 F1 Score: 0.9160422734169269
	 Precision: 0.9460120662186918
	 Accuracy: 0.9460099455363485
Iteration: 2
Toxic
	 Recall: 0.962059620596206
	 F1 Score: 0.9784719693549093
	 Precision: 0.996393611540443
	 Accuracy: 0.9933696424342884
Information
	 Recall: 0.8575502345908917
	 F1 Score: 0.8782605098998173
	 Precision: 0.9055196197555572
	 Accuracy: 0.9183045228510538
Sports
	 Recall: 0.8701226434268774
	 F1 Score: 0.9

In [18]:
#Evaluation
for score in accuracy_scores:
  print(score, ":", np.average(accuracy_scores[score]))

Toxic : 0.991427894861473
Information : 0.918920198910727
Sports : 0.9578025100639355
Religious : 0.9435472412976557
Advertisement : 0.9467203409898175


# User Testing

Test your own sentences

In [0]:
# Re-create and re-fit the vectorizer on the training split so that its
# vocabulary lines up with the `r` vectors stored in `models`.
# The on/off options are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject the integer 1.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True, stop_words='english')
trn_term_doc = vec.fit_transform(df_train["Tweet"])
test_term_doc = vec.transform(df_test["Tweet"])

In [0]:
def test(tweet):
    """Classify one tweet string with every fitted per-label model.

    Uses the module-level ``vec`` to vectorize the tweet and the module-level
    ``models`` mapping (label -> (model, r)). The features are multiplied by
    each model's ``r`` before prediction.

    Returns a dict mapping each label name to that model's prediction for the
    tweet.
    """
    features = vec.transform([tweet])
    predictions = {}
    #model predicting: 1 is in that category, 0 is not
    for label, (classifier, ratio) in models.items():
        predictions[label] = classifier.predict(features.multiply(ratio))[0]
    return predictions

Some sentences have been provided.

> Results may differ between runs: the dataset is shuffled before it is split, so the rows of `output_copy` that end up in the training partition change each time.



In [21]:
test("The Knicks scored 3-1 in the most recent game")

{'Advertisement': 0, 'Information': 0, 'Religious': 0, 'Sports': 1, 'Toxic': 0}

In [22]:
test("Fuck you stupid bitch")

{'Advertisement': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 1}

In [23]:
test("fuck you")

{'Advertisement': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 1}

In [24]:
test("Buy this fabulous scarf. Get it now for only 3 payments of $5.")

{'Advertisement': 1, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 0}

In [25]:
test("Next up, the Mets and their home game!")

{'Advertisement': 0, 'Information': 0, 'Religious': 0, 'Sports': 1, 'Toxic': 0}

In [26]:
test("mom, i want to eat ice cream")

{'Advertisement': 0, 'Information': 0, 'Religious': 0, 'Sports': 0, 'Toxic': 0}

In [27]:
test("Theory says black holes like this shouldn’t get so big.")

{'Advertisement': 0, 'Information': 1, 'Religious': 0, 'Sports': 0, 'Toxic': 0}

In [28]:
test("John 3:16″For God so loved the world that he gave his one and only Son, that whoever believes in him shall not perish but have eternal life.")

{'Advertisement': 0, 'Information': 0, 'Religious': 1, 'Sports': 0, 'Toxic': 0}