# Imports

In [1]:
import glob
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from collections import defaultdict
import itertools
import numpy as np

import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn import metrics, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import ensemble, model_selection
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud ,STOPWORDS
import seaborn as sns

from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
stop_words_en = stopwords.words('english')
stop_words_es = stopwords.words('spanish')

import spacy
nlpEN = spacy.load('en_core_web_sm')
# nlpES = spacy.load('es_core_news_sm')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Data Loading Functions

In [5]:
"""
Loading the data   --- these functions are taken and modified from ashraf2019
"""


def iter_docs(author):
    author_attr = author.attrib
    doc_dict = author_attr.copy()
    doc_dict['text'] = [' '.join([doc.text for doc in author.iter('document')])]
    return doc_dict


def create_data_frame(input_folder):
    os.chdir(input_folder)
    all_xml_files = glob.glob("*.xml")
    truth_data = pd.read_csv('truth.txt', sep=':::', names=['author_id', 'author'], engine="python")
    temp_list_of_DataFrames = []
    text_Data = pd.DataFrame()
    for file in all_xml_files:
        etree = ET.parse(file)  # create an ElementTree object
        doc_df = pd.DataFrame(iter_docs(etree.getroot()))
        doc_df['author_id'] = file[:-4]
        temp_list_of_DataFrames.append(doc_df)
    text_Data = pd.concat(temp_list_of_DataFrames, axis=0)

    data = text_Data.merge(truth_data, on='author_id')
    return data



    

In [6]:
# English Training Dataset
en_data = create_data_frame("C:/Users/VivAndMourhaf/PycharmProjects/HelloWorld/data/pan21/en")
print("English data size", len(en_data))

English data size 200


In [50]:
# X = en_data['text']


In [51]:
# not needed, use class?
# y = en_data['author']


In [7]:
%%time
# loading bad words
file = open('D:/data/bad_words.txt', encoding='utf-8')
bad_words_array=[]
for line in file:
    values = line.split("\r")
    word = values[0].strip()
    bad_words_array.append(word)
file.close()

file = open('D:/data/hate_words.txt', encoding='utf-8')
hate_words_array=[]
for line in file:
    values = line.split("\r")
    word = values[0].strip()
    hate_words_array.append(word)
file.close()

Wall time: 998 µs


In [6]:
# hate_words_array

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
def preprocess_(data):
    corpus = []
    for tweets in data:
        tweets_lowered = tweets.lower()

        # Further tweet sanitation
#         tweet = re.sub(r'\s+[a-z]\s+', ' ', tweets_lowered) # remove single characters like i and a
#         tweet = re.sub(r'^[a-z]\s+', ' ', tweet)  # remove single characters at the beginning like i and a
#         tweet = re.sub(r'\srt\s+', '', tweet)  # remove extra spaces
#         tweet = re.sub(r'#user#', ' #user# ', tweet)  # remove extra spaces
#         tweet = re.sub(r'#url#', 'url', tweets_lowered)  # remove extra spaces
#         tweet = re.sub('[^A-Za-z0-9]+', '', tweet)  # remove extra spaces
#         tweet = re.sub(r'\s+', ' ', tweet)  # remove extra spaces
#         tokenizedTweet = nlpEN(tweets_lowered)
        #tweets_tokenized = word_tokenize(tweet)
#         tokenizedTweet = ["NORP" if ent.label_ == "PERSON " else "GEO" if ent.label_ == "ORG " for ent in tokenizedTweet.ents]
#         pprint([(X.text, X.label_) for X in doc.ents])
        #tweets_tokenized = word_tokenize(tweet)
#         spacy_process(tokenizedTweet)
        tweets_tokenized = word_tokenize(str(tweets_lowered))
        
#         tweets_no_stopwords = [w for w in tweets_tokenized if w not in stop_words_en]
#         tweets_no_special_char = [w for w in tweets_no_stopwords if w.isalnum()]
        tweets_no_bads = ["BAD_WORD" if w in bad_words_array else w for w in tweets_tokenized]
        tweets_corpus = ["HATE_WORD" if w in hate_words_array else w for w in tweets_no_bads]

        #tweets_tokenized = word_tokenize(tweet)
        #tweets_tokenized = word_tokenize(tweets_lowered)
        #tweets_corpus = [w for w in tweets_tokenized if w not in stop_words_en]
        # lemmatize words
#         tokenizedTweet = nlpEN(tweet)
        # Sentiment analyzer
#         analyser = SentimentIntensityAnalyzer()
#         score = analyser.polarity_scores(tweet)
#         # Convert dictionary into string
#         score = str(score) 
#         processedTweet = []
#         for l in tweet:
#             #processedTweet.append(f"{l.lemma_}")
# #             processedTweet.append(f"{l.lemma_}({l.pos_})")
#         processedTweet.append(score)
        corpus.append(' '.join(tweets_corpus))
    return corpus

# def replace_person(token):
#     if token.ent_iob != 0 and token.ent_type_ == 'PERSON':
#         print(token.text)
#         return 'PERSON_TAG'
#     return token.text 


# def replace_geo(token):
#     if token.ent_iob != 0 and token.ent_type_ == 'ORG':
#         return 'GEO_TAG'
#     return token.text 

# def spacy_process(doc):
#     with doc.retokenize() as retokenizer:
#         for ent in doc.ents:
#             retokenizer.merge(ent)
#     tokens = map(replace_person, doc)
#     return ''.join(tokens)

In [9]:
# # create a dummy dataframe
# training_data = pd.DataFrame()
# # load the preprocessed text to it otherwise en_data['text'] stays the same
# training_data['preprocessed_text'] = preprocess_(en_data['text'])


In [10]:
# list(training_data['preprocessed_text'][0:3])

In [10]:
# proc_train_split = preprocess_(X_train)
# proc_test_split = preprocess_(X_test)


In [8]:
# view the data after cleaning
# # proc_train_split[0:5]
# en_data[0:18]

In [12]:
# run vsm on word ngrams
# vectorizer_en = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(3,5), max_features=1000, stop_words=stop_words_en)
# vectorized_training = vectorizer_en.fit_transform(proc_train_split).toarray()
# vectorized_testing = vectorizer_en.transform(proc_test_split).toarray()
# svr = SVC(kernel='linear', C=1000)
# svr.fit(vectorized_training, y_train)
# print("SVM:" , accuracy_score(svr.predict(vectorized_testing),y_test))

SVM: 0.55


In [13]:
# # run vsm on char ngrams
# vectorizer_en = TfidfVectorizer(analyzer='char', ngram_range=(2,5))#char2-5 
# vectorized_training = vectorizer_en.fit_transform(proc_train_split).toarray()
# vectorized_testing = vectorizer_en.transform(proc_test_split).toarray()
# svr = SVC(kernel='linear', C=1000)
# svr.fit(vectorized_training, y_train)
# print("SVM:" , accuracy_score(svr.predict(vectorized_testing),y_test))


SVM: 0.775


In [11]:
# lists from https://unicode.org/emoji/charts/emoji-list.html
def face_neutral_skeptical(text):
    return len([c for c in text if c in '🤐🤨😐😑😶😒🙄😬🤥🧐'])

def face_concerned(text):
    return len([c for c in text if c in '😕😟🙁☹😮😯😲😳🥺😦😧😨😰😥😢😭😱😖😣😞😓😩😫🥱🙀😿'])

def face_negative(text):
    return len([c for c in text if c in '😤😡😠🤬😈👿💀☠😾'])

def face_costume(text):
    return len([c for c in text if c in '💩🤡👹👺👻👽👾🤖'])

def body_parts(text):
    return len([c for c in text if c in '👋🤚🖐✋🖖👌🤌🤏✌🤞🤟🤘🤙👈👉👆🖕👇☝👍👎✊👊🤛🤜👏🙌👐🤲🤝🙏💪👀👅👄'])

def people(text):
    return len([c for c in text if c in '👶🧒👦👧🧑👱👨🧔👩🧓👴👵🙍🙍‍♂️🙎🙅🙆💁🙋🧏🙇🤦🤦‍♂️🤷🤷‍♂️'])

def bad_words(text):
    return len([c for c in text.split() if c == "BAD_WORD"])

def hate_words(text):
    return len([c for c in text.split() if c == "HATE_WORD"])

def user_count(text):
    return len([c for c in text.split() if c == "user"])

def rt_counts(text):
    return len([c for c in text.split() if c == "rt"])

def url_counts(text):
    return len([c for c in text.split() if c == "url"])

def person_counts(text):
    return len([c for c in text.split() if c == "NORP"])

def geo_counts(text):
    return len([c for c in text.split() if c == "GEO"])

In [12]:
def preprocess(data):
    data['face_neutral_skeptical'] = data['preprocessed_text'].apply(face_neutral_skeptical)
    data['face_concerned'] = data['preprocessed_text'].apply(face_concerned)
    data['face_negative'] = data['preprocessed_text'].apply(face_negative)
    data['face_costume'] = data['preprocessed_text'].apply(face_costume)
    data['body_parts'] = data['preprocessed_text'].apply(body_parts)
    data['people'] = data['preprocessed_text'].apply(people)
    data['person_counts'] = data['preprocessed_text'].apply(person_counts)
    data['geo_counts'] = data['preprocessed_text'].apply(geo_counts)
    data['bad_words'] = data['preprocessed_text'].apply(bad_words)
    data['hate_words'] = data['preprocessed_text'].apply(hate_words)
    data['user_count'] = data['preprocessed_text'].apply(user_count)
    data['rt_counts'] = data['preprocessed_text'].apply(rt_counts)
    data['url_counts'] = data['preprocessed_text'].apply(url_counts)


In [13]:
# create a dummy dataframe
training_data = pd.DataFrame()
# load the preprocessed text to it otherwise en_data['text'] stays the same
training_data['preprocessed_text'] = preprocess_(en_data['text'])
# siena's counters function
preprocess(training_data)

In [14]:
# Example of preprocessed data
list(training_data['preprocessed_text'][0:3])

["BAD_WORD new york # url # # user # # user # i think i 'm in love trump is awesome # url # # user # you have the greatest tweets sweetheart # user # # user # it 's free pizza hun , just free food # user # love you joy can i help cnn , i can piss on ya # url # she is a wet potato sack # url # # user # long gone darlin i like them too # url # rt # user # : retweet if you think michael moore is a worthless loser ! # url # rt # user # : did cnn just re-unite the alt-right & amp ; the new right in a common cause ? 🤔 # url # oh , a willnot # url # rt # user # : hey # user # ! just saw a 12 year old with a video camera and a trump t-shirt , might want to pay this clown a visit # user # is so low , he can crawl under a snake and not alert it , better yet , he 's a goof # url # let me get that for ya honey # url # i 'll eat what the BAD_WORD i want # url # # user # you mean this BAD_WORD , yep you nailed it , he is an BAD_WORD # url # wow # user # at those jeans # url # what usa looks like tod

In [15]:
# how the data now looks
training_data

Unnamed: 0,preprocessed_text,face_neutral_skeptical,face_concerned,face_negative,face_costume,body_parts,people,person_counts,geo_counts,bad_words,hate_words,user_count,rt_counts,url_counts
0,BAD_WORD new york # url # # user # # user # i ...,0,2,1,1,12,3,0,0,33,1,197,59,131
1,romanian graftbuster ’ s firing violated right...,0,0,0,0,0,0,0,0,2,1,58,2,205
2,`` hey jamal ( snickering uncontrollable ) you...,2,2,1,0,3,4,0,0,3,3,94,6,69
3,that BAD_WORD still fried to me the homie let ...,1,6,2,0,9,22,0,0,117,31,59,28,20
4,poc love talking about police brutality but no...,0,0,7,0,8,1,0,0,50,78,15,6,121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,me and kayleigh just waking up lol we finna be...,3,38,0,0,4,32,0,0,57,8,168,155,14
196,i had BAD_WORD going on 2020 summer was fun no...,1,2,0,0,15,36,0,0,34,6,48,45,7
197,rt # user # : contact your legislators and let...,0,1,0,0,0,2,0,0,5,0,140,93,168
198,"rt # user # : i 'm sorry but , what ? # url # ...",0,2,0,0,0,1,0,0,11,9,339,94,93


In [16]:
# how the data used to look
en_data

Unnamed: 0,lang,class,text,author_id,author
0,en,1,Fuck New York #URL# #USER# #USER# I think I'm ...,043e2766cc6d22ae4e447ca5f2885a2a,1
1,en,0,"Romanian graftbuster’s firing violated rights,...",06893abba0bb8f94fed7562350233ed7,0
2,en,1,"""Hey Jamal (snickering uncontrollable) You wan...",0a3ce42bea89e2a92a28f685735e605e,1
3,en,1,That shit still fried to me the homie let one ...,0a6700c6023c6249bcc5820e2f5ee0de,1
4,en,1,POC love talking about police brutality but no...,0d02a3f644c9313315ecc6655ccfa3b9,1
...,...,...,...,...,...
195,en,1,me and kayleigh just waking up lol we finna be...,f91fa8ecdd2440eb163516769573f24a,1
196,en,0,I had shit going on 2020 summer was fun no lie...,fdb47a3f65091b9a5b989e1722c9fac4,0
197,en,1,RT #USER#: Contact your legislators and let th...,fdb9f16899e3097e6db1f6a13d3572f8,1
198,en,0,"RT #USER#: I'm sorry but, what? #URL# #URL# RT...",fdef657f264ca50bc7b21574b24f82ab,0


In [17]:
# the features is everying in training data minus the first column
features = training_data.drop(['preprocessed_text'], axis=1)
features

Unnamed: 0,face_neutral_skeptical,face_concerned,face_negative,face_costume,body_parts,people,person_counts,geo_counts,bad_words,hate_words,user_count,rt_counts,url_counts
0,0,2,1,1,12,3,0,0,33,1,197,59,131
1,0,0,0,0,0,0,0,0,2,1,58,2,205
2,2,2,1,0,3,4,0,0,3,3,94,6,69
3,1,6,2,0,9,22,0,0,117,31,59,28,20
4,0,0,7,0,8,1,0,0,50,78,15,6,121
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,3,38,0,0,4,32,0,0,57,8,168,155,14
196,1,2,0,0,15,36,0,0,34,6,48,45,7
197,0,1,0,0,0,2,0,0,5,0,140,93,168
198,0,2,0,0,0,1,0,0,11,9,339,94,93


In [18]:
X_train, X_test, y_train, y_test = train_test_split(features, en_data['author'], test_size=0.2)

In [None]:

# vectorizer_en = TfidfVectorizer(analyzer='char', ngram_range=(2,5))#char2-5 
# vect_x_train = vectorizer_en.fit_transform(X_train).toarray()
# vect_x_test = vectorizer_en.transform(X_test).toarray()

svr = SVC(kernel='linear', C=1000)
svr.fit(X_train, y_train)
print("SVM:" , accuracy_score(svr.predict(X_test),y_test))


In [None]:

en_ML_model = SVC(C=1000, kernel='linear')
en_ML_model.fit(X_train,y_train)
print("SVM:" , accuracy_score(en_ML_model.predict(X_test),y_test))
print("-----------------------------------------")

en_ML_model = LogisticRegression()
en_ML_model.fit(X_train,y_train)
print("LR:" , accuracy_score(en_ML_model.predict(X_test),y_test))
print("-----------------------------------------")

en_ML_model = ensemble.RandomForestClassifier()
en_ML_model.fit(X_train,y_train)
print("Random forest:" , accuracy_score(en_ML_model.predict(X_test),y_test))
print("-----------------------------------------")

en_ML_model = DecisionTreeClassifier()
en_ML_model.fit(X_train,y_train)
print("Decision Tree:" , accuracy_score(en_ML_model.predict(X_test),y_test))


# below this line is old code - do not run ----------------------------------------------------------

In [146]:
# hypperparameter tuning section - too expensive to run
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20,50,100,200,500,1000],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': ensemble.RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10, 50, 100, 200, 400, 500, 700]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10, 50, 100,500]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
        }
    }    
}

In [154]:
# hypperparameter tuning section - too expensive to run
evaluation_en = []
for model_name, mp in model_params.items():
        clf = GridSearchCV(mp['model'], mp['params'], cv= 10, return_train_score=False )
        clf.fit(X_train,y_train)
        evaluation_en.append({
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        })
df = pd.DataFrame(evaluation_en, columns=['model', 'best_score', 'best_params'])
print('Best English parameters')
print(df)

KeyboardInterrupt: 

In [None]:
# English Training Dataset
en_training_div_data, en_testing_div_data = train_test_split(create_data_frame(
                                            r"C:/Users/VivAndMourhaf/PycharmProjects/HelloWorld/data/pan21/en"), train_size=0.70)
print("English training data split size", len(en_training_div_data))
print("English testing data split size", len(en_testing_div_data))
en_training_div_data


In [None]:
# Spanish Training Dataset
es_training_div_data, es_testing_div_data = train_test_split(create_data_frame(
                                            r"C:\Users\loren\Dropbox\Estudios\Universidad\Uni Tübingen\ISCL\Projects\PAN\Data\es"), train_size=0.70)
print("Spanish training data split size", len(es_training_div_data))
print("Spanish testing data split size", len(es_testing_div_data))
es_training_div_data

### Dataset Exploration

In [None]:
# Distribution
# Data distribution is exactly even
print('English distribution')
print(sns.countplot(en_training_div_data.author))

In [None]:
print('Spanish distribution')
print(sns.countplot(es_training_div_data.author))

In [None]:
# Tweet length distribution
# Observation: most tweet bodies are between 8000 and 20000 chars
# Spanish (orange) tweets tend to be slightly longer
en_training_div_data['tweet_length'] = en_training_div_data['text'].str.len()
sns.distplot(en_training_div_data['tweet_length']).set_title('Tweet length distribution')
es_training_div_data['tweet_length'] = es_training_div_data['text'].str.len()
sns.distplot(es_training_div_data['tweet_length']).set_title('Tweet length distribution')

### Wordcloud visualizations

In [None]:
def create_wordcloud(words):
    wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110, collocations=False).generate(words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

In [None]:
print('English non hate spreaders common words:')
subset=en_training_div_data[en_training_div_data.author==0]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)

print('English hate spreaders common words:')
subset=en_training_div_data[en_training_div_data.author==1]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)

In [None]:
print('Spanish non hate spreaders common words:')
subset=es_training_div_data[es_training_div_data.author==0]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)

print('Spanish hate spreaders common words:')
subset=es_training_div_data[es_training_div_data.author==1]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)


# Step 2: Preprocessing

### Corpus functions for English and Spanish

In [None]:
# gensim only accepts tokenized lists
# so this preprocesses and coverts tweet-body strings into tokens
def build_tokenized_corpus_en(data):
    corpus = []
    for tweets in data:
        tweets_lowered = tweets.lower()

        # Further tweet sanitation
        tweet = re.sub(r'\s+[a-z]\s+', ' ', tweets_lowered) # remove single characters like i and a
        tweet = re.sub(r'^[a-z]\s+', ' ', tweet)  # remove single characters at the beginning like i and a
        tweet = re.sub(r'\s+', ' ', tweet)  # remove extra spaces

        #tweets_tokenized = word_tokenize(tweet)
        #tweets_tokenized = word_tokenize(tweets_lowered)
        #tweets_corpus = [w for w in tweets_tokenized if w not in stop_words_en]
        # lemmatize words
        tokenizedTweet = nlpEN(tweet)
        # Sentiment analyzer
        analyser = SentimentIntensityAnalyzer()
        score = analyser.polarity_scores(tweet)
        # Convert dictionary into string
        score = str(score) 
        processedTweet = []
        for l in tokenizedTweet:
            #processedTweet.append(f"{l.lemma_}")
            processedTweet.append(f"{l.lemma_}({l.pos_})")
        processedTweet.append(score)
        corpus.append(processedTweet)
    #return corpus
        #corpus.append(tweets_corpus)

    return corpus
    #tweets_lowered = [tweets.lower() for tweets in data]
    #tweets_tokenized = [word_tokenize(tweets) for tweets in tweets_lowered]
    #tweets_corpus = [w for w in tweets_tokenized if w not in stop_words_en]


    #return tweet


In [None]:
# Build Spanish corpus
def build_tokenized_corpus_es(data):
    corpus = []
    for tweets in data:
        tweets_lowered = tweets.lower()

        # Further tweet sanitation
        tweet = re.sub(r'\s+[a-z]\s+', ' ', tweets_lowered) # remove single characters like i and a
        tweet = re.sub(r'^[a-z]\s+', ' ', tweet)  # remove single characters at the beginning like i and a
        tweet = re.sub(r'\s+', ' ', tweet)  # remove extra spaces

        #tweets_tokenized = word_tokenize(tweet)
        #tweets_tokenized = word_tokenize(tweets_lowered)
        #tweets_corpus = [w for w in tweets_tokenized if w not in stop_words_es]

        #corpus.append(tweets_corpus)

        # lemmatize words
        tokenizedTweet = nlpES(tweet)
        # Sentiment analyzer
        analyser = SentimentIntensityAnalyzer()
        score = analyser.polarity_scores(tweet)
        # Convert dictionary into string
        score = str(score) 
        processedTweet = []
        for l in tokenizedTweet:
           # processedTweet.append(f"{l.lemma_}")
            processedTweet.append(f"{l.lemma_}({l.pos_})")
        processedTweet.append(score)
        corpus.append(processedTweet)
    return corpus

### Build the English and Spanish corpora

In [None]:
# English 
processed_train_split_corpus_en = en_training_div_data
processed_train_split_corpus_en['text'] = build_tokenized_corpus_en(en_training_div_data['text'])
processed_test_split_corpus_en = en_testing_div_data
processed_test_split_corpus_en['text'] = build_tokenized_corpus_en(en_testing_div_data['text'])


In [None]:
# Spanish bots
processed_train_split_corpus_es = es_training_div_data
processed_train_split_corpus_es['text'] = build_tokenized_corpus_es(es_training_div_data['text'])
processed_test_split_corpus_es = es_testing_div_data
processed_test_split_corpus_es['text'] = build_tokenized_corpus_es(es_testing_div_data['text'])

### Find the most common words using TF-IDF gensim

In [None]:
def find_most_common_en(processed_train_split_corpus):
    # Create a gensim dictionary
    dictionary = Dictionary(processed_train_split_corpus)
    # Turn the tweets (docs) into BOW
    corpus = [dictionary.doc2bow(doc) for doc in processed_train_split_corpus]
    unwanted_most_common_words = {}
    # Create the defaultdict: total_word_count
    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(corpus):
        total_word_count[word_id] += word_count

    # Create a sorted list from the defaultdict: sorted_word_count 
    sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 

    # Find the top 100 words across all documents alongside the count
    for word_id, word_count in sorted_word_count[:100]:
        unwanted_most_common_words[dictionary.get(word_id)] = word_count

    # We need to keep track of the special tags we created in the preprocessing
    # so we just popped them from the 100 list
    #unwanted_most_common_words.pop("URL")
    #unwanted_most_common_words.pop("MENTION")
    #unwanted_most_common_words.pop("HASHTAG")
    #unwanted_most_common_words.pop("QUOTE")
    #unwanted_most_common_words.pop("rt")
    
    return list(unwanted_most_common_words.keys())

In [None]:
def find_most_common_es(processed_train_split_corpus):
    # Create a gensim dictionary
    dictionary = Dictionary(processed_train_split_corpus)
    # Turn the tweets (docs) into BOW
    corpus = [dictionary.doc2bow(doc) for doc in processed_train_split_corpus]
    unwanted_most_common_words = {}
    # Create the defaultdict: total_word_count
    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(corpus):
        total_word_count[word_id] += word_count

    # Create a sorted list from the defaultdict: sorted_word_count 
    sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 

    # For Spanish, using only the top 100 words worked best
    for word_id, word_count in sorted_word_count[:100]:
        unwanted_most_common_words[dictionary.get(word_id)] = word_count
        
    #unwanted_most_common_words.pop("URL(PROPN)")
    #unwanted_most_common_words.pop("MENTION(PROPN)")
    #unwanted_most_common_words.pop("HASHTAG(PROPN)")
    #unwanted_most_common_words.pop("QUOTE(PROPN)")
    #unwanted_most_common_words.pop("rt(CCONJ)")
    
    return list(unwanted_most_common_words.keys())

In [None]:
# English
unwanted_en = find_most_common_en(processed_train_split_corpus_en)

In [None]:
# Spanish
unwanted_es = find_most_common_es(processed_train_split_corpus_es)

### Convert the lists of tokens back to tweet_bodies while deleting all unwanted tokens

In [None]:
def combine_tokens_into_tweets(data):
    #corpus = []
    for i in range(0, len(data)):
        #tweet = data[i]
        #tweet = ' '.join([w for w in tweet])
        tweet = [' '.join(t) for t in data]
     #   corpus.append(tweet)
    return tweet

def remove_words_from_tweets(data, unwanted):
    corpus = []
    for i in range(0, len(data)):
        tweet = data[i].split()
        tweet = ' '.join([w for w in tweet if w not in unwanted])
        corpus.append(tweet)
    return corpus

In [None]:
# English
combined_training_en = processed_train_split_corpus_en
combined_training_en['text'] = combine_tokens_into_tweets(processed_train_split_corpus_en['text'])
combined_testing_en = processed_test_split_corpus_en
combined_testing_en['text'] = combine_tokens_into_tweets(processed_test_split_corpus_en['text'])
#processed_training_corpus_en = remove_words_from_tweets(combined_training_en, unwanted_en)
#processed_testing_corpus_en = remove_words_from_tweets(combined_testing_en, unwanted_en)
processed_training_corpus_en = combined_training_en
processed_testing_corpus_en = combined_testing_en

In [None]:
# Spanish
combined_training_es = processed_train_split_corpus_es
combined_training_es['text'] = combine_tokens_into_tweets(processed_train_split_corpus_es['text'])
combined_testing_es = processed_test_split_corpus_es
combined_testing_es['text'] = combine_tokens_into_tweets(processed_test_split_corpus_es['text'])
#processed_training_corpus_es = remove_words_from_tweets(combined_training_es, unwanted_es)
#processed_testing_corpus_es = remove_words_from_tweets(combined_testing_es, unwanted_es)
processed_training_corpus_es = combined_training_es
processed_testing_corpus_es = combined_testing_es

### Setting labels

In [None]:
#English
training_corpus_labels_en = en_training_div_data['author']
testing_corpus_labels_en = en_testing_div_data['author']

In [None]:
#Spanish
training_corpus_labels_es = es_training_div_data['author']
testing_corpus_labels_es = es_testing_div_data['author']

### Wordclouds after preprocessing the data

In [None]:
print('English non hate spreaders common words:')
subset=processed_training_corpus_en[processed_training_corpus_en.author==0]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)

print('English hate spreaders common words:')
subset=processed_training_corpus_en[processed_training_corpus_en.author==1]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)


In [None]:
print('Spanish non hate spreaders common words:')
subset=processed_training_corpus_es[processed_training_corpus_es.author==0]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)

print('Spanish hate spreaders common words:')
subset=processed_training_corpus_es[processed_training_corpus_es.author==1]
text=subset.text.values
words =" ".join(text)
create_wordcloud(words)

### Vectorize the tweets

In [None]:
# English
vectorizer_en = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(3,5), max_features=500, stop_words=stop_words_en)
vectorized_en_training_div_tweets = vectorizer_en.fit_transform(processed_training_corpus_en['text']).toarray()
vectorized_en_testing_div_tweets = vectorizer_en.transform(processed_testing_corpus_en['text']).toarray()

In [None]:
# Spanish
vectorizer_es = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(3,5), max_features=500, stop_words=stop_words_es)
vectorized_es_training_div_tweets = vectorizer_es.fit_transform(processed_training_corpus_es['text']).toarray()
vectorized_es_testing_div_tweets = vectorizer_es.transform(processed_testing_corpus_es['text']).toarray()

# Step 3: ML Hyperparameter validation testing

### Baseline classifier testing (no hyperparameters)

In [None]:
# English
print('English baselines')
en_ML_model = SVC()
en_ML_model.fit(vectorized_en_training_div_tweets, training_corpus_labels_en)
print("SVM:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))


en_ML_model = ensemble.RandomForestClassifier()
en_ML_model.fit(vectorized_en_training_div_tweets, training_corpus_labels_en)
print("Random forest:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))

en_ML_model = DecisionTreeClassifier()
en_ML_model.fit(vectorized_en_training_div_tweets, training_corpus_labels_en)
print("Decision Tree:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))


In [None]:
# Spanish
print('Spanish baselines')
es_ML_model = SVC(C=20, kernel='linear')
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("SVM:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets), testing_corpus_labels_es))

es_ML_model = ensemble.RandomForestClassifier(n_estimators= 200)
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("Random forest:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets), testing_corpus_labels_es))

es_ML_model = DecisionTreeClassifier(criterion='gini')
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("Decision Tree:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets), testing_corpus_labels_es))


### Make a parameter dictionary to apply gridsearchCV on to find the best model for the task

In [None]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': ensemble.RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10, 50, 100, 200, 400, 500, 700]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10, 50, 100]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
        }
    }    
}

### Perform gridsearchCV on the parameter dictionary

In [None]:
# English
evaluation_en = []
for model_name, mp in model_params.items():
        clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False )
        clf.fit(vectorized_en_training_div_tweets,training_corpus_labels_en)
        evaluation_en.append({
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        })
df = pd.DataFrame(evaluation_en, columns=['model', 'best_score', 'best_params'])
print('Best English parameters')
print(df)


In [None]:
# Spanish
evaluation_es = []
for model_name, mp in model_params.items():
        clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False )
        clf.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
        evaluation_es.append({
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        })
df = pd.DataFrame(evaluation_es, columns=['model', 'best_score', 'best_params'])
print('Best Spanish parameters')
print(df)

### Test with the best parameters (must change parameters based on gridsearch results manually)

In [None]:
# English 
print ('English validation set accuracies')

en_ML_model = SVC(C=1, kernel='linear')
en_ML_model.fit(vectorized_en_training_div_tweets,training_corpus_labels_en)
print("SVM:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
# print(classification_report(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
print("-----------------------------------------")


en_ML_model = ensemble.RandomForestClassifier(n_estimators=10)
en_ML_model.fit(vectorized_en_training_div_tweets,training_corpus_labels_en)
print("Random forest:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
# print(classification_report(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
print("-----------------------------------------")

en_ML_model = GaussianNB()
en_ML_model.fit(vectorized_en_training_div_tweets,training_corpus_labels_en)
print("Naive Bayes G:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
# print(classification_report(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
print("-----------------------------------------")


en_ML_model = MultinomialNB()
en_ML_model.fit(vectorized_en_training_div_tweets,training_corpus_labels_en)
print("Naive Bayes MN:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
# print(classification_report(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
print("-----------------------------------------")


en_ML_model = DecisionTreeClassifier(criterion='gini')
en_ML_model.fit(vectorized_en_training_div_tweets,training_corpus_labels_en)
print("Decision Tree:" , accuracy_score(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))
# print(classification_report(en_ML_model.predict(vectorized_en_testing_div_tweets),testing_corpus_labels_en))

In [None]:
# Spanish 
print('Spanish bot task validation set accuracies')

es_ML_model = SVC(C=1, kernel='linear')
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("SVM:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
# print(classification_report(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
print("-----------------------------------------")


es_ML_model = ensemble.RandomForestClassifier(n_estimators=400)
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("Random forest:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
# print(classification_report(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
print("-----------------------------------------")

es_ML_model = GaussianNB()
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("Naive Bayes G:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
# print(classification_report(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
print("-----------------------------------------")


es_ML_model = MultinomialNB()
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("Naive Bayes MN:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
# print(classification_report(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
print("-----------------------------------------")


es_ML_model = DecisionTreeClassifier(criterion='entropy')
es_ML_model.fit(vectorized_es_training_div_tweets,training_corpus_labels_es)
print("Decision Tree:" , accuracy_score(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))
# print(classification_report(es_ML_model.predict(vectorized_es_testing_div_tweets),testing_corpus_labels_es))

In [None]:
#To do:
    # Try different preprocessing steps
    # Try removing most common words
    # Add list of Hate Speech terms (look for corpus)