**Import libraries**

In [51]:
# Reading files
import os
import json

# Data cleaning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Model util
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# from imblearn.over_sampling import SMOTE

# Modelling
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# Model evaluation
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import classification_report

# nlp pre-processing
import string
import nltk
import re
import unidecode
from scipy.sparse import vstack, hstack
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.tokenize import TweetTokenizer
# Fetch the NLTK data packages by name; per the logged output below, packages that are
# already present are reported as up to date.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to C:\Users\Jun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Jun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Jun
[nltk_data]     Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jun Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Read data

In [52]:
# Load the tweet dataset from an Excel file; the relative path is resolved against the
# current working directory.
df = pd.read_excel('full_clean_df.xlsx')
# Names of the six label columns (they match the 0/1 columns shown by df.head()).
labels_name_list = ['NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']

In [53]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,Tweets,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate,tweets_train,tweets_emoji_train,tweets_nig_train
0,0,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1,0,0,0,0,0,nigga momma youngboy spit real shit nigga,nigga momma youngboy spit real shit nigga,momma youngboy spit real shit
1,1,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1,1,0,0,0,0,xxsugvngxx ran holy nigga today,xxsugvngxx ran holy nigga today loudly_crying_...,xxsugvngxx ran holy today
2,2,“EVERYbody calling you Nigger now!” https://t....,1,1,0,0,0,0,everybody call nigger,everybody call nigger,everybody call nigger
3,3,“ real ass bitch give a fuck boutta nigga” htt...,1,0,0,0,0,0,real bitch give fuck boutta nigga,real bitch give fuck boutta nigga,real bitch give fuck boutta
4,4,@WhiteHouse @realDonaldTrump Fuck ice. White s...,0,1,0,0,0,1,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage


### Data Dictionary <a class="anchor" id="dict"></a>

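|Column Name|Variable Name|Description|
|---|:---:|:---|
|id|id|Unique identifier for each tweet|
|Tweets|Tweet content|Body of tweet|
|Label|classification of label|Multi-label targets stored as six 0/1 columns: NotHate, Racist, Sexist, Homophobe, Religion, OtherHate (a tweet can carry more than one label)|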

## 5. Modelling <a class="anchor" id="model"></a>

### No. of topics: 6

In [25]:
# from sklearn.decomposition import LatentDirichletAllocation

# LDA = LatentDirichletAllocation(n_components=6, random_state=1)
# LDA.fit(dtm)

In [26]:
# for i,topic in enumerate(LDA.components_):
#     print(f'Top 10 words for topic #{i}:')
#     print([vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
#     print('\n')

In [27]:
# topic distribution for LDA
# topic_values_LDA = LDA.transform(dtm)
# df['topics'] = topic_values_LDA.argmax(axis=1)
# df['topics'].value_counts()

In [28]:
# df['topics'].value_counts(normalize=True)

### No. of topics: 7

In [29]:
# LDA2 = LatentDirichletAllocation(n_components=7, random_state=1)
# LDA2.fit(dtm)

In [30]:
# for i,topic in enumerate(LDA2.components_):
#     print(f'Top 10 words for topic #{i}:')
#     print([vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
#     print('\n')

In [31]:
# topic_values_LDA2 = LDA2.transform(dtm)
# df['topics2'] = topic_values_LDA2.argmax(axis=1)
# df['topics2'].value_counts()

In [32]:
# df['topics2'].value_counts(normalize=True)

### No. of topics: 5

In [33]:
# LDA3 = LatentDirichletAllocation(n_components=5, random_state=1)
# LDA3.fit(dtm)

In [34]:
# for i,topic in enumerate(LDA3.components_):
#     print(f'Top 10 words for topic #{i}:')
#     print([vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
#     print('\n')

In [35]:
# topic_values_LDA3 = LDA3.transform(dtm)
# df['topics3'] = topic_values_LDA3.argmax(axis=1)
# df['topics3'].value_counts()

In [36]:
# topic_values_LDA3

In [37]:
# df['topics3'].value_counts(normalize=True)

In [38]:
# df.head(20)

### Consolidated function

In [55]:
# feed cleaned tweets into this function (after initial cleaning & lemmatization & removing stopwords)
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def topic_modelling(df, tweet_column_name = 'clean_tweets', no_of_topics = 6, no_of_features = 10,
                    max_df = 0.5, min_df = 10, seed = 1):
    '''
    Fit an LDA topic model on one text column and tag every row with its dominant topic.

    The column is vectorized with CountVectorizer, a LatentDirichletAllocation model is
    fitted on the resulting document-term matrix, and a summary (share of rows per
    dominant topic, top words per topic) is printed.

    Function input: df: dataframe holding the cleaned tweets; it is modified in place
                        (the text column is cast to str and a 'topics' column is added)
                    tweet_column_name: name of column to be vectorized
                    no_of_topics: n_components for LDA (optional)
                    no_of_features: number of top words printed per topic (optional)
                    max_df: max_df passed to CountVectorizer (optional)
                    min_df: min_df passed to CountVectorizer (optional)
                    seed: random_state for LDA (optional)
    Function output: the same dataframe object with an extra 'topics' column holding the
                     index of the highest-weighted topic for each row

    Side effect: the document-topic matrix returned by LDA.transform is stored on the
                 function attribute topic_modelling.topic_values_LDA.
    '''
    df[tweet_column_name] = df[tweet_column_name].astype(str) # just in case

    # Build the document-term matrix. The caller's max_df / min_df are honoured here
    # (they used to be hard-coded to 0.5 / 10, silently ignoring the arguments).
    vect = CountVectorizer(max_df=max_df, min_df=min_df)
    dtm = vect.fit_transform(df[tweet_column_name])

    # Fit LDA and compute the per-document topic weights.
    LDA = LatentDirichletAllocation(n_components = no_of_topics, random_state = seed)
    LDA.fit(dtm)
    topic_modelling.topic_values_LDA = LDA.transform(dtm)
    dominant_topics = topic_modelling.topic_values_LDA.argmax(axis=1)

    # Share of rows assigned to each dominant topic (fractions that sum to 1).
    topics_dist = pd.Series(dominant_topics, name='topics')
    print('--------------------------------------------------------------------------')
    print('Topics distribution (%)')
    print('--------------------------------------------------------------------------')
    print(topics_dist.value_counts(normalize = True), '\n')

    # Resolve the vocabulary once instead of once per printed word.
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
    # only exists from 1.0 onwards, so support both.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    # Never ask for more words than the vocabulary holds, and treat 0 or a negative
    # value as "no words" (a plain [-0:] slice would return the whole vocabulary).
    n_top = max(0, min(no_of_features, len(feature_names)))

    # Top words per topic, in ascending weight order (strongest word last).
    print('--------------------------------------------------------------------------')
    print(f'Topic features (topics: {no_of_topics}, features: {no_of_features}).')
    print('--------------------------------------------------------------------------')
    for topic_idx, topic in enumerate(LDA.components_):
        top_word_ids = topic.argsort()[len(topic) - n_top:]
        print(f'Top {no_of_features} words for topic #{topic_idx}:')
        print([str(feature_names[word_id]) for word_id in top_word_ids])
        print('\n')

    # Tag every row with its dominant topic (positional assignment, index-agnostic).
    df['topics'] = dominant_topics

    return df

In [56]:
# Fit the topic model on the 'tweets_train' column with all other arguments left at
# their defaults; the returned dataframe carries a new 'topics' column.
df = topic_modelling(df, 'tweets_train')

--------------------------------------------------------------------------
Topics distribution (%)
--------------------------------------------------------------------------
3    0.347732
5    0.173322
2    0.155922
1    0.116704
0    0.116034
4    0.090287
Name: topics, dtype: float64 

--------------------------------------------------------------------------
Topic features (topics: 6, features: 10).
--------------------------------------------------------------------------
Top 10 words for topic #0:
['islam', 'amp', 'sexist', 'follow', 'people', 'would', 'make', 'hillbilly', 'woman', 'cunt']


Top 10 words for topic #1:
['get', 'say', 'kat', 'dick', 'call', 'van', 'nigger', 'mkr', 'faggot', 'dyke']


Top 10 words for topic #2:
['play', 'go', 'call', 'race', 'card', 'trash', 'white', 'fuck', 'cunt', 'retard']


Top 10 words for topic #3:
['ai', 'know', 'like', 'bitch', 'real', 'fuck', 'shit', 'say', 'get', 'nigga']


Top 10 words for topic #4:
['go', 'little', 'never', 'get', 'full',

In [57]:
# df_topic_model = pd.concat([df, topic_modelling.df_topics], axis = 1)
# df_topic_model.head()
# Preview the dataframe to confirm the 'topics' column is present.
df.head()

Unnamed: 0,Tweets,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate,tweets_train,tweets_emoji_train,tweets_nig_train,topics
0,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1,0,0,0,0,0,nigga momma youngboy spit real shit nigga,nigga momma youngboy spit real shit nigga,momma youngboy spit real shit,3
1,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1,1,0,0,0,0,xxsugvngxx ran holy nigga today,xxsugvngxx ran holy nigga today loudly_crying_...,xxsugvngxx ran holy today,3
2,“EVERYbody calling you Nigger now!” https://t....,1,1,0,0,0,0,everybody call nigger,everybody call nigger,everybody call nigger,2
3,“ real ass bitch give a fuck boutta nigga” htt...,1,0,0,0,0,0,real bitch give fuck boutta nigga,real bitch give fuck boutta nigga,real bitch give fuck boutta,3
4,@WhiteHouse @realDonaldTrump Fuck ice. White s...,0,1,0,0,0,1,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,2
