## Introduction

### Bahar Radfar - last update: May-2019

### This notebook shows how to train a hostility detection model using both an LSTM and a Logistic Regression classifier. The main purpose is to understand the influence of the relationship between the conversation participants on both the presence and intensity of hostile comments.

In [20]:
import re
import cv2
import json
import math
import nltk
import keras
import string
import logging
import pickle as p
import regex as re
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from operator import itemgetter
from collections import Counter
from IPython import get_ipython
from subprocess import check_output
from string import punctuation, ascii_lowercase



from numpy import random
from numpy import array
from numpy import asarray
from numpy import zeros


from emoji_function import demojize
from nltk.corpus import stopwords



import keras
from keras.models import Model
from keras import backend as K
from keras.datasets import imdb
from keras.utils import np_utils
from keras.layers import Flatten
from keras.optimizers import Adam
from keras.layers import Embedding
from keras.models import Sequential
from keras.callbacks import TensorBoard
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import CuDNNLSTM, Bidirectional, LSTM
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from keras.layers import Input, Dense, Activation, Dropout,SpatialDropout1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from sklearn.svm import SVC
from sklearn.metrics import auc
from sklearn.manifold import TSNE
import sklearn.metrics as metrics
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score
import sklearn.preprocessing as preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import sklearn.feature_extraction as feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score




import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer


## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)


from myUtility import dataPrepration, cleanText, replaceThreeOrMore, preprocess, auroc

Using TensorFlow backend.


### The Deep learning model with LSTM layers

In [68]:
class HostilityDetectionModel:
    """
        The hostility detection LSTM model.

        Two-input Keras network: the padded tweet tokens go through a frozen
        embedding layer and two stacked bidirectional CuDNNLSTM layers, the
        result is concatenated with a feature vector and classified by a
        dense softmax head with two outputs.

        Params:
          args....A dictionary of configuration values. Required keys:
                  'lstm_cells', 'opt', 'batch_size', 'epochs', 'loss',
                  'comment_size', 'feature_size', 'vocab_size',
                  'embedding_matrix', 'save_path', 'tensorBoard_directory'

        The network is built and compiled as soon as the object is created.
    """
    def __init__(self, args):
        self.lstm_cells = args['lstm_cells'] #265
        self.opt = args['opt'] #keras.optimizers.Adam(lr=1e-3, decay=0.1)
        self.batch_size = args['batch_size']
        self.epochs = args['epochs']
        self.loss = args['loss'] #categorical_crossentropy
        self.comment_size = args['comment_size']  #(65)
        self.feature_size = args['feature_size'] #dim
        self.vocab_size = args['vocab_size']
        self.embedding_matrix = args['embedding_matrix']
        self.save_path = args['save_path']
        self.tensorBoard_directory = args["tensorBoard_directory"]

        self.build_model()

    def build_model(self):
        """
            Build and compile the network; the result is stored in self.model.
        """
        comment_input = Input(shape=(self.comment_size,), name="tweet_content")
        features_input = Input(shape=(self.feature_size,), name="features")

        # Take the embedding width from the supplied matrix instead of
        # hard-coding 200, so any pre-trained vector size can be plugged in.
        embedding_dim = np.asarray(self.embedding_matrix).shape[1]
        wv_layer = Embedding(self.vocab_size, embedding_dim, weights=[self.embedding_matrix],
                             input_length=self.comment_size, trainable=False)
        embedded_sequences = wv_layer(comment_input)

        embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
        x = keras.layers.Bidirectional(keras.layers.CuDNNLSTM(self.lstm_cells,
                                                              return_sequences=True))(embedded_sequences)
        x = keras.layers.Bidirectional(keras.layers.CuDNNLSTM(self.lstm_cells,
                                                              return_sequences=False, return_state=False))(x)
        # Join the sequence encoding with the hand-crafted features.
        y = keras.layers.concatenate([x, features_input])
        y = keras.layers.Dense(256, activation="relu")(y)
        y = keras.layers.Dropout(0.20)(y)
        x = keras.layers.Dense(2, activation="softmax")(y)
        self.model = keras.models.Model(inputs=[comment_input, features_input], outputs=x)
        self.model.compile(loss= self.loss, optimizer=self.opt, metrics=["accuracy"])

    def train(self, data_train, feature_train, y_train, train_idx, val_idx):
        """
            The training process of the model
            Params:
              data_train......An array (indexable by an index array), one row per tweet,
                              holding the padded token ids
              feature_train...An array (indexable by an index array), one row per tweet,
                              holding the feature vector
              y_train.........An array of scalar labels; every non-zero label is
                              treated as the positive (hostile) class
              train_idx.......A list of integers, representing the index of items from data_train
                              which should be considered as the training set
              val_idx.........A list of integers, representing the index of items from data_train
                              which should be considered as the validation set
            Returns:
                the object returned by model.fit (callers may ignore it)

            The fitted model is saved to self.save_path.
        """
        data_train_cv = data_train[train_idx]
        feature_train_cv = feature_train[train_idx]
        data_valid_cv = data_train[val_idx]
        feature_valid_cv = feature_train[val_idx]

        # Collapse the labels to 0 / 1 before one-hot encoding them.
        y_train_int = np.array([0 if j == 0 else 1 for j in y_train[train_idx]])
        y_valid_int = np.array([0 if j == 0 else 1 for j in y_train[val_idx]])
        y_train_cv = keras.utils.to_categorical(y_train_int, num_classes=2)
        y_valid_cv = keras.utils.to_categorical(y_valid_int, num_classes=2)

        # Per-sample weights that compensate for class imbalance in this fold.
        sample_weights = class_weight.compute_sample_weight('balanced', y_train_int)

        tensor_board = TensorBoard(log_dir=self.tensorBoard_directory, histogram_freq=0, write_graph=True,
                                   write_images=True)
        callbacks = [tensor_board]
        hist = self.model.fit(
                            [data_train_cv, feature_train_cv],
                             y_train_cv,
                             epochs=self.epochs,
                             batch_size= self.batch_size,
                             validation_data = ([data_valid_cv, feature_valid_cv], y_valid_cv),
                             shuffle=True,
                             sample_weight=sample_weights,
                             callbacks=callbacks)
        self.model.save(self.save_path)
        return hist

    def test(self, data_test, feature_test):
        """
            The testing process of the model
            Params:
              data_test......An array, one row per tweet, holding the padded token ids
              feature_test...An array, one row per tweet, holding the feature vector
             Returns:
                 the list of lists, each representing predicted probability of a tweet

            The model stored at self.save_path is loaded first; if loading fails
            the model currently held in self.model is used instead.
        """
        try:
            self.model = keras.models.load_model(self.save_path)
            print("Loaded trained model!")
        except Exception as err:
            # Deliberate best-effort fallback, but report why loading failed
            # instead of swallowing the error silently.
            print("No previous model found, using untrained model.")
            print("Model load error: %r" % (err,))
        print("Predicting...")
        # Predict with this instance's model on the arguments that were passed
        # in (the previous code referenced undefined global names here).
        self.y_pred = self.model.predict([data_test, feature_test], batch_size=self.batch_size)
        print("Prediction done!")
        return self.y_pred

In [62]:
def deepLearningModel(rootAddress, fileName, max_length=65, num_folds=5):
    """
        Featured model using both the length and relationship category of each tweet as a classifier feature.
        Reading the data and running the model using k_fold cross_validation
        Params:
            rootAddress...A string representing the root directory (also used as the
                          TensorBoard log directory).
            fileName......A string, the name of the csv file without the '.csv' extension.
            max_length....An integer, representing the max number of words to pick from each tweet
            num_folds.....An integer, representing the number of k_fold

        For every fold a new model is trained and then scored on the held-out 15% test
        split; weighted F1, the classification report and the ROC AUC are printed.
    """

    # read the data
    filePath = rootAddress + fileName + '.csv'
    data = pd.read_csv(filePath, dtype={'group': str}).drop_duplicates('id', keep='first')

    tweets = data['text'].map(preprocess)
    labels = data['hostile'].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweets)
    vocab_size = len(tokenizer.word_index) + 1
    encoded_docs = tokenizer.texts_to_sequences(tweets)
    text = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # convert the relationship values to their int representation
    relationshipDic = {
        '0': 0,
        '1': 1,
        '10': 2,
        '11': 3
    }
    relationshipType = data['group'].values

    # represent the different relation and len as a unique value in a string format.
    # The length is the token count BEFORE padding: every padded row has exactly
    # max_length entries, so len(text[i]) would be the same constant for all tweets.
    featuresString = ['relation_%s len_%s' % (relationshipDic[group], len(tokens))
                      for group, tokens in zip(relationshipType, encoded_docs)]

    vect = feature_extraction.text.CountVectorizer(binary = True, min_df = 0.0005)
    features = vect.fit_transform(featuresString).toarray()

    # Use the glove w2v as the embedding.
    # The width must match the matrix built below; rows of any other width
    # (e.g. the "<count> <dim>" header of the word2vec text format) are skipped.
    embedding_dim = 200
    embeddings_index = dict()
    with open('glove.6B.300d.word2vec.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) != embedding_dim + 1:
                continue
            embeddings_index[values[0]] = asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))
    if not embeddings_index:
        raise ValueError('No %d-dimensional vectors found in the embedding file.' % embedding_dim)

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # dividing the data into 15% test and 85% (train and validation).
    # The arrays are split directly so nothing depends on DataFrame column order.
    (data_train, data_test,
     feature_train, feature_test,
     y_train, y_test) = train_test_split(text, features, labels, test_size=0.15, random_state = 123)

    # creating a list of tuples for each fold, each representing the train indices and validation indices
    # for that round
    folds = list(StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1).split(data_train, y_train))

    # set the training config arguments
    args = {
        "lstm_cells": 265,
        "opt":keras.optimizers.Adam(),
        "batch_size": 64,
        "epochs":40,
        "loss": "categorical_crossentropy",
        "comment_size": max_length,
        "feature_size": feature_train.shape[1],
        "vocab_size":vocab_size,
        "embedding_matrix":embedding_matrix,
        "save_path": "FeaturedModel.model",
        "tensorBoard_directory": rootAddress
    }

    # The model is trained on "label != 0", so evaluate against the same binarisation.
    y_test_binary = np.array([0 if j == 0 else 1 for j in y_test])
    decision_threshold = 0.555

    # implement the cross_validation manually
    for j, (train_idx, val_idx) in enumerate(folds):

        print('\nFold ',j)

        # every fold gets its own optimizer instance so no optimizer state is shared
        fold_args = dict(args, opt=keras.optimizers.Adam())
        model = HostilityDetectionModel(fold_args)
        model.train(data_train, feature_train, y_train, train_idx, val_idx)

        y_prob = np.asarray(model.test(data_test, feature_test))[:, 1]
        y_pred = [1 if prob > decision_threshold else 0 for prob in y_prob]
        print(f1_score(y_test_binary, y_pred, average='weighted'))
        print(metrics.classification_report(y_test_binary, y_pred, digits=4))
        # ROC / AUC need the continuous scores; thresholded labels give a single-point curve
        fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test_binary, y_prob)
        auc_rf = auc(fpr_rf, tpr_rf)
        print(auc_rf)
        print("-------------------")

In [69]:
def baseModel(rootAddress, fileName, max_length=65, num_folds=5):
    """
        Base model using only the length of each tweet as a classifier feature.
        Reading the data and running the model using k_fold cross_validation
        Params:
            rootAddress...A string representing the root directory (also used as the
                          TensorBoard log directory).
            fileName......A string, the name of the csv file without the '.csv' extension.
            max_length....An integer, representing the max number of words to pick from each tweet
            num_folds.....An integer, representing the number of k_fold

        For every fold a new model is trained and then scored on the held-out 15% test
        split; weighted F1, the classification report and the ROC AUC are printed.
    """
    # read the data
    filePath = rootAddress + fileName + '.csv'
    data = pd.read_csv(filePath, dtype={'group': str}).drop_duplicates('id', keep='first')

    tweets = data['text'].map(preprocess)
    labels = data['hostile'].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweets)
    vocab_size = len(tokenizer.word_index) + 1
    encoded_docs = tokenizer.texts_to_sequences(tweets)
    text = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # represent the different length as a unique value in a string format.
    # The length is the token count BEFORE padding: every padded row has exactly
    # max_length entries, so len(text[i]) would be the same constant for all tweets.
    featuresString = ['len_%s' % len(tokens) for tokens in encoded_docs]

    vect = feature_extraction.text.CountVectorizer(binary = True, min_df = 0.0005)
    features = vect.fit_transform(featuresString).toarray()

    # Use the glove w2v as the embedding.
    # The width must match the matrix built below; rows of any other width
    # (e.g. the "<count> <dim>" header of the word2vec text format) are skipped.
    embedding_dim = 200
    embeddings_index = dict()
    with open('glove.6B.300d.word2vec.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) != embedding_dim + 1:
                continue
            embeddings_index[values[0]] = asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))
    if not embeddings_index:
        raise ValueError('No %d-dimensional vectors found in the embedding file.' % embedding_dim)

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # dividing the data into 15% test and 85% (train and validation).
    # The arrays are split directly so nothing depends on DataFrame column order.
    (data_train, data_test,
     feature_train, feature_test,
     y_train, y_test) = train_test_split(text, features, labels, test_size=0.15, random_state = 123)

    # creating a list of tuples for each fold, each representing the train indices and validation indices
    # for that round
    folds = list(StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1).split(data_train, y_train))

    # set the training config arguments
    args = {
        "lstm_cells": 265,
        "opt":keras.optimizers.Adam(),
        "batch_size": 64,
        "epochs":40,
        "loss": "categorical_crossentropy",
        "comment_size": max_length,
        "feature_size": feature_train.shape[1],
        "vocab_size":vocab_size,
        "embedding_matrix":embedding_matrix,
        "save_path": "BaseModel.model",
        "tensorBoard_directory": rootAddress
    }

    # The model is trained on "label != 0", so evaluate against the same binarisation.
    y_test_binary = np.array([0 if j == 0 else 1 for j in y_test])
    decision_threshold = 0.555

    # implement the cross_validation manually
    for j, (train_idx, val_idx) in enumerate(folds):

        print('\nFold ',j)

        # every fold gets its own optimizer instance so no optimizer state is shared
        fold_args = dict(args, opt=keras.optimizers.Adam())
        model = HostilityDetectionModel(fold_args)
        model.train(data_train, feature_train, y_train, train_idx, val_idx)

        y_prob = np.asarray(model.test(data_test, feature_test))[:, 1]
        y_pred = [1 if prob > decision_threshold else 0 for prob in y_prob]
        print(f1_score(y_test_binary, y_pred, average='weighted'))
        print(metrics.classification_report(y_test_binary, y_pred, digits=4))
        # ROC / AUC need the continuous scores; thresholded labels give a single-point curve
        fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test_binary, y_prob)
        auc_rf = auc(fpr_rf, tpr_rf)
        print(auc_rf)
        print("-------------------")

## The Logistic regression model

In [71]:
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """
        Print the features most strongly associated with a label, followed by the
        features most strongly associated with the opposite outcome.
        Params:
            vectorizer...An obj, the fitted vectorizer used in the model (must expose
                         get_feature_names_out() or get_feature_names())
            classifier...An obj, the fitted linear classifier used in the model (must
                         expose classes_ and coef_)
            classlabel...One of the labels in classifier.classes_
            n............An integer, representing the number of correlated features
                         to show for each side

        Output format: n lines "<classlabel> <feature> <weight>", a separator line,
        then n lines "<other label> <feature> <weight>". Weights are printed so that
        a larger value means a stronger association with the label on that line.
        For a binary classifier (single coefficient row) either label may be
        requested; with one coefficient row per class the second section is
        labelled "not <classlabel>".
    """
    classes = list(classifier.classes_)
    labelid = classes.index(classlabel)

    # get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = names_getter()

    coef = classifier.coef_
    # `against` holds the weights oriented so that the MOST NEGATIVE value is the
    # strongest evidence for `classlabel`.
    if len(coef) == 1 and len(classes) == 2:
        # Binary model: the single row points towards classes[1].
        if labelid == 0:
            against = list(coef[0])
        else:
            against = [-c for c in coef[0]]
        other_label = classes[1 - labelid]
    else:
        # One row per class: positive weights favour that class.
        against = [-c for c in coef[labelid]]
        other_label = 'not %s' % (classlabel,)

    ranked = sorted(zip(against, feature_names))
    head = ranked[:n] if n > 0 else []
    tail = ranked[-n:] if n > 0 else []
    for weight, feat in head:
        print(classlabel, feat, -weight)
    print("---------")
    for weight, feat in tail:
        print(other_label, feat, weight)

#### read the data

In [75]:
# Load the labelled tweets (one row per tweet id); `group` is kept as a string
# because it is compared against string codes such as '0' and '11' further down.
data = (
    pd.read_csv('sampleData7.csv', dtype={'group': str})
      .drop_duplicates('id', keep='first')
)
X = data['text'].map(preprocess)
y = data['hostile'].values

### 1) LR model using only the tweet text 

In [77]:
col_names = ['tweet', 'hostility']
y = y.astype('int')

# Hold out 15% of the tweets for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)

# Bag of words -> TF-IDF -> cross-validated logistic regression.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('LR', LogisticRegressionCV(cv=5, max_iter=4000, random_state=0, solver='lbfgs',
                                multi_class='multinomial', class_weight='balanced')),
])
y_pred = nb.fit(X_train, y_train).predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.7266470009832842
              precision    recall  f1-score   support

           0       0.71      0.74      0.73       502
           1       0.74      0.71      0.72       515

   micro avg       0.73      0.73      0.73      1017
   macro avg       0.73      0.73      0.73      1017
weighted avg       0.73      0.73      0.73      1017



#### The top 10 correlated words for the hostile/non_hostile groups

In [78]:
# Pull the fitted vectorizer and classifier out of the pipeline and report on label 0.
vectorizer = nb.named_steps['vect']
classifier = nb.named_steps['LR']
most_informative_feature_for_class(vectorizer, classifier, classlabel=0)

0 emoji_face_with_tears_of_joy 0.9361011241773141
0 lmao 0.8541578648934544
0 love 0.8040690711430389
0 holy 0.7629214371135057
0 shit 0.7544601087310946
0 emoji_skull 0.6714727079002918
0 emoji_heavy_black_heart 0.6592753496746697
0 emoji_hundred_points_symbol 0.6165805429744967
0 lol 0.5943807699790422
0 it 0.5785230808921368
---------
1 dumb 0.7873820465990385
1 faggot 0.8039466783083321
1 dick 0.888475887009469
1 fucking 0.9126462590262244
1 pussy 0.9421538158570509
1 shut 1.0960886597839976
1 ass 1.4237217855046846
1 retard 1.6826023013771572
1 twat 2.096198498505101
1 cunt 2.6973273854289115


### 2) LR model using text + relationship category + length of each tweet

In [84]:
# Build the combined feature table and hold out 15% of it for testing.
train_data = dataPrepration(data)
X_train, X_test, y_train, y_test = train_test_split(
    train_data, y,
    test_size=0.15,
    random_state=123,
)
X_train = pd.DataFrame(X_train)
X_train.columns = ['features', 'group', 'text', 'label']


Method .as_matrix will be removed in a future version. Use .values instead.



#### Tokenize and vectorize the text and prepare it for the model

In [85]:
# Bag of words -> TF-IDF, both fitted on the training tweets only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train['text'].values)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

bagOfWords = X_train_tfidf.toarray()
# `.values` replaces the deprecated Series.as_matrix()
features = X_train['features'].values.tolist()

# One row per tweet holding its TF-IDF vector and its feature vector.
train_data = pd.DataFrame({'bagOfWords': list(bagOfWords), 'feature': features})

# Model input: TF-IDF columns followed by the feature columns.
input_train = np.concatenate((bagOfWords, features), axis=1)


Method .as_matrix will be removed in a future version. Use .values instead.



In [86]:
LR = LogisticRegressionCV(cv=5, max_iter=4000, random_state=0, solver='lbfgs',
                          multi_class='multinomial', class_weight='balanced')
clf = LR.fit(input_train, y_train)

X_test = pd.DataFrame(X_test)
X_test.columns = ['features', 'group', 'text', 'label']

# Re-use the vectorizer and TF-IDF transformer fitted on the training tweets.
X_new_counts = count_vect.transform(X_test['text'].values)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
bagOfWords = X_new_tfidf.toarray()
# `.values` replaces the deprecated Series.as_matrix()
features = X_test['features'].values.tolist()
input_test = np.concatenate((bagOfWords, features), axis=1)
y_pred = clf.predict(input_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.7492625368731564
              precision    recall  f1-score   support

           0       0.76      0.72      0.74       502
           1       0.74      0.78      0.76       515

   micro avg       0.75      0.75      0.75      1017
   macro avg       0.75      0.75      0.75      1017
weighted avg       0.75      0.75      0.75      1017




Method .as_matrix will be removed in a future version. Use .values instead.



#### The top 10 correlated features for the hostile/non_hostile groups

In [87]:
# Column names in model-input order: vocabulary terms, then the group and length indicators.
feature_names = count_vect.get_feature_names()
feature_names = (
    feature_names
    + ['group_0', 'group_1','group_2', 'group_3']
    + ['len_'+str(185-i) for i in range(185)]
)
labelid = list(LR.classes_).index(0)

# Rank (coefficient, name) pairs once; both ends of the ranking are reported.
ranked = sorted(zip(LR.coef_[labelid], feature_names))
bottomn = ranked[-10:]
topn = ranked[:10]

for coef, feat in topn:
    print(0, feat, coef * -1)
print("---------")
for coef, feat in bottomn:
    print(1, feat, coef)

0 holy 0.9574169804327665
0 emoji_face_with_tears_of_joy 0.8505113757510508
0 shit 0.8448341866658511
0 love 0.7580048191764939
0 emoji_heavy_black_heart 0.6005186832629114
0 lmao 0.5993324146151553
0 len_1 0.5985066055574346
0 it 0.5963867256938168
0 sex 0.5951104931517495
0 emoji_hundred_points_symbol 0.5646803676532116
---------
1 stfu 0.7725894224443299
1 fucking 0.8395251562884973
1 pussy 0.8682935444451275
1 dick 0.9262600199799326
1 faggot 0.9547985668923858
1 shut 1.0868778370613563
1 ass 1.5180356618088786
1 retard 1.6404435751802044
1 twat 2.1311659991288074
1 cunt 2.8506832375993425


### 3) LR model using only the relationship category + length of each tweet

In [88]:
# Rebuild the combined feature table and hold out 15% of it for testing.
train_data = dataPrepration(data)
X_train, X_test, y_train, y_test = train_test_split(
    train_data, y,
    test_size=0.15,
    random_state=123,
)
X_train = pd.DataFrame(X_train)
X_train.columns = ['features', 'group', 'text', 'label']


Method .as_matrix will be removed in a future version. Use .values instead.



In [89]:
LR = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
# `.values` replaces the deprecated Series.as_matrix()
clf_notext = LR.fit(X_train['features'].values.tolist(), y_train)

X_test = pd.DataFrame(X_test)
X_test.columns = ['features', 'group', 'text', 'label']
y_pred = clf_notext.predict(X_test['features'].values.tolist())

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))


Method .as_matrix will be removed in a future version. Use .values instead.



accuracy 0.6420845624385447
              precision    recall  f1-score   support

           0       0.66      0.56      0.61       502
           1       0.63      0.72      0.67       515

   micro avg       0.64      0.64      0.64      1017
   macro avg       0.64      0.64      0.64      1017
weighted avg       0.64      0.64      0.64      1017




Method .as_matrix will be removed in a future version. Use .values instead.



#### The top 10 correlated features for the hostile/non_hostile groups

In [90]:
# Names for the group and length indicator columns.
feature_names = (
    ['group_0', 'group_1','group_2', 'group_3']
    + ['len_'+str(185-i) for i in range(185)]
)
labelid = list(LR.classes_).index(0)

# Rank (coefficient, name) pairs once; both ends of the ranking are reported.
ranked = sorted(zip(LR.coef_[labelid], feature_names))
bottomn = ranked[-10:]
topn = ranked[:10]

for coef, feat in topn:
    print(0, feat, coef * -1)
print("---------")
for coef, feat in bottomn:
    print(1, feat, coef)

0 len_96 0.7352415399105744
0 len_120 0.6272377467745343
0 len_1 0.6253751797945258
0 len_106 0.622477628644756
0 len_131 0.6122241549647793
0 len_135 0.576974544082422
0 len_155 0.4906285190311689
0 len_21 0.48470340343170226
0 len_183 0.4456120598960049
0 len_102 0.43692954843252413
---------
1 len_85 0.33818400145552846
1 len_60 0.3445024893368753
1 len_64 0.3965749595400604
1 len_4 0.449481716661065
1 len_126 0.4989853179195205
1 len_70 0.5283991788352906
1 len_136 0.5795901864500809
1 len_110 0.6648328740637315
1 len_114 0.6857273531214038
1 len_48 0.8034084256504896


## 4) Four individual classifiers, one for each of the four relationship types

#### 4.1) LR model for No_friendship relationship category (00)

In [91]:
# Keep only the tweets whose relationship code is '0'.
No_friendship_data = data.loc[data['group'].eq('0')]
X = No_friendship_data['text'].map(preprocess)
y = No_friendship_data['hostile']

In [92]:
col_names = ['tweet', 'hostility']
y = y.astype('int')

# Hold out 15% of this group's tweets for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)

# Bag of words -> TF-IDF -> cross-validated logistic regression.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('LR', LogisticRegressionCV(cv=5, random_state=0, max_iter=5000, solver='lbfgs',
                                multi_class='multinomial', class_weight='balanced')),
])
y_pred = nb.fit(X_train, y_train).predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6447368421052632
              precision    recall  f1-score   support

           0       0.31      0.28      0.30        60
           1       0.75      0.77      0.76       168

   micro avg       0.64      0.64      0.64       228
   macro avg       0.53      0.53      0.53       228
weighted avg       0.64      0.64      0.64       228



##### The top 10 correlated words for the hostile/non_hostile groups in the No_friendship category

In [93]:
# Pull the fitted vectorizer and classifier out of the pipeline and report on label 0.
vectorizer = nb.named_steps['vect']
classifier = nb.named_steps['LR']
most_informative_feature_for_class(vectorizer, classifier, classlabel=0)

0 holy 2.446142254636991
0 sex 1.6364969429372238
0 year 1.6185930093757581
0 af 1.5098355638413636
0 calling 1.4959914534672643
0 kicks 1.4697137225391033
0 good 1.3388257344607206
0 allowances 1.2954928568817081
0 man 1.292666793529395
0 poor 1.292140696543601
---------
1 racist 1.1289965049489172
1 idiot 1.1292741901336472
1 shut 1.4398712813815069
1 dumb 1.4674769813726134
1 retard 1.4691623174526196
1 bitch 1.5831586508125148
1 cunt 1.6160741298876946
1 ass 1.6506006982341048
1 stupid 1.6698666591667308
1 twat 2.413823417734113


#### 4.2) LR model for dual_friendship relationship category (11)

In [94]:
# Keep only the tweets whose relationship code is '11'.
dual_friendship_data = data.loc[data['group'].eq('11')]
X = dual_friendship_data['text'].map(preprocess)
y = dual_friendship_data['hostile']

In [95]:
col_names = ['tweet', 'hostility']
y = y.astype('int')

# Hold out 15% of this group's tweets for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)

# Bag of words -> TF-IDF -> cross-validated logistic regression.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('LR', LogisticRegressionCV(cv=5, random_state=0, max_iter=5000, solver='lbfgs',
                                multi_class='multinomial', class_weight='balanced')),
])
y_pred = nb.fit(X_train, y_train).predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.7155963302752294
              precision    recall  f1-score   support

           0       0.81      0.78      0.79       228
           1       0.53      0.57      0.55        99

   micro avg       0.72      0.72      0.72       327
   macro avg       0.67      0.67      0.67       327
weighted avg       0.72      0.72      0.72       327



##### The top 10 correlated words for the hostile/non_hostile groups in the dual_friendship category

In [96]:
# Pull the fitted vectorizer and classifier out of the pipeline and report on label 0.
vectorizer = nb.named_steps['vect']
classifier = nb.named_steps['LR']
most_informative_feature_for_class(vectorizer, classifier, classlabel=0)

0 emoji_face_with_tears_of_joy 0.8780989620857048
0 love 0.7539996081746743
0 shit 0.6296379932848155
0 lmao 0.5963574626448318
0 need 0.5073230990232419
0 sleep 0.4679403313887631
0 crazy 0.4529345636144052
0 didn 0.4475066562951526
0 know 0.4184010209704142
0 lol 0.40443323764212435
---------
1 fuck 0.528474164427784
1 he 0.53353971614079
1 attention 0.5506326528727972
1 gay 0.5917764575627112
1 big 0.5955822018051579
1 shut 0.727168857038017
1 ass 0.9714283576740315
1 retard 1.107097631026854
1 twat 1.226268692889429
1 cunt 1.9329277874652975


#### 4.3) LR model for sender_follow_target  relationship category (10)

In [97]:
# Keep only the tweets whose relationship code is '10'.
sender_follow_target_data = data.loc[data['group'].eq('10')]
X = sender_follow_target_data['text'].map(preprocess)
y = sender_follow_target_data['hostile']

In [98]:
col_names = ['tweet', 'hostility']
y = y.astype('int')

# Hold out 15% of this group's tweets for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)

# Bag of words -> TF-IDF -> cross-validated logistic regression.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('LR', LogisticRegressionCV(cv=5, random_state=0, max_iter=5000, solver='lbfgs',
                                multi_class='multinomial', class_weight='balanced')),
])
y_pred = nb.fit(X_train, y_train).predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.7272727272727273
              precision    recall  f1-score   support

           0       0.68      0.73      0.71       109
           1       0.77      0.72      0.74       133

   micro avg       0.73      0.73      0.73       242
   macro avg       0.73      0.73      0.73       242
weighted avg       0.73      0.73      0.73       242



##### The top 10 correlated words for the hostile/non_hostile groups in the sender_follow_target category

In [99]:
# Pull the fitted vectorizer and classifier out of the pipeline and report on label 0.
vectorizer = nb.named_steps['vect']
classifier = nb.named_steps['LR']
most_informative_feature_for_class(vectorizer, classifier, classlabel=0)

0 emoji_face_with_tears_of_joy 0.7839834259421249
0 shit 0.734719591321404
0 nigga 0.6588222218033988
0 holy 0.5577799483141853
0 emoji_loudly_crying_face 0.49526572476902836
0 im 0.42472638353947356
0 actual 0.41251940510392565
0 emoji_smiling_face_with_heart__shaped_eyes 0.40932017284766276
0 thinking 0.3987249867297563
0 broke 0.3932755775623923
---------
1 real 0.44143423105868784
1 dick 0.44983789282949477
1 big 0.4579865842862346
1 faggot 0.5989484693575606
1 pussy 0.684587864569647
1 retard 0.6951833349162585
1 fucking 0.6981063659033089
1 ass 0.9632938378912917
1 twat 1.7240426807526485
1 cunt 2.186140312864676


#### 4.4) LR model for target_follow_sender relationship category  (01)

In [100]:
# Keep only the tweets whose relationship code is '1'.
target_follow_sender_data = data.loc[data['group'].eq('1')]
X = target_follow_sender_data['text'].map(preprocess)
y = target_follow_sender_data['hostile']

In [101]:
col_names = ['tweet', 'hostility']
y = y.astype('int')

# Hold out 15% of this group's tweets for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)

# Bag of words -> TF-IDF -> cross-validated logistic regression.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('LR', LogisticRegressionCV(cv=5, max_iter=5000, random_state=0, solver='lbfgs',
                                multi_class='multinomial', class_weight='balanced')),
])
y_pred = nb.fit(X_train, y_train).predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6832579185520362
              precision    recall  f1-score   support

           0       0.60      0.63      0.62        89
           1       0.74      0.72      0.73       132

   micro avg       0.68      0.68      0.68       221
   macro avg       0.67      0.67      0.67       221
weighted avg       0.69      0.68      0.68       221



##### The top 10 correlated words for the hostile/non_hostile groups in the target_follow_sender category

In [102]:
# Pull the fitted vectorizer and classifier out of the pipeline and report on label 0.
vectorizer = nb.named_steps['vect']
classifier = nb.named_steps['LR']
most_informative_feature_for_class(vectorizer, classifier, classlabel=0)

0 shit 1.1984192675570837
0 emoji_face_with_tears_of_joy 0.5633359636902002
0 love 0.509795692970128
0 emoji_emoji_modifier_fitzpatrick_type__5 0.45713565956720925
0 right 0.43344565799124096
0 sex 0.41785966285665765
0 blank_comment 0.40657350541648424
0 rape 0.3849537961508587
0 lol 0.37461174247070766
0 holy 0.36988715979678305
---------
1 fuck 0.40459638857169067
1 pussy 0.46191341522292784
1 fucking 0.49116252921630105
1 shut 0.5097111620997169
1 faggot 0.6776075800224274
1 bitch 0.7479201436634332
1 ass 0.9186053986626841
1 twat 1.0780792839001838
1 retard 1.084588989046295
1 cunt 1.1909235219605536
