In [38]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
import re

In [39]:
#import the dataset
tweets = pd.read_csv("TwitterHate.csv")
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [40]:
tweets.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [41]:
#its  a highly unbalanced dataset

In [42]:
#getting tweets into list
tweet_list = tweets.tweet.values
print(len(tweet_list))
tweet_list[:4]

31962


array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty',
       '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  '],
      dtype=object)

In [43]:
#to cleanup
#1-Normalize the casing.

#2-Using regular expressions, remove user handles. These begin with '@’.

#3-Using regular expressions, remove URLs.

#4- Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.

#5-Remove stop words.

#6-Remove redundant terms like ‘amp’, ‘rt’, etc.

#7-Remove ‘#’ symbols from the tweet while retaining the term.

In [44]:
#lowercasing the tweets
lower_tweets = [tweet.lower() for tweet in tweet_list]
lower_tweets[:4]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ']

In [45]:
#removing @words
no_user= [re.sub("@\w+","",tweet) for tweet in lower_tweets]
no_user[:4]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ']

In [46]:
#removing urls
no_url = [re.sub("\w+://\S+","",tweet) for tweet in no_user ]
no_url[0:4]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ']

In [47]:
#tokenization
from nltk.tokenize import TweetTokenizer
token = TweetTokenizer()
final_token = [token.tokenize(sen) for sen in no_url]
final_token[0]

['when',
 'a',
 'father',
 'is',
 'dysfunctional',
 'and',
 'is',
 'so',
 'selfish',
 'he',
 'drags',
 'his',
 'kids',
 'into',
 'his',
 'dysfunction',
 '.',
 '#run']

In [48]:
#now removing stopwords , one letter words, punctuation
from nltk.corpus import stopwords
from string import punctuation
stop_nltk = stopwords.words("english")
stop_punct= list(punctuation)
stop_punct.extend(['...','``',"''",".."])
stop_context = ['rt', 'amp']   #Remove redundant terms like ‘amp’, ‘rt’, etc.
stop_final = stop_nltk + stop_punct + stop_context

In [49]:
stop_final[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [50]:
#cleaning stop_final from our tweets
def Remover(sent):
    return [re.sub("#","",term) for term in sent if((term not in stop_final) & (len(term)>1))]

In [51]:
Remover(final_token[0])

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

In [52]:
#final tweets
clean_tweets = [Remover(token) for token in final_token]
clean_tweets[:4]

[['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run'],
 ['thanks',
  'lyft',
  'credit',
  "can't",
  'use',
  'cause',
  'offer',
  'wheelchair',
  'vans',
  'pdx',
  'disapointed',
  'getthanked'],
 ['bihday', 'majesty'],
 ['model', 'love', 'take', 'time', 'urð']]

In [53]:
#Check out the top terms in the tweets:
from collections import Counter
top_term = []
for tweet in clean_tweets:
    top_term.extend(tweet)
top_term[:4]

['father', 'dysfunctional', 'selfish', 'drags']

In [54]:
top_tweets = Counter(top_term)
top_tweets.most_common(10)

[('love', 2748),
 ('day', 2276),
 ('happy', 1684),
 ('time', 1131),
 ('life', 1118),
 ('like', 1047),
 ("i'm", 1018),
 ('today', 1013),
 ('new', 994),
 ('thankful', 946)]

In [55]:
#Join the tokens back to form strings. This will be required for the vectorizers.
#Assign x and y.
#Perform train_test_split using sklearn.

In [56]:
clean_tweets[0]

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

In [57]:
clean_tweets =[" ".join(tweet) for tweet in clean_tweets]
clean_tweets[0]

'father dysfunctional selfish drags kids dysfunction run'

In [58]:
from sklearn.model_selection import train_test_split
x=clean_tweets
y=tweets.label
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30, random_state= 1)
print("x_train : ",len(x_train)," y_train: ",len(y_train))
print("x_test : ",len(x_test)," y_test: ",len(y_test))

x_train :  22373  y_train:  22373
x_test :  9589  y_test:  9589


We’ll use TF-IDF values for the terms as a feature to get into a vector space model.

Import TF-IDF  vectorizer from sklearn.

Instantiate with a maximum of 5000 terms in your vocabulary.

Fit and apply on the train set.

Apply on the test set.

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(max_features = 5000)
#fitting the data
x_train_bow = vector.fit_transform(x_train)
x_test_bow = vector.transform(x_test)


In [60]:
print(x_train_bow.shape)
print(x_train_bow[:2])
#its basically a matrix of all the words in a tweets

(22373, 5000)
  (0, 2591)	0.3467439954465794
  (0, 4327)	0.39908824976483
  (0, 2778)	0.39908824976483
  (0, 4718)	0.329545038234488
  (0, 1556)	0.40587121081222693
  (0, 3405)	0.3833937860490035
  (0, 3808)	0.3753712459004943
  (1, 1870)	0.26325018218405183
  (1, 3632)	0.8192200307985136
  (1, 227)	0.2903502739989284
  (1, 3149)	0.2123230368961195
  (1, 2663)	0.3608234043275218


In [61]:
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(x_train_bow, y_train)

LogisticRegression()

In [62]:
y_train_pred = lgr.predict(x_train_bow)
y_test_pred = lgr.predict(x_test_bow)

In [64]:
print(y_train_pred.shape)

(22373,)


In [67]:
from sklearn.metrics import accuracy_score , classification_report
accuracy_score(y_train, y_train_pred)

0.955035086935145

In [68]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98     20780
           1       0.96      0.38      0.55      1593

    accuracy                           0.96     22373
   macro avg       0.96      0.69      0.76     22373
weighted avg       0.96      0.96      0.95     22373



In [69]:
lgr2= LogisticRegression(class_weight= "balanced")
lgr2.fit(x_train_bow, y_train)

LogisticRegression(class_weight='balanced')

In [70]:
y_train_pred = lgr2.predict(x_train_bow)
y_test_pred = lgr2.predict(x_test_bow)

In [71]:
from sklearn.metrics import accuracy_score , classification_report
accuracy_score(y_train, y_train_pred)

0.9505654136682609

In [72]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20780
           1       0.59      0.97      0.74      1593

    accuracy                           0.95     22373
   macro avg       0.80      0.96      0.86     22373
weighted avg       0.97      0.95      0.96     22373




Regularization and Hyperparameter tuning:

Import GridSearch and StratifiedKFold because of class imbalance.

Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.

Use a balanced class weight while instantiating the logistic regression.

In [73]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# Create the parameter grid based on the results of random search 
param_grid = {
    'C': [0.01,0.1,1,10,100],
    'penalty': ["l1","l2"]
}

In [74]:
logger3 = LogisticRegression(class_weight = "balanced")

In [75]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = logger3, param_grid = param_grid, 
                          cv = StratifiedKFold(4), n_jobs = -1, verbose = 1, scoring = "recall" )

In [76]:

grid_search.fit(x_train_bow, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


        nan 0.73006322        nan 0.69492985]


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(class_weight='balanced'), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall', verbose=1)

In [77]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight='balanced')

In [78]:
# Using the best estimator from the grid search to make predictions on the test set
y_test_pred = grid_search.best_estimator_.predict(x_test_bow)
y_train_pred = grid_search.best_estimator_.predict(x_train_bow)

In [79]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96      8940
           1       0.46      0.77      0.58       649

    accuracy                           0.92      9589
   macro avg       0.72      0.85      0.77      9589
weighted avg       0.95      0.92      0.93      9589

