# Help Twitter Combat Hate Speech Using NLP and Machine Learning

### Question1
#### Load the tweets file using read_csv function from Pandas package. 


In [1]:
import pandas as pd
import numpy as np
import os
import re

In [5]:
df = pd.read_csv("TwitterHate.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [9]:
df.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [13]:
df.tweet.sample().values[0]

"#punjab govt., #bjp,   have nothing to do with #udtapunjab' row: vineet joshi  "

In [14]:
tweets0 = df.tweet.values

In [16]:
len(tweets0)

31962

In [17]:
tweets0[:5]

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty',
       '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
       ' factsguide: society now    #motivation'], dtype=object)

### Question 2-3-4
#### Get the tweets into a list for easy text cleanup and manipulation.
#### To cleanup: 
#### Normalize the casing.
#### Using regular expressions, remove user handles. These begin with '@’.
#### Using regular expressions, remove URLs.
#### Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.
#### Remove stop words.
#### Remove redundant terms like ‘amp’, ‘rt’, etc.
#### Remove ‘#’ symbols from the tweet while retaining the term.
#### Extra cleanup by removing terms with a length of 1.


In [18]:
tweets_lower = [twt.lower() for twt in tweets0]

In [19]:
tweets_lower[:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [20]:
tweets_nouser = [re.sub("@\w+","", twt) for twt in tweets_lower]

In [21]:
tweets_nouser[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [22]:
tweet_nourl = [re.sub("\w+://\S+", '', twt) for twt in tweets_nouser]

In [23]:
tweet_nourl[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [24]:
from nltk.tokenize import TweetTokenizer
tkn = TweetTokenizer()
tweet_token = [tkn.tokenize(sent) for sent in tweet_nourl]
print(tweet_token[0])

['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run']


In [25]:
from nltk.corpus import stopwords
from string import punctuation


In [26]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [27]:
stop_punct.extend(['...','``',"''",".."])

In [28]:
stop_context = ['rt', 'amp']


In [29]:
stop_final = stop_nltk + stop_punct + stop_context

In [30]:
def del_stop(sent):
    return [re.sub("#","",term) for term in sent if ((term not in stop_final) & (len(term)>1))]

In [31]:
tweets_clean = [del_stop(tweet) for tweet in tweet_token]

### Question 5
#### Check out the top terms in the tweets:
#### First, get all the tokenized terms into one large list.
#### Use the counter and find the 10 most common terms.

In [32]:
from collections import Counter

In [33]:
term_list = []
for tweet in tweets_clean:
    term_list.extend(tweet)

In [34]:
res = Counter(term_list)
res.most_common(10)

[('love', 2748),
 ('day', 2276),
 ('happy', 1684),
 ('time', 1131),
 ('life', 1118),
 ('like', 1047),
 ("i'm", 1018),
 ('today', 1013),
 ('new', 994),
 ('thankful', 946)]

### Question 6
#### Data formatting for predictive modeling:
#### Join the tokens back to form strings. This will be required for the vectorizers.
#### Assign x and y.
#### Perform train_test_split using sklearn.

In [35]:
tweets_clean = [" ".join(tweet) for tweet in tweets_clean]

In [36]:
tweets_clean[0]

'father dysfunctional selfish drags kids dysfunction run'

In [37]:
X = tweets_clean
y = df.label.values

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

### Question 7
#### We’ll use TF-IDF values for the terms as a feature to get into a vector space model.
#### Import TF-IDF  vectorizer from sklearn.
#### Instantiate with a maximum of 5000 terms in your vocabulary.
#### Fit and apply on the train set.
#### Apply on the test set.

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
vectorizer = TfidfVectorizer(max_features=5000)

In [41]:
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [42]:
X_train_bow.shape, X_test_bow.shape

((22373, 5000), (9589, 5000))

### Question 8
#### Model building: Ordinary Logistic Regression
#### Instantiate Logistic Regression from sklearn with default parameters.
#### Fit into  the train data.
#### Make predictions for the train and the test set.

In [43]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [44]:
logreg.fit(X_train_bow, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
y_train_pred = logreg.predict(X_train_bow)
y_test_pred = logreg.predict(X_test_bow)

### Question 9
#### Model evaluation: Accuracy, recall, and f_1 score.
#### Report the accuracy on the train set.
#### Report the recall on the train set: decent, high, or low.
#### Get the f1 score on the train set.

In [46]:
from sklearn.metrics import accuracy_score, classification_report

In [47]:
accuracy_score(y_train, y_train_pred)

0.9560184150538595

In [48]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     20815
           1       0.96      0.39      0.55      1558

    accuracy                           0.96     22373
   macro avg       0.96      0.69      0.76     22373
weighted avg       0.96      0.96      0.95     22373



### Question 10-11
#### Looks like you need to adjust the class imbalance, as the model seems to focus on the 0s.
#### Adjust the appropriate class in the LogisticRegression model.
#### Train again with the adjustment and evaluate.
#### Train the model on the train set.
#### Evaluate the predictions on the train set: accuracy, recall, and f_1 score.

In [49]:
logreg = LogisticRegression(class_weight="balanced")
logreg.fit(X_train_bow, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
y_train_pred = logreg.predict(X_train_bow)
y_test_pred = logreg.predict(X_test_bow)

In [51]:
accuracy_score(y_train, y_train_pred)

0.9535153980244044

In [52]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20815
           1       0.60      0.97      0.74      1558

    accuracy                           0.95     22373
   macro avg       0.80      0.96      0.86     22373
weighted avg       0.97      0.95      0.96     22373



### Question 12 - 13
#### Regularization and Hyperparameter tuning:
#### Import GridSearch and StratifiedKFold because of class imbalance.
#### Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.
#### Use a balanced class weight while instantiating the logistic regression.
#### Find the parameters with the best recall in cross validation.
#### Choose ‘recall’ as the metric for scoring.
#### Choose stratified 4 fold cross validation scheme.
#### Fit into  the train set.

In [56]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [57]:
param_grid = {
    'C': [0.01,0.1,1,10,100],
    'penalty': ["l1","l2"]
}

In [58]:
classifier_lr = LogisticRegression(class_weight="balanced")

In [59]:
grid_search = GridSearchCV(estimator = classifier_lr, param_grid = param_grid,
                          cv = StratifiedKFold(4), n_jobs = -1, verbose = 1, scoring = 'recall')

In [60]:
grid_search.fit(X_train_bow, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    5.1s finished


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=1)

### Question 14
#### What are the best parameters?

In [61]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Question 15
#### Predict and evaluate using the best estimator.
#### Use the best estimator from the grid search to make predictions on the test set.
#### What is the recall on the test set for the toxic comments?
#### What is the f_1 score?

In [62]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [63]:
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [64]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8905
           1       0.49      0.77      0.60       684

    accuracy                           0.93      9589
   macro avg       0.73      0.85      0.78      9589
weighted avg       0.95      0.93      0.93      9589

