# Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
import sklearn
import string
import re

# Visualizing dataset

In [None]:
# The first CSV column appears to be a saved row index (it otherwise loads as an
# "Unnamed: 0" column holding 0, 1, 2, ...), so read it as the index instead of
# carrying it around as a data column.
df = pd.read_csv("train.csv", index_col=0)
df.head()

Unnamed: 0,text,labels
0,@realDonaldTrump This is one of the worst time...,0
1,How about the crowd in Oval in today's #AUSvIN...,1
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0
3,#etsy shop: Benedict Donald so called presiden...,1
4,@realDonaldTrump Good build a wall around Arka...,0


In [None]:
# Tally the labels once and reuse the result: label 0 is reported as hate speech,
# label 1 as not hate speech.
label_counts = df["labels"].value_counts()
print("Hate speech labels count:", label_counts[0])
print("Not Hate speech labels count:", label_counts[1])

Hate speech labels count: 2031
Not Hate speech labels count: 3235


### Since the classes are not balanced, the F1 score would be a better metric than accuracy to evaluate our model

# Train Test split

In [None]:
# Inputs are the tweet texts, targets are the label column.
x = df["text"].values
y = df["labels"].values

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

# Keep a copy of the raw test texts so the original wording can still be shown
# after x_test has been preprocessed.
x_test_2 = x_test.copy()

# Preprocessing:
- Convert to lowercase
- Remove URL
- Remove digits
- Remove mentions
- Apply lemmatizer
- Apply stemmer
- Remove punctuation

In [None]:
def preprocess(text_string):
    """Lower-case a tweet and strip URLs, @-mentions and digits from it.

    Parameters
    ----------
    text_string : str
        Raw tweet text.

    Returns
    -------
    str
        The lower-cased text with every URL and @-mention deleted and every
        digit replaced by a single space. Whitespace is not collapsed.
    """
    # Raw strings keep the regex backslashes (\w, \-, \(, \)) from being parsed as
    # invalid string escapes, which newer Python versions warn about.
    mention_regex = r'@[\w\-]+'
    url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                 r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    parsed_text = text_string.lower()
    parsed_text = re.sub(url_regex, '', parsed_text)
    # Mentions must be removed before digits are blanked: blanking digits first
    # splits a handle such as "@a1b" into "@a b" and leaves "b" in the text.
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(r"\d", " ", parsed_text)
    return parsed_text

# Filled on the first call to stemming() and reused afterwards, so the stop-word
# corpus is not reloaded (and the lemmatizer/stemmer not rebuilt) for every tweet.
_STEMMING_TOOLS = {}


def _stemming_tools():
    """Return the shared (lemmatizer, stemmer, stop-word set), building them on first use."""
    if not _STEMMING_TOOLS:
        from nltk.corpus import stopwords
        from nltk.stem import PorterStemmer
        # All values are constructed before update() runs, so a failure here
        # leaves the cache empty instead of half-initialised.
        _STEMMING_TOOLS.update(
            lemmatizer=WordNetLemmatizer(),
            stemmer=PorterStemmer(),
            stop_words=frozenset(stopwords.words('english')),
        )
    return (
        _STEMMING_TOOLS["lemmatizer"],
        _STEMMING_TOOLS["stemmer"],
        _STEMMING_TOOLS["stop_words"],
    )


def stemming(x):
    """Tokenize a string on whitespace, drop English stop words, then lemmatize and stem.

    Parameters
    ----------
    x : str
        Text to process.

    Returns
    -------
    list of str
        One entry per kept token: the Porter stem of the WordNet lemma of the
        token. Tokens found in NLTK's English stop-word list are dropped
        before lemmatization.
    """
    lemmatizer, stemmer, stop_words = _stemming_tools()
    return [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in x.strip().split()
        if word not in stop_words
    ]
def remove_special_char(x):
    """Replace every ASCII punctuation character except the apostrophe with a space.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x`` with each character of ``string.punctuation`` (apostrophe
        excluded) replaced by one space. Other characters are left untouched.
    """
    punc = string.punctuation.replace("'", "")
    # Escape the characters before putting them in a character class. Unescaped,
    # the "\]" pair inside string.punctuation is read as an escaped "]", so
    # backslashes were never matched, and ",-." was interpreted as a range.
    pat = "[{}]".format(re.escape(punc))
    return re.sub(pat, " ", x)

In [None]:
# Report how many samples ended up in each split.
for caption, split in (("Train samples: ", x_train), ("Test samples: ", x_test)):
    print(caption, split.shape)

Train samples:  (4212,)
Test samples:  (1054,)


# Apply preprocessing

In [None]:
def _clean_tweet(raw_text):
    """Run one text through preprocess, remove_special_char and stemming, then re-join the tokens with spaces."""
    without_punctuation = remove_special_char(preprocess(raw_text))
    return " ".join(stemming(without_punctuation))


print("Before preprocessing:")
print(x_train[2])

# Overwrite every entry of both splits in place with its cleaned form
# (training split first, then the test split).
for split in (x_train, x_test):
    for idx in range(len(split)):
        split[idx] = _clean_tweet(split[idx])

print("After preprocessing:")
print(x_train[2])

Before preprocessing:
'Dhoni will always be my Captain' - 7th time so far Kohli has said these lines since Dhoni stepped down from Captaincy !!!!💗💗💗  @ChennaiIPL @imVkohli @msdhoni #IndiaWithDhoni #DhoniKeepTheGlove #DhoniKeepsTheGlove #ICCCricketWorldCup2019 https://t.co/u7Pfr9WBJA
After preprocessing:
'dhoni alway captain' th time far kohli said line sinc dhoni step captainci 💗💗💗 indiawithdhoni dhonikeeptheglov dhonikeepstheglov icccricketworldcup


# Applying TFIDF vectorizer

In [None]:
# TF-IDF features over unigrams and bigrams, using scikit-learn's built-in
# English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

# Fit on the training texts only and vectorize them in the same pass
# (fit followed by transform would tokenize the training corpus twice).
x_train = vectorizer.fit_transform(x_train)

# The test texts are projected with the vocabulary and IDF weights learned above.
x_test = vectorizer.transform(x_test)

# Using support vector classifier

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings, trained on the
# TF-IDF training matrix; fit() returns the fitted estimator itself.
clf = SVC().fit(x_train, y_train)

# Predicted labels for the held-out test matrix.
y_pred = clf.predict(x_test)

# Evaluation Metrics

In [None]:
# scikit-learn metric functions take (y_true, y_pred) in that order. With the
# arguments swapped the confusion matrix comes out transposed; passed this way,
# rows are the true labels and columns are the predicted labels.
# f1_score is called with its default pos_label=1, i.e. it scores label 1.
print("accuracy: ", sklearn.metrics.accuracy_score(y_test, y_pred))
print("f1 score: ", sklearn.metrics.f1_score(y_test, y_pred))
print("matrix:\n", sklearn.metrics.confusion_matrix(y_test, y_pred))

accuracy:  0.6802656546489564
f1 score:  0.7887147335423198
matrix:
 [[ 88  35]
 [302 629]]


# Visualizing predictions

In [None]:
# Show the first three raw (un-preprocessed) test texts together with the
# predicted and the true label; label 0 is printed as "hate", anything else as "not hate".
for i in range(3):
    print(x_test_2[i])
    print("Prediction: hate" if y_pred[i] == 0 else "Prediction: not hate")
    print("Actual: hate" if y_test[i] == 0 else "Actual: not hate")
    print()

'WE THE PEOPLE'                   👀  ⬇⬇⬇⬇⬇⬇⬇⬇   #HongKongProtest                   💪  ⬇⬇⬇⬇⬇⬇⬇⬇  #TheResistance  #ResistersForum  #PeoplesMarch                   ✊  ⬇⬇⬇⬇⬇⬇⬇⬇  #trumpCrimeFamily  #trumpIsACriminal  #trumpIsATraitor   #trumpObstructedJustice  #LiarInChief
Prediction: not hate
Actual: not hate

My b-day wish for @realDonaldTrump? May he choke on the most beautiful cake the world has ever seen!  #JohnMcCainDay   #ImpeachTrumpNow   #TrumpIsATraitor https://t.co/c8LnBYBAFq
Prediction: not hate
Actual: not hate

#Dickhead  Don't be one.   I give you mfs 1 minute of conversation followed by 4 minutes of laughter.  Foh &amp;  Good Day https://t.co/5isPM1bEhc
Prediction: not hate
Actual: hate

