# Import Basic Libraries

In [1]:
import sklearn
import numpy as np
import pandas as pd

# Import Data

In [2]:
# training data: read train.csv, skipping its first row (presumably the header)
# and naming the three columns explicitly
cols = ["id", "label", "tweet"]
train = pd.read_csv(
    "train.csv",
    names=cols,
    skiprows=1,
)


In [3]:
# non-hate tweets (label 0): count the rows where the comparison is True
int((train["label"] == 0).sum())

29720

In [4]:
# hate tweets (label 1): count the rows where the comparison is True
int((train["label"] == 1).sum())

2242

In [5]:
# check if there are any missing values: print the number of nulls per column
print(train.isna().sum())

id       0
label    0
tweet    0
dtype: int64


# Data cleaning

In [6]:
# install tweet-preprocessor (imported below as `preprocessor`) to clean tweets
# %pip install tweet-preprocessor

In [7]:
# remove special characters using the regular expression library
import re

# Punctuation that is deleted outright (substituted with the empty string).
# The pattern is a raw string so the backslashes reach the regex engine as
# written; non-raw escapes such as "\;" are invalid string escapes and raise a
# SyntaxWarning on recent Python versions.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"|()\[\]%$><{}]")

# Separators that are substituted with a single space: a doubled HTML line
# break, a hyphen, a slash or a colon.
#  - the <br> alternative accepts any amount of whitespace before "/" and
#    consumes the closing ">" so no stray ">" is left behind;
#  - the colon alternative matches the colon only, so the character that
#    follows it is kept ("time:12" -> "time 12").
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/?>)|(-)|(/)|(:)")

In [8]:
import preprocessor as p

In [9]:
# custom function to clean the dataset (combining tweet-preprocessor and regular expressions)
def tweetCleaner(df):
  """Return a list holding one cleaned string per tweet in ``df``.

  ``df`` is iterated directly, so it can be any iterable of tweet strings
  (the notebook passes the ``tweet`` column). Each tweet goes through three
  steps, in this order:

  1. ``p.clean`` from the tweet-preprocessor package;
  2. lower-casing, then deleting every ``REPLACE_NO_SPACE`` match;
  3. replacing every ``REPLACE_WITH_SPACE`` match with a single space.
  """
  def _scrub(raw_tweet):
    lowered = p.clean(raw_tweet).lower()
    without_punctuation = REPLACE_NO_SPACE.sub("", lowered)
    return REPLACE_WITH_SPACE.sub(" ", without_punctuation)

  return [_scrub(raw_tweet) for raw_tweet in df]

In [10]:
# clean training data and wrap the cleaned strings in a single-column DataFrame
train_tweet = pd.DataFrame(tweetCleaner(train["tweet"]))

In [11]:
# append cleaned tweets to the training data; train_tweet is a one-column
# DataFrame, so select that column explicitly before assigning it
train["clean_tweet"] = train_tweet.iloc[:, 0]

# Test and Train split


In [12]:
from sklearn.model_selection import train_test_split

# labels: used both as the prediction target and as the stratification key
y = train["label"].values

# use 70% for the training and 30% for the test; stratifying on y keeps the
# label ratio the same in both parts, and random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    train["clean_tweet"].values,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Vectorize tweets using CountVectorizer

CountVectorizer Example

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# three toy documents used to show what CountVectorizer produces
documents = ["NLP USING AI",
             "HATRE DETECTION PROJECT AT FAST USING AI",
             "FAST NUCES WORKING ON AI PROJECT"]

# initializing the countvectorizer
vectorizer = CountVectorizer()

# tokenize and make the document into a matrix
document_term_matrix = vectorizer.fit_transform(documents)

# check the result: one row per document, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and yields the terms in column order.
pd.DataFrame(document_term_matrix.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,ai,at,detection,fast,hatre,nlp,nuces,on,project,using,working
0,1,0,0,0,0,1,0,0,0,1,0
1,1,1,1,1,1,0,0,0,1,1,0
2,1,0,0,1,0,0,1,1,1,0,1


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorize tweets for model building: binary=True records presence/absence of
# a token instead of its count, and English stop words are dropped
vectorizer = CountVectorizer(binary=True, stop_words='english')

# learn the vocabulary from the training tweets only, so the held-out test
# tweets take no part in feature extraction, and build the training
# document-term matrix in the same pass
x_train_vec = vectorizer.fit_transform(x_train)

# transform the test tweets with the training vocabulary; tokens that never
# occurred in the training tweets are ignored
x_test_vec = vectorizer.transform(x_test)
print("done")

done


# Model building

Apply Support Vector Classifier (SVC)

In [16]:
# import the estimator class directly so the name `svm` is only ever bound to
# the classifier and never rebinds the sklearn.svm module
from sklearn.svm import SVC

# classify using support vector classifier; probability=True enables
# predict_proba, and random_state pins the shuffling used for the probability
# estimates so `prob` is reproducible between runs
svm = SVC(kernel='linear', probability=True, random_state=1)

# fit the SVC model based on the given training data
svm.fit(x_train_vec, y_train)

# class-membership probabilities for the samples in x_test
prob = svm.predict_proba(x_test_vec)

# perform classification and prediction on samples in x_test
y_pred_svm = svm.predict(x_test_vec)
print("done")

done


# Accuracy score for SVC


In [17]:
from sklearn.metrics import accuracy_score

# share of test tweets whose predicted label equals the true label, in percent
# NOTE(review): the label counts printed earlier are 29720 (label 0) vs 2242
# (label 1), so always predicting label 0 would already be right for roughly
# 93% of tweets; consider also reporting precision/recall/F1 for the hate class.
svc_accuracy_pct = accuracy_score(y_test, y_pred_svm) * 100
print("Accuracy score for SVC is: ", svc_accuracy_pct, '%')

Accuracy score for SVC is:  94.86912086766085 %
