# OFFENSIVE LANGUAGE DETECTION

# Importing the libraries required

In [1]:
import re
import pandas as pd
import pickle

In [2]:
from pprint import pprint
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# Ignoring the warnings

In [3]:
# Silence FutureWarning and UserWarning messages that may be raised while the
# code runs.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# Importing the tab-separated dataset

In [4]:
# Load the tab-separated training file into a pandas DataFrame.
dataset = pd.read_csv("offenseval-training-v1.tsv", sep="\t")
# Placeholder tokens that remove_userTag() drops from every tweet. They are
# compared as exact, case-sensitive, whitespace-separated words.
mustBeRemovedList = ["@USER", "url"]

In [5]:
# Display the loaded DataFrame as the cell output.
dataset

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
...,...,...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,@USER And why report this garbage. We don't g...,OFF,TIN,OTH
13238,27429,@USER Pussy,OFF,UNT,


In [6]:
# Display the list of tokens that will be stripped from the tweets.
mustBeRemovedList

['@USER', 'url']

# Making functions for text processing

In [7]:
# Strip the placeholder tokens (user tags etc.) from every tweet.
def remove_userTag():
    """Return every tweet with the unwanted placeholder tokens removed.

    Reads the module-level ``dataset['tweet']`` column, splits each tweet on
    whitespace, drops every word that appears in the module-level
    ``mustBeRemovedList`` and joins the remaining words with single spaces.

    Returns:
        list[str]: one cleaned string per tweet, in dataset order.
    """
    # NOTE(review): the membership test is an exact, case-sensitive match on
    # whole words -- confirm the placeholders in the data are spelled exactly
    # as they are listed in mustBeRemovedList.
    return [
        " ".join(token for token in tweet.split() if token not in mustBeRemovedList)
        for tweet in dataset['tweet']
    ]

In [8]:
# English stop words from NLTK, stored in a set for membership tests.
noise_list = set(stopwords.words("english"))


# Stop-word removal
def remove_noise(input_text):
    """Return ``input_text`` without its English stop words.

    The text is tokenized with ``word_tokenize``; a token is kept when its
    lower-cased form is not in ``noise_list``. Kept tokens retain their
    original casing and are joined with single spaces.

    Args:
        input_text (str): text to filter.

    Returns:
        str: the remaining tokens joined by spaces.
    """
    kept_tokens = [
        token
        for token in word_tokenize(input_text)
        if token.lower() not in noise_list
    ]
    return " ".join(kept_tokens)

In [9]:
# Lemmatization of every token in a text
def lemetize_words(input_text):
    """Lemmatize each token of ``input_text`` and re-join the result.

    Tokens come from ``word_tokenize``; each one is passed to
    ``WordNetLemmatizer.lemmatize`` with the part-of-speech argument ``"v"``.

    Args:
        input_text (str): text to lemmatize.

    Returns:
        str: the lemmatized tokens joined by single spaces.
    """
    tokens = word_tokenize(input_text)
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token, "v") for token in tokens)

In [10]:
# Full text-cleaning pipeline for the tweets
def cleaning():
    """Build the cleaned corpus from the tweets returned by ``remove_userTag``.

    For every tweet, in order:
      1. replace each character that is not an ASCII letter with a space,
      2. lower-case the text,
      3. remove stop words (``remove_noise``),
      4. lemmatize the remaining words (``lemetize_words``).

    Returns:
        list[str]: one cleaned string per tweet.
    """
    def _clean(tweet):
        # Keep ASCII letters only, then normalise the case.
        letters_only = re.sub('[^a-zA-Z]', ' ', tweet).lower()
        return lemetize_words(remove_noise(letters_only))

    return [_clean(tweet) for tweet in remove_userTag()]

In [11]:
# Bag-of-words features and binary labels
def bagOfWordsCreation(corpus):
    """Vectorize ``corpus`` and derive the labels from ``dataset['subtask_a']``.

    Args:
        corpus: iterable of cleaned tweet strings.

    Returns:
        tuple: ``(bagOfWords, rowsValues)`` where ``bagOfWords`` is the dense
        array produced by a ``CountVectorizer`` limited to 12000 features, and
        ``rowsValues`` is a list holding 1 where ``subtask_a`` equals "OFF"
        and 0 otherwise.
    """
    # NOTE(review): the fitted vectorizer is local and is not returned, so it
    # cannot be reused on new text from outside this function.
    vectorizer = CountVectorizer(max_features=12000)
    features = vectorizer.fit_transform(corpus).toarray()
    labels = [1 if tag == "OFF" else 0 for tag in dataset['subtask_a']]
    return (features, labels)

# Creating the classifier

In [12]:
# Fit a classifier and report its accuracy on both splits
def classifiers(classifier):
    """Fit ``classifier`` on the training split and score it on both splits.

    Uses the module-level ``bagOfWords_train`` / ``rowsValues_train`` and
    ``bagOfWords_test`` / ``rowsValues_test``. A confusion matrix is printed
    for each split. Accuracy is computed as
    ``(cm[0][0] + cm[1][1]) / number_of_samples``, so a 2x2 matrix is assumed.

    Args:
        classifier: estimator exposing ``fit`` and ``predict``.

    Returns:
        tuple: ``(accuracyTrain, accuracyTest, fitted)`` where ``fitted`` is
        the value returned by ``classifier.fit``.
    """
    fitted = classifier.fit(bagOfWords_train, rowsValues_train)

    # Score on the training split.
    train_predictions = classifier.predict(bagOfWords_train)
    train_cm = confusion_matrix(rowsValues_train, train_predictions)
    print('confusuion matrix train before tunning\n', train_cm)
    train_accuracy = (train_cm[0][0] + train_cm[1][1]) / len(rowsValues_train)

    # Score on the held-out test split.
    test_predictions = classifier.predict(bagOfWords_test)
    test_cm = confusion_matrix(rowsValues_test, test_predictions)
    print('confusuion matrix test before tunning\n', test_cm)
    test_accuracy = (test_cm[0][0] + test_cm[1][1]) / len(rowsValues_test)

    return train_accuracy, test_accuracy, fitted

In [13]:
# Save a classifier as a pickle file
def save_classifier(classifier_name, classifier_s):
    """Pickle ``classifier_s`` to the file ``<classifier_name>.pickle``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.

    Args:
        classifier_name (str): path/name of the target file without the
            ``.pickle`` extension.
        classifier_s: the object to serialize.

    Returns:
        None
    """
    # The context manager closes the file even if pickle.dump raises.
    with open(classifier_name + ".pickle", "wb") as pickle_file:
        pickle.dump(classifier_s, pickle_file)

In [14]:
# Load the saved (untuned) classifier and score it on both splits
def use_saved_classifierBeforeTunning(classifier_name):
    """Load ``<classifier_name>.pickle`` and report train/test accuracy.

    Uses the module-level ``bagOfWords_train`` / ``rowsValues_train`` and
    ``bagOfWords_test`` / ``rowsValues_test``. A confusion matrix is printed
    for each split. Accuracy is computed as
    ``(cm[0][0] + cm[1][1]) / number_of_samples``, so a 2x2 matrix is assumed.

    Args:
        classifier_name (str): path/name of the pickle file without the
            ``.pickle`` extension.

    Returns:
        tuple: ``(accuracyTrain, accuracyTest)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(classifier_name + ".pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)

    # Predict the training set results.
    rowsValues_pred = classifier.predict(bagOfWords_train)
    cm = confusion_matrix(rowsValues_train, rowsValues_pred)
    print('confusuion matrix train before tunning\n', cm)
    accuracyTrain = (cm[0][0] + cm[1][1]) / len(rowsValues_train)

    # Predict the test set results.
    rowsValues_pred = classifier.predict(bagOfWords_test)
    cm = confusion_matrix(rowsValues_test, rowsValues_pred)
    print('confusuion matrix test before tunning\n', cm)
    accuracyTest = (cm[0][0] + cm[1][1]) / len(rowsValues_test)

    return accuracyTrain, accuracyTest

In [15]:
# Load the saved tuned classifier and score it on both splits
def use_saved_classifierAfterTunning(classifier_name):
    """Load ``<classifier_name>.pickle`` and report train/test accuracy.

    Uses the module-level ``bagOfWords_train`` / ``rowsValues_train`` and
    ``bagOfWords_test`` / ``rowsValues_test``. A confusion matrix is printed
    for each split. Accuracy is computed as
    ``(cm[0][0] + cm[1][1]) / number_of_samples``, so a 2x2 matrix is assumed.

    Args:
        classifier_name (str): path/name of the pickle file without the
            ``.pickle`` extension.

    Returns:
        tuple: ``(accuracyTrain, accuracyTest)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(classifier_name + ".pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)

    # Predict the training set results.
    rowsValues_pred = classifier.predict(bagOfWords_train)
    cm = confusion_matrix(rowsValues_train, rowsValues_pred)
    print('confusuion matrix train after tunning\n', cm)
    accuracyTrain = (cm[0][0] + cm[1][1]) / len(rowsValues_train)

    # Predict the test set results.
    rowsValues_pred = classifier.predict(bagOfWords_test)
    cm = confusion_matrix(rowsValues_test, rowsValues_pred)
    print('confusuion matrix test after tunning\n', cm)
    accuracyTest = (cm[0][0] + cm[1][1]) / len(rowsValues_test)

    return accuracyTrain, accuracyTest

In [16]:
# Create the random-search parameter grid
def create_parameter_grid_LogisticRegression():
    """Build, print and return the hyper-parameter grid for LogisticRegression.

    The grid has a single key, ``'C'`` (inverse of regularization strength;
    smaller values mean stronger regularization), holding evenly spaced
    Python floats from 0.1 to 5.0 inclusive.

    Returns:
        dict: ``{'C': [float, ...]}``.
    """
    # Local import: the ``pd.np`` alias used previously was deprecated in
    # pandas 1.0 and removed in pandas 2.0, so NumPy is imported directly.
    import numpy as np

    # linspace uses its default sample count (50) and includes both endpoints;
    # tolist() converts the NumPy scalars to plain Python floats.
    C = np.linspace(start=0.1, stop=5.0).tolist()

    random_grid = {'C': C}
    print('random_grid')
    pprint(random_grid)
    return random_grid

In [17]:
def random_search_training(randomGrid,classifier):
    """Tune ``classifier`` with a randomized search and return the best model.

    Runs ``RandomizedSearchCV`` over ``randomGrid`` with 30 sampled
    candidates (``n_iter=30``), 3-fold cross-validation, a fixed
    ``random_state`` of 42, ``n_jobs=-1`` and ``refit=True``. The search is
    fitted on the module-level ``bagOfWords_train`` / ``rowsValues_train``,
    and the best parameters found are printed.

    Args:
        randomGrid (dict): parameter distributions to sample from.
        classifier: the base estimator to tune.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    search = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=randomGrid,
        n_iter=30,
        cv=3,
        verbose=0,
        random_state=42,
        n_jobs=-1,
        refit=True,
    )
    search.fit(bagOfWords_train, rowsValues_train)

    print('rf_random.best_params')
    print(search.best_params_)
    return search.best_estimator_

In [18]:
# Score a fitted model on the test split
def evaluate(classifier):
    """Print the test-set confusion matrix and return the test accuracy.

    Uses the module-level ``bagOfWords_test`` / ``rowsValues_test``. Accuracy
    is ``(cm[0][0] + cm[1][1]) / len(rowsValues_test)``, so a 2x2 confusion
    matrix is assumed.

    Args:
        classifier: fitted estimator exposing ``predict``.

    Returns:
        float: accuracy on the test split.
    """
    predictions = classifier.predict(bagOfWords_test)
    matrix = confusion_matrix(rowsValues_test, predictions)
    print('confusuion matrix test\n', matrix)
    correct = matrix[0][0] + matrix[1][1]
    return correct / len(rowsValues_test)

In [19]:
# Train, tune and persist the logistic-regression classifiers
def saveLogisticRegClassiferBeforeAndAfterTunning():
    """Fit and save both the default and the tuned LogisticRegression models.

    1. Fits a default ``LogisticRegression`` through ``classifiers`` (which
       prints the confusion matrices) and saves it as
       ``logisticRegression.pickle``. The accuracies it returns are not used.
    2. Builds the parameter grid, runs ``random_search_training`` on a fresh
       ``LogisticRegression`` and saves the best estimator as
       ``LogisticRegressionTuned.pickle``.

    Returns:
        None
    """
    # Baseline model; only the fitted estimator is needed here.
    _, _, baseline_model = classifiers(LogisticRegression())
    save_classifier("logisticRegression", baseline_model)

    # Tuned model.
    tuned_model = random_search_training(
        create_parameter_grid_LogisticRegression(),
        LogisticRegression(),
    )
    save_classifier("LogisticRegressionTuned", tuned_model)

# Building corpus and evaluating model

In [20]:
# Clean the tweets, then turn them into bag-of-words features and 0/1 labels.
corpus = cleaning()
bagOfWords, rowsValues = bagOfWordsCreation(corpus)
# Hold out 20% of the rows as the test split; random_state=0 fixes the shuffle.
(bagOfWords_train, bagOfWords_test,
 rowsValues_train, rowsValues_test) = train_test_split(
    bagOfWords, rowsValues, test_size=0.2, random_state=0)

In [21]:
# Logistic Regression classifier
print('LogisticRegression')

# Training and tuning are disabled here: the following cells load the
# "logisticRegression" and "LogisticRegressionTuned" pickle files instead.
# Uncomment the call to retrain and overwrite those files.
#saveLogisticRegClassiferBeforeAndAfterTunning()

LogisticRegression


In [22]:
# Load the saved untuned classifier and compute its accuracy on the training
# and test sets.
accuracyTrain2, accuracyTest2 = use_saved_classifierBeforeTunning("logisticRegression")

confusuion matrix train before tunning
 [[6852  182]
 [ 835 2723]]
confusuion matrix test before tunning
 [[1615  191]
 [ 419  423]]


In [23]:
# Load the saved tuned classifier and compute its accuracy on the training
# and test sets.
accuracyTrainTuned2, accuracyTestTuned2 = use_saved_classifierAfterTunning("LogisticRegressionTuned")

confusuion matrix train after tunning
 [[6824  210]
 [1127 2431]]
confusuion matrix test after tunning
 [[1636  170]
 [ 446  396]]


In [24]:
# Report the train/test accuracies of the base and the tuned classifiers that
# were loaded from their pickle files.
print('accuracy for train set after saving base classifier = ', accuracyTrain2)
print('accuracy for test set after saving base classifier = ', accuracyTest2)
print('accuracy for train set after saving tuned classifier = ', accuracyTrainTuned2)
print('accuracy for test set after saving tuned classifier  = ', accuracyTestTuned2)


accuracy for train set after saving base classifier =  0.9039841389728097
accuracy for test set after saving base classifier =  0.7696374622356495
accuracy for train set after saving tuned classifier =  0.8737726586102719
accuracy for test set after saving tuned classifier  =  0.7673716012084593
