In [1]:
# Utilities


import pandas as pd
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
# Load the cleaned tweet dataset from the working directory and preview the first rows.
# NOTE(review): the stored preview shows an "Unnamed: 0" column, which looks like a
# previously saved index — confirm, and consider read_csv(..., index_col=0) if so.
df = pd.read_csv("CleanDataset.csv")
df.head()


Unnamed: 0,Toxicity,tweet,tweet_clean,tweet_tokenized,tweet_tok=2,tweet_lemmatized,preprocessed_text
0,0,@user when a father is dysfunctional and is so...,father dysfunctional selfish drags kids dysfun...,"['father', 'dysfunctional', 'selfish', 'drags'...","[('father', 'dysfunctional'), ('dysfunctional'...","['father', 'dysfunctional', 'selfish', 'drag',...",father dysfunctional selfish drag kid dysfunct...
1,0,@user @user thanks for #lyft credit i can't us...,thanks lyft credit can not use cause do not ...,"['thanks', 'lyft', 'credit', 'can', 'not', 'us...","[('thanks', 'lyft'), ('lyft', 'credit'), ('cre...","['thank', 'lyft', 'credit', 'can', 'not', 'use...",thank lyft credit can not use cause do not off...
2,0,bihday your majesty,bihday majesty,"['bihday', 'majesty']","[('bihday', 'majesty')]","['bihday', 'majesty']",bihday majesty
3,0,factsguide: society now #motivation,factsguide society motivation,"['factsguide', 'society', 'motivation']","[('factsguide', 'society'), ('society', 'motiv...","['factsguide', 'society', 'motivation']",factsguide society motivation
4,0,[2/2] huge fan fare and big talking before the...,huge fan fare big talking leave chaos pay disp...,"['huge', 'fan', 'fare', 'big', 'talking', 'lea...","[('huge', 'fan'), ('fan', 'fare'), ('fare', 'b...","['huge', 'fan', 'fare', 'big', 'talk', 'leave'...",huge fan fare big talk leave chaos pay dispute...


In [3]:
# Features are the preprocessed tweet text; the target is the "Toxicity" label.
X, y = df["preprocessed_text"], df["Toxicity"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1000,
)

In [4]:

def model_evaluation(real_v, pred_v):
    """Print accuracy, the classification report and the confusion matrix.

    Parameters
    ----------
    real_v : array-like
        Ground-truth labels.
    pred_v : array-like
        Predicted labels, aligned element-wise with ``real_v``.

    Returns
    -------
    None
        The metrics are only printed, nothing is returned.
    """
    print(f"Accuracy score: {accuracy_score(real_v, pred_v)}")
    print("Classification report:")
    print(classification_report(real_v, pred_v))
    cm = confusion_matrix(real_v, pred_v)
    print(f"Confusion matrix \n {cm}")

In [5]:
# Bag-of-words counts over unigrams and bigrams; terms that appear in fewer than
# 5 training documents are dropped (min_df=5).
vect = CountVectorizer(min_df=5, ngram_range=(1, 2))

# fit_transform learns the vocabulary and builds the count matrix in one pass, so a
# separate vect.fit(X_train) beforehand would only repeat the same work.
X_train_tok = vect.fit_transform(X_train)

# The test split is only transformed with the vocabulary learned from the training split.
X_test_tok = vect.transform(X_test)

# LinearSVC

In [6]:
# chi2 feature selection -> TF-IDF weighting -> linear SVM.
bin_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=450)),  # feature selection: keep the 450 best chi2-scored features
    ('tfidf', TfidfTransformer()),  # weighting
    # Seeded so the fit is reproducible across runs: LinearSVC's dual solver
    # shuffles the data using random_state. Same seed as the train/test split.
    ('learner', LinearSVC(random_state=1000))  # learning algorithm
])

bin_pipeline.fit(X_train_tok, y_train)
bin_predictions = bin_pipeline.predict(X_test_tok)

In [7]:
model_evaluation(y_test,bin_predictions)

Accuracy sore: 0.9251261738963777
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      8713
           1       0.95      0.88      0.91      6940

    accuracy                           0.93     15653
   macro avg       0.93      0.92      0.92     15653
weighted avg       0.93      0.93      0.92     15653

Confusion matrix 
 [[8403  310]
 [ 862 6078]]


# NAIVE BAYES

In [8]:
# Multinomial Naive Bayes on the 1000 best chi2-scored features, TF-IDF weighted.
nb_bin_pipeline = Pipeline(
    steps=[
        ("sel", SelectKBest(score_func=chi2, k=1000)),
        ("tfidf", TfidfTransformer()),
        ("learner", MultinomialNB()),
    ]
)

# Pipeline.fit returns the fitted pipeline, so training and prediction can be chained.
bin_predictions = nb_bin_pipeline.fit(X_train_tok, y_train).predict(X_test_tok)

In [9]:
model_evaluation(y_test,bin_predictions)

Accuracy sore: 0.8954194084201111
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.90      8713
           1       0.87      0.90      0.88      6940

    accuracy                           0.90     15653
   macro avg       0.89      0.90      0.89     15653
weighted avg       0.90      0.90      0.90     15653

Confusion matrix 
 [[7786  927]
 [ 710 6230]]


# DECISION TREE

In [10]:
# chi2 feature selection -> TF-IDF weighting -> depth-limited decision tree.
dt_bin_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=450)),  # feature selection: keep the 450 best chi2-scored features
    ('tfidf', TfidfTransformer()),  # weighting
    # Seeded so the tree is reproducible: features are randomly permuted at each
    # split, so equally good splits can otherwise be resolved differently between
    # runs. Same seed as the train/test split.
    ('learner', DecisionTreeClassifier(
        criterion="gini",
        max_depth=25,
        min_samples_split=5,
        min_samples_leaf=5,
        random_state=1000,
    ))  # learning algorithm
])

dt_bin_pipeline.fit(X_train_tok, y_train)
bin_predictions = dt_bin_pipeline.predict(X_test_tok)

In [11]:
model_evaluation(y_test,bin_predictions)

Accuracy sore: 0.9171404842522201
Classification report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      8713
           1       0.95      0.86      0.90      6940

    accuracy                           0.92     15653
   macro avg       0.92      0.91      0.92     15653
weighted avg       0.92      0.92      0.92     15653

Confusion matrix 
 [[8417  296]
 [1001 5939]]


# KNN

In [12]:
# k-NN (5 neighbours, distance-weighted votes, Euclidean metric) on the
# 100 best chi2-scored features after TF-IDF weighting.
knn_pipeline = Pipeline(
    steps=[
        ("sel", SelectKBest(score_func=chi2, k=100)),
        ("tfidf", TfidfTransformer()),
        (
            "learner",
            KNeighborsClassifier(n_neighbors=5, weights="distance", metric="euclidean"),
        ),
    ]
)

# Pipeline.fit returns the fitted pipeline, so training and prediction can be chained.
bin_predictions = knn_pipeline.fit(X_train_tok, y_train).predict(X_test_tok)

In [13]:
model_evaluation(y_test,bin_predictions)

Accuracy sore: 0.8968887753146362
Classification report:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91      8713
           1       0.95      0.81      0.87      6940

    accuracy                           0.90     15653
   macro avg       0.91      0.89      0.89     15653
weighted avg       0.90      0.90      0.90     15653

Confusion matrix 
 [[8426  287]
 [1327 5613]]
