In [1]:
# Utilities


import pandas as pd

from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold, RandomizedSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer


from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_selection import SelectKBest, chi2

In [2]:
# Load the cleaned tweet dataset from the working directory and preview its first rows.
# The preview shows two leftover index columns ("Unnamed: 0.1", "Unnamed: 0"); the cells
# visible in this notebook only read "preprocessed_text" (features) and "Toxicity" (label).
df = pd.read_csv("CleanDataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet,tweet_clean,tweet_tokenized,tweet_tok=2,tweet_lemmatized,preprocessed_text
0,0,0,@user when a father is dysfunctional and is so...,father dysfunctional selfish drags kids dysfun...,"['father', 'dysfunctional', 'selfish', 'drags'...","[('father', 'dysfunctional'), ('dysfunctional'...","['father', 'dysfunctional', 'selfish', 'drag',...",father dysfunctional selfish drag kid dysfunct...
1,1,0,@user @user thanks for #lyft credit i can't us...,thanks lyft credit can not use cause do not ...,"['thanks', 'lyft', 'credit', 'can', 'not', 'us...","[('thanks', 'lyft'), ('lyft', 'credit'), ('cre...","['thank', 'lyft', 'credit', 'can', 'not', 'use...",thank lyft credit can not use cause do not off...
2,2,0,bihday your majesty,bihday majesty,"['bihday', 'majesty']","[('bihday', 'majesty')]","['bihday', 'majesty']",bihday majesty
3,4,0,factsguide: society now #motivation,factsguide society motivation,"['factsguide', 'society', 'motivation']","[('factsguide', 'society'), ('society', 'motiv...","['factsguide', 'society', 'motivation']",factsguide society motivation
4,5,0,[2/2] huge fan fare and big talking before the...,huge fan fare big talking leave chaos pay disp...,"['huge', 'fan', 'fare', 'big', 'talking', 'lea...","[('huge', 'fan'), ('fan', 'fare'), ('fare', 'b...","['huge', 'fan', 'fare', 'big', 'talk', 'leave'...",huge fan fare big talk leave chaos pay dispute...


In [3]:
# Features are the preprocessed tweet text; the target is the binary "Toxicity" label.
X, y = df["preprocessed_text"], df["Toxicity"]

# Hold out 30% of the rows for the final evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1000,
)

In [4]:
# Convert the text documents to a matrix of token counts over unigrams and bigrams
# (min_df=5, ngram_range=(1, 2)).
vect = CountVectorizer(min_df=5, ngram_range=(1, 2))

# fit_transform learns the vocabulary and encodes the training texts in a single pass;
# a separate vect.fit(X_train) beforehand would only repeat the same work.
X_train_vec = vect.fit_transform(X_train)

# The test texts are encoded with the vocabulary learned from the training split only.
X_test_vec = vect.transform(X_test)

In [5]:

def model_evaluation(real_v, pred_v):
    """Print evaluation metrics for a set of predictions.

    Prints, in order: the accuracy score, the classification report and the
    confusion matrix computed from the true labels and the predicted labels.

    Parameters
    ----------
    real_v : true labels.
    pred_v : predicted labels, aligned with ``real_v``.

    Returns
    -------
    None. The metrics are only written to standard output.
    """
    accuracy = accuracy_score(real_v, pred_v)
    print("Accuracy sore: {}".format(accuracy))

    print("Classification report:")
    report = classification_report(real_v, pred_v)
    print(report)

    matrix = confusion_matrix(real_v, pred_v)
    print("Confusion matrix \n {}".format(matrix))

In [6]:
def customRandomSearch(X_train, y_train, model, tuned_parameters, n_iter=100, random_state=1000):
    """Run a randomized hyper-parameter search scored with macro-averaged F1.

    The search uses 5-fold cross-validation (``KFold(n_splits=5)``), runs on all
    available cores (``n_jobs=-1``) and raises on any failing fit
    (``error_score='raise'``). For every sampled candidate the mean and spread
    of the *training* F1 score are printed.

    Parameters
    ----------
    X_train : training feature matrix passed to ``fit``.
    y_train : training labels passed to ``fit``.
    model : estimator (or pipeline) to tune.
    tuned_parameters : parameter distributions / lists to sample from.
    n_iter : number of candidates to sample (default 100, the previous fixed value).
    random_state : seed for the candidate sampling. A fixed default makes
        repeated runs of the notebook pick the same candidates; pass ``None``
        to get unseeded sampling.

    Returns
    -------
    dict with a single key ``'f1_macro'`` mapping to the best parameter set
    (``best_params_`` of the fitted search).
    """
    print("____________________________________________ START GRID SEARCH ____________________________________________")

    print("------- Score = F1_MACRO ------- \n")

    k_fold = KFold(n_splits=5)
    print("> Fold = " + str(k_fold) + "\n")

    clf = RandomizedSearchCV(
        model,
        tuned_parameters,
        error_score='raise',
        cv=k_fold,
        scoring='f1_macro',
        return_train_score=True,
        n_jobs=-1,
        n_iter=n_iter,
        # Without a seed the sampled candidates (and so the reported best
        # parameters) change from one run to the next.
        random_state=random_state,
    )
    clf.fit(X_train, y_train)

    print("> Best Parameter set: \n")
    best = clf.best_params_
    print(best)

    print("\n> Grid scores:\n")

    # NOTE: these are training-fold scores; the best parameter set above is
    # chosen by the search on the held-out folds, so the two need not agree.
    means = clf.cv_results_['mean_train_score']
    stds = clf.cv_results_['std_train_score']

    print("...........RESULTS FOR TRAINING.........")
    print("........................................")

    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    print("____________________________________________ END GRID SEARCH ____________________________________________")

    return {'f1_macro': best}

In [7]:
def customGridSearch(X_train, y_train, model, tuned_parameters):
    """Run an exhaustive grid search scored with macro-averaged F1.

    The search uses 5-fold cross-validation (``KFold(n_splits=5)``), runs on all
    available cores (``n_jobs=-1``) and raises on any failing fit
    (``error_score='raise'``). For every parameter combination the mean and
    spread of the *training* F1 score are printed.

    Parameters
    ----------
    X_train : training feature matrix passed to ``fit``.
    y_train : training labels passed to ``fit``.
    model : estimator (or pipeline) to tune.
    tuned_parameters : parameter grid to evaluate exhaustively.

    Returns
    -------
    dict with a single key ``'f1_macro'`` mapping to the best parameter set
    (``best_params_`` of the fitted search).
    """
    print("____________________________________________ START GRID SEARCH ____________________________________________")

    print("------- Score = F1_MACRO ------- \n")

    k_fold = KFold(n_splits=5)
    print("> Fold = " + str(k_fold) + "\n")

    # The keyword is ``n_jobs``; a misspelt name makes the constructor raise
    # TypeError before any fitting happens.
    clf = GridSearchCV(
        model,
        tuned_parameters,
        error_score='raise',
        cv=k_fold,
        scoring='f1_macro',
        n_jobs=-1,
        return_train_score=True,
    )
    clf.fit(X_train, y_train)

    print("> Best Parameter set: \n")
    best = clf.best_params_
    print(best)

    print("\n> Grid scores:\n")

    # NOTE: these are training-fold scores; the best parameter set above is
    # chosen by the search on the held-out folds, so the two need not agree.
    means = clf.cv_results_['mean_train_score']
    stds = clf.cv_results_['std_train_score']

    print("...........RESULTS FOR TRAINING.........")
    print("........................................")

    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    print("____________________________________________ END GRID SEARCH ____________________________________________")

    return {'f1_macro': best}


# KNN

In [16]:
# Search space for the KNN pipeline: number of chi2-selected features,
# odd neighbour counts from 5 to 27, and the distance metric.
check_params = {
    "selbestk__k": [200, 300, 500, 800, 1000],
    "knn__n_neighbors": [*range(5, 28, 2)],
    # 'knn__weights': ['uniform', 'distance'],
    "knn__metric": ["euclidean", "manhattan"],
}

# chi2 feature selection -> TF-IDF re-weighting -> k-nearest-neighbours classifier.
clf = Pipeline(
    steps=[
        ("selbestk", SelectKBest(score_func=chi2)),
        ("tfidf", TfidfTransformer()),
        ("knn", KNeighborsClassifier()),
    ]
)

results = customRandomSearch(X_train_vec, y_train, clf, check_params)

____________________________________________ START GRID SEARCH ____________________________________________
------- Score = F1_MACRO ------- 

> Fold = KFold(n_splits=5, random_state=None, shuffle=False)

 Start combinations
 Finish combinations
> Best Parameter set: 

{'selbestk__k': 200, 'knn__n_neighbors': 7, 'knn__metric': 'manhattan'}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.892 (+/-0.004) for {'selbestk__k': 200, 'knn__n_neighbors': 27, 'knn__metric': 'manhattan'}
0.820 (+/-0.006) for {'selbestk__k': 1000, 'knn__n_neighbors': 21, 'knn__metric': 'euclidean'}
0.856 (+/-0.006) for {'selbestk__k': 800, 'knn__n_neighbors': 17, 'knn__metric': 'euclidean'}
0.909 (+/-0.005) for {'selbestk__k': 300, 'knn__n_neighbors': 5, 'knn__metric': 'euclidean'}
0.907 (+/-0.007) for {'selbestk__k': 500, 'knn__n_neighbors': 5, 'knn__metric': 'manhattan'}
0.896 (+/-0.003) for {'selbestk__k': 300, 'knn__n_neighbors': 15, 'knn__metric': 'manhatta

In [9]:
# Keep the 200 features with the highest chi2 score, fitted on the training split only.
sel = SelectKBest(chi2, k=200)
X_train_sel = sel.fit_transform(X_train_vec, y_train)
X_test_sel = sel.transform(X_test_vec)

# Re-weight the selected counts with TF-IDF statistics learned from the training split.
tfidf = TfidfTransformer()
X_train_vec_bestK = tfidf.fit_transform(X_train_sel)
X_test_vec_bestK = tfidf.transform(X_test_sel)

# KNN with the parameter set reported as best by the search (k=7, manhattan distance).
learner = KNeighborsClassifier(n_neighbors=7, metric='manhattan')
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [10]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.8964415766945634
Classification report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      8713
           1       0.93      0.82      0.88      6940

    accuracy                           0.90     15653
   macro avg       0.90      0.89      0.89     15653
weighted avg       0.90      0.90      0.90     15653

Confusion matrix 
 [[8308  405]
 [1216 5724]]
