In [142]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings('ignore')

In [143]:
model_result = pd.DataFrame()

In [144]:
def change_directory(path):
    print("Current Working Directory ", os.getcwd())
    os.chdir(path)
    print("Changed Working Directory ", os.getcwd())

In [145]:
def read_data(file):
    data = pd.read_csv(file)
    return data

In [146]:
def tfidf_vector(x_train,x_test):
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train=tfidf_vectorizer.fit_transform(x_train.values.astype('U')) 
    tfidf_test=tfidf_vectorizer.transform(x_test.values.astype('U'))
    tfidf_val=tfidf_vectorizer.transform(x_val.values.astype('U'))
    return tfidf_vectorizer, tfidf_train, tfidf_test, tfidf_val

In [147]:
def objective_lg(params):
    clf = LogisticRegression(**params)
    score = cross_val_score(clf, tfidf_train, y_train, cv=5, scoring='accuracy').mean()
    return -score  # Negative because Hyperopt minimizes the objective function    

In [148]:
def logistics_regression_model(tfidf_train, y_train):
    print('Building Logistics Regression Model...')
    space_lg = {
    'C': hp.loguniform('C', -5, 2),  # Regularization parameter
    'penalty': hp.choice('penalty', ['l1', 'l2']),  # Regularization type
    }
    print('Performing Hyperparametre Tunning..')
    best = fmin(fn=objective_lg, space=space_lg, algo=tpe.suggest, max_evals=10)
    best_params = {
    'C': best['C'],
    'penalty': ['l1', 'l2'][best['penalty']]
    }
    print('The best parametres are.. ', best_params)
    lr = LogisticRegression(**best_params)
    lr.fit(tfidf_train, y_train)
    return lr

In [149]:
def objective_pac(params):
    classifier = PassiveAggressiveClassifier(C=params['C'], max_iter=params['max_iter'], random_state=42)
    classifier.fit(tfidf_train, y_train)
    y_pred = classifier.predict(tfidf_test)
    accuracy = -accuracy_score(y_test, y_pred)  # Negative because Hyperopt minimizes
    return accuracy

In [150]:
def passive_agressive_classifier(tfidf_train, y_train):
    space_pac = {
    'C': hp.loguniform('C', np.log(0.01), np.log(10)),  # Regularization parameter
    'max_iter': hp.choice('max_iter', [50, 100, 200]),   # Maximum number of iterations
    }
    best = fmin(fn=objective_pac, space=space_pac, algo=tpe.suggest, max_evals=50)
    pac = PassiveAggressiveClassifier(max_iter=10, C=best['C'] ,random_state=42)
    pac.fit(tfidf_train, y_train)
    return pac

In [151]:
def objective_nb(params):
    classifier = MultinomialNB(alpha=params['alpha'])
    classifier.fit(tfidf_train, y_train)
    y_pred = classifier.predict(tfidf_test)
    accuracy = accuracy_score(y_test, y_pred)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [152]:
def naive_bayes_classifier(tfidf_train, y_train):
    space_nb = {
    'alpha': hp.uniform('alpha', 0.1, 2.0)  # alpha range between 0.1 and 2.0
    }
    trials = Trials()
    best = fmin(fn=objective_nb, space=space_nb, algo=tpe.suggest, max_evals=50, trials=trials)
    clf = MultinomialNB(alpha = best['alpha'])
    clf.fit(tfidf_train, y_train)
    return clf

In [153]:
def cross_validation_function(model,tfidf_train,y_train):
    # Perform cross-validation
    scores = cross_val_score(model, tfidf_train, y_train, cv=5, scoring='accuracy')
    # Print average accuracy and standard deviation
    print("Cross-Validation Scores:", scores)
    print("Average Accuracy:", np.mean(scores))
    print("Standard Deviation:", np.std(scores))

In [154]:
def prediction_function(model, tfidf_train, tfidf_test, tfidf_val):
    ypred_train = model.predict(tfidf_train)
    ypred_test = model.predict(tfidf_test)
    ypred_val = model.predict(tfidf_val)
    return ypred_train, ypred_test, ypred_val

In [155]:
def performance_metric(actual,predicted):
    accuracy = accuracy_score(actual,predicted)
    f1 = f1_score(actual,predicted,average = "weighted")
    precision = precision_score(actual,predicted,average = "weighted")
    recall = recall_score(actual,predicted,average = "weighted")
    confusion = confusion_matrix(actual,predicted)
    return [accuracy, precision, recall, f1], confusion

In [156]:
def result_function(model_name, model, ypred_train, ypred_test, ypred_val, y_train, y_test, y_val):
    result = pd.DataFrame(columns = ['Measure','Model','Training','Testing','Validation'])
    result['Measure'] = ['Accuracy','F1 Score','Precision','Recall']
    result['Model'] = model_name
    train_result, confusion_tr = performance_metric(y_train, ypred_train)
    test_result, confusion_te = performance_metric(y_test, ypred_test)
    val_result, confusion_val = performance_metric(y_val, ypred_val)
    result['Training'] = train_result
    result['Testing'] = test_result
    result['Validation'] = val_result
    Pkl_filename = model_name + ".pkl"  
    pickle.dump(model, open(Pkl_filename, 'wb'))
    return result

In [157]:
path = "C:\\Users\\ompra\\OneDrive\\Documents\\Machine Learning Projects\\Automated Decision Support System for Cyberbullying Detection - Version 2.0\\Data\\processed_data"
change_directory(path)

Current Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Models
Changed Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Data\processed_data


In [158]:
file = 'train.csv'
train = read_data(file)
train.head()

Unnamed: 0,text,cyberbullying_type
0,ajittyagi sanjayazadsln atleast million muslim...,3
1,mkr,1
2,troonytoons funny sound like bitch,1
3,tired change name swear like copyrighted weath...,0
4,wow bird feather flock together roger stone ca...,2


In [159]:
file = 'test.csv'
test = read_data(file)
test.head()

Unnamed: 0,text,cyberbullying_type
0,tgccartoonist bitch as skinny boi,1
1,crushing disaster might tided kv Tui Te Shua F...,0
2,fear wont stop talking photoshoot,0
3,rt vickisecret nsw promo girl think way highly...,1
4,vampire hmmi wonder hows explained nba fan cal...,2


In [160]:
file = 'validation.csv'
valid = read_data(file)
valid.head()

Unnamed: 0,text,cyberbullying_type
0,way year since mac passed already that 's wild,0
1,we 're please retire maleprison rape joke beco...,1
2,speakermccarthy repeal tax cut bitch created h...,1
3,fuck dumb as cracker rt tayyoung fuck obama du...,2
4,traethompson oliviacornett say guy prolly life...,2


In [161]:
x_val = valid['text']
y_val = valid['cyberbullying_type']

In [162]:
x_train = train['text']
y_train = train['cyberbullying_type']

In [163]:
x_test = test['text']
y_test = test['cyberbullying_type']

In [164]:
tfidf_vectorizer, tfidf_train, tfidf_test, tfidf_val = tfidf_vector(x_train,x_test)

In [165]:
path = "C:\\Users\\ompra\\OneDrive\\Documents\\Machine Learning Projects\\Automated Decision Support System for Cyberbullying Detection - Version 2.0\\Models"
change_directory(path)

Current Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Data\processed_data
Changed Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Models


In [166]:
Pkl_filename = "tfidf.pkl"  
pickle.dump(tfidf_vectorizer, open(Pkl_filename, 'wb'))

In [167]:
model = logistics_regression_model(tfidf_train, y_train)

Building Logistics Regression Model...
Performing Hyperparametre Tunning..
100%|███████████████████████████████████████████████| 10/10 [07:02<00:00, 42.26s/trial, best loss: -0.9572070553870352]
The best parametres are..  {'C': 0.39337595215889637, 'penalty': 'l2'}


In [168]:
cross_validation_function(model,tfidf_train,y_train)

Cross-Validation Scores: [0.9570273  0.95663409 0.95803842 0.95579148 0.95854398]
Average Accuracy: 0.9572070553870352
Standard Deviation: 0.0009841744731187555


In [169]:
ypred_train, ypred_test, ypred_val = prediction_function(model, tfidf_train, tfidf_test, tfidf_val)

In [170]:
result = result_function('Logistic Regression', model, ypred_train, ypred_test, ypred_val, y_train, y_test, y_val)
model_result = model_result.append(result)
model_result

Unnamed: 0,Measure,Model,Training,Testing,Validation
0,Accuracy,Logistic Regression,0.969891,0.964307,0.965875
1,F1 Score,Logistic Regression,0.970786,0.965074,0.966668
2,Precision,Logistic Regression,0.969891,0.964307,0.965875
3,Recall,Logistic Regression,0.969873,0.964307,0.96583


In [171]:
model = passive_agressive_classifier(tfidf_train, y_train)

100%|███████████████████████████████████████████████| 50/50 [01:27<00:00,  1.76s/trial, best loss: -0.9788338388945063]


In [172]:
cross_validation_function(model,tfidf_train,y_train)

Cross-Validation Scores: [0.97803618 0.98062016 0.97977755 0.97753061 0.97865408]
Average Accuracy: 0.9789237164363556
Standard Deviation: 0.0011320869352801164


In [173]:
ypred_train, ypred_test, ypred_val = prediction_function(model, tfidf_train, tfidf_test, tfidf_val)

In [174]:
result = result_function('Passive Aggressive Classifier', model, ypred_train, ypred_test, ypred_val, y_train, y_test, y_val)
model_result = model_result.append(result)
model_result

Unnamed: 0,Measure,Model,Training,Testing,Validation
0,Accuracy,Logistic Regression,0.969891,0.964307,0.965875
1,F1 Score,Logistic Regression,0.970786,0.965074,0.966668
2,Precision,Logistic Regression,0.969891,0.964307,0.965875
3,Recall,Logistic Regression,0.969873,0.964307,0.96583
0,Accuracy,Passive Aggressive Classifier,0.990383,0.978227,0.980966
1,F1 Score,Passive Aggressive Classifier,0.990392,0.978225,0.980949
2,Precision,Passive Aggressive Classifier,0.990383,0.978227,0.980966
3,Recall,Passive Aggressive Classifier,0.99037,0.978206,0.980935


In [175]:
model = naive_bayes_classifier(tfidf_train, y_train)

100%|███████████████████████████████████████████████| 50/50 [00:04<00:00, 12.08trial/s, best loss: -0.8698685540950455]


In [176]:
cross_validation_function(model,tfidf_train,y_train)

Cross-Validation Scores: [0.86894731 0.87018313 0.86552073 0.86703741 0.86905966]
Average Accuracy: 0.8681496461071789
Standard Deviation: 0.001658173663857544


In [177]:
ypred_train, ypred_test, ypred_val = prediction_function(model, tfidf_train, tfidf_test, tfidf_val)

In [178]:
result = result_function('Naive_Bayes_Classifier', model, ypred_train, ypred_test, ypred_val, y_train, y_test, y_val)
model_result = model_result.append(result)
model_result

Unnamed: 0,Measure,Model,Training,Testing,Validation
0,Accuracy,Logistic Regression,0.969891,0.964307,0.965875
1,F1 Score,Logistic Regression,0.970786,0.965074,0.966668
2,Precision,Logistic Regression,0.969891,0.964307,0.965875
3,Recall,Logistic Regression,0.969873,0.964307,0.96583
0,Accuracy,Passive Aggressive Classifier,0.990383,0.978227,0.980966
1,F1 Score,Passive Aggressive Classifier,0.990392,0.978225,0.980949
2,Precision,Passive Aggressive Classifier,0.990383,0.978227,0.980966
3,Recall,Passive Aggressive Classifier,0.99037,0.978206,0.980935
0,Accuracy,Naive_Bayes_Classifier,0.948994,0.869869,0.869417
1,F1 Score,Naive_Bayes_Classifier,0.950082,0.87311,0.872254


In [179]:
path = "C:\\Users\\ompra\\OneDrive\\Documents\\Machine Learning Projects\\Automated Decision Support System for Cyberbullying Detection - Version 2.0\\Results"
change_directory(path)

Current Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Models
Changed Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Results


In [180]:
space_lg = {
    'C': hp.loguniform('C', -5, 2),  # Regularization parameter
    'penalty': hp.choice('penalty', ['l1', 'l2']),  # Regularization type
}

In [181]:
best = fmin(fn=objective_lg, space=space_lg, algo=tpe.suggest, max_evals=10)

 20%|█████████▌                                      | 2/10 [02:13<08:55, 66.91s/trial, best loss: -0.9751151556004943]


KeyboardInterrupt: 

In [None]:
best_params = {
    'C': best['C'],
    'penalty': ['l1', 'l2'][best['penalty']]
}

In [None]:
print('Best Params : ', best_params)

In [None]:
best_clf = LogisticRegression(**best_params)
best_clf.fit(tfidf_train, y_train)

In [None]:
yprediction = best_clf.predict(tfidf_train)
yprediction1 = best_clf.predict(tfidf_test)
yprediction2 = best_clf.predict(tfidf_val)

In [None]:
accuracy, precision, recall, f1, confusion = performance_metric(y_train, yprediction)
print("Accuracy : ",accuracy)
print("Precision : ",precision)
print("Recall : ", recall)
print("F1 score : ", f1)
print("Confusion Matrix")
sns.heatmap(confusion/np.sum(confusion), annot=True, fmt='.2%', cmap='Blues')

In [None]:
accuracy, precision, recall, f1, confusion = performance_metric(y_test, yprediction1)
print("Accuracy : ",accuracy)
print("Precision : ",precision)
print("Recall : ", recall)
print("F1 score : ", f1)
print("Confusion Matrix")
sns.heatmap(confusion/np.sum(confusion), annot=True, fmt='.2%', cmap='Blues')

In [None]:
accuracy, precision, recall, f1, confusion = performance_metric(y_val, yprediction2)
print("Accuracy : ",accuracy)
print("Precision : ",precision)
print("Recall : ", recall)
print("F1 score : ", f1)
print("Confusion Matrix")
sns.heatmap(confusion/np.sum(confusion), annot=True, fmt='.2%', cmap='Blues')

In [None]:
classifier = LogisticRegression(**best_params)

# Perform cross-validation
scores = cross_val_score(classifier, tfidf_train, y_train, cv=5, scoring='accuracy')

# Print average accuracy and standard deviation
print("Cross-Validation Scores:", scores)
print("Average Accuracy:", np.mean(scores))
print("Standard Deviation:", np.std(scores))

In [184]:
model_result.to_excel('Model_Result.xlsx', index = False)