In [2]:
# Importing Libraries
import json
import string
import random 
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm 
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn import svm
from sklearn.svm import SVC

# Read the tab-separated dataset; the first row of the file holds the column names.
df = pd.read_csv('Dataset_Cognitive_Distortions.tsv', sep='\t', header=0)

# Features are the 'Phrase' column; the target is the 'Cognitive Distortion' column.
X, y = df['Phrase'], df['Cognitive Distortion']

# 80% of the rows go to training, the remainder to testing; the fixed seed
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)


MULTINOMIAL NAIVE BAYES

In [None]:
# Pipeline: token counts -> TF-IDF weighting -> SMOTE oversampling ->
# Multinomial Naive Bayes.
textclassifier = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=0)),
    ('mnb', MultinomialNB()),
])

# Search space: SMOTE neighbourhood size (2..10) and the NB smoothing term.
params = {
    'smote__k_neighbors': list(range(2, 11)),
    'mnb__alpha': [0.01, 0.1, 0.3, 0.5, 1.0],
}

# 10-fold cross-validated grid search, run on 10 parallel workers.
multinomial_nb_grid = GridSearchCV(
    estimator=textclassifier,
    param_grid=params,
    cv=10,
    n_jobs=10,
    verbose=5,
)
multinomial_nb_grid.fit(X_train, y_train)

# Accuracy of the refitted best pipeline on both splits, then the CV summary.
best_mnb = multinomial_nb_grid.best_estimator_
print('Train Accuracy : %.3f' % best_mnb.score(X_train, y_train))
print('Test Accuracy : %.3f' % best_mnb.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % multinomial_nb_grid.best_score_)
print('Best Parameters : ', multinomial_nb_grid.best_params_)

# Detailed evaluation on the held-out test set.
y_pred = multinomial_nb_grid.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

MULTINOMIAL LOGISTIC REGRESSION

In [None]:
# Pipeline: token counts -> TF-IDF weighting -> SMOTE oversampling ->
# multinomial logistic regression.
# l1_ratio is not fixed on the estimator any more: it is only meaningful for
# the elastic-net penalty, so the grid below supplies it for that case alone.
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=0)),
    ('mlg', LogisticRegression(multi_class='multinomial', random_state=0, warm_start=True)),
])

smote_neighbors = [2, 3, 4, 5, 6, 7, 8, 9, 10]
c_values = [0.01, 0.1, 0.3, 0.5, 1.0]
all_solvers = ['lbfgs', 'newton-cg', 'sag', 'saga']

# A full cross-product of penalties and solvers contains combinations that
# scikit-learn rejects (lbfgs / newton-cg / sag only accept 'l2' or no
# penalty); every such candidate fails to fit and is scored NaN.  The search
# space is therefore a list of grids, each holding only valid combinations.
params = [
    # L2 regularisation works with every solver.
    {'smote__k_neighbors': smote_neighbors,
     'mlg__penalty': ['l2'],
     'mlg__C': c_values,
     'mlg__solver': all_solvers},
    # L1 regularisation: of the solvers searched here, only saga supports it.
    {'smote__k_neighbors': smote_neighbors,
     'mlg__penalty': ['l1'],
     'mlg__C': c_values,
     'mlg__solver': ['saga']},
    # Elastic-net: saga only, with the same 0.5 mixing ratio as before.
    {'smote__k_neighbors': smote_neighbors,
     'mlg__penalty': ['elasticnet'],
     'mlg__l1_ratio': [0.5],
     'mlg__C': c_values,
     'mlg__solver': ['saga']},
    # No regularisation: C has no effect, so it is not searched.
    # NOTE: scikit-learn 1.2 deprecated the string 'none' in favour of None
    # (the string was removed in 1.4); switch to [None] on newer versions.
    {'smote__k_neighbors': smote_neighbors,
     'mlg__penalty': ['none'],
     'mlg__solver': all_solvers},
]

multinomial_lg_grid = GridSearchCV(estimator=textclassifier, param_grid=params, n_jobs=10, cv=10, verbose=5)
multinomial_lg_grid.fit(X_train, y_train)

# Accuracy of the refitted best pipeline on both splits, then the CV summary.
print('Train Accuracy : %.3f'%multinomial_lg_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%multinomial_lg_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%multinomial_lg_grid.best_score_)
print('Best Parameters : ',multinomial_lg_grid.best_params_)

# Detailed evaluation on the held-out test set.
y_pred = multinomial_lg_grid.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

SVM

In [None]:
# Pipeline: token counts -> TF-IDF weighting -> SMOTE oversampling -> SVC.
# probability=True is kept so the fitted model still exposes predict_proba.
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=0)),
    ('svm', svm.SVC(probability=True)),
])

# 'svm__decision_function_shape' is no longer searched: SVC always trains
# one-vs-one internally and the option only changes the shape returned by
# decision_function(), so 'ovo' and 'ovr' candidates produced identical
# predictions and scores while doubling the number of fits.
params = {
    'smote__k_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'svm__C': [0.01, 0.1, 0.3, 0.5, 1.0],
}

# n_jobs aligned with the other grid searches in this notebook (was 60).
svm_grid = GridSearchCV(estimator=textclassifier, param_grid=params, n_jobs=10, cv=10, verbose=5)
svm_grid.fit(X_train, y_train)

# Accuracy of the refitted best pipeline on both splits, then the CV summary.
print('Train Accuracy : %.3f'%svm_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%svm_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%svm_grid.best_score_)
print('Best Parameters : ',svm_grid.best_params_)

# Detailed evaluation on the held-out test set.
y_pred = svm_grid.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

RANDOM FORESTS

In [None]:
# Pipeline: token counts -> TF-IDF weighting -> SMOTE oversampling ->
# Random Forest with a fixed seed.
textclassifier = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=0)),
    ('rf', RandomForestClassifier(random_state=0)),
])

# Only the SMOTE neighbourhood size (2..10) is tuned for this model.
params = {'smote__k_neighbors': list(range(2, 11))}

# 10-fold cross-validated grid search, run on 10 parallel workers.
rf_grid = GridSearchCV(
    estimator=textclassifier,
    param_grid=params,
    cv=10,
    n_jobs=10,
    verbose=5,
)
rf_grid.fit(X_train, y_train)

# Accuracy of the refitted best pipeline on both splits, then the CV summary.
best_rf = rf_grid.best_estimator_
print('Train Accuracy : %.3f' % best_rf.score(X_train, y_train))
print('Test Accuracy : %.3f' % best_rf.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % rf_grid.best_score_)
print('Best Parameters : ', rf_grid.best_params_)

# Detailed evaluation on the held-out test set.
y_pred = rf_grid.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

SAVING BEST MODEL

In [None]:
# Persist the fitted pipeline chosen by the Random Forest grid search.
# GridSearchCV fits clones of the estimator it is given, so `textclassifier`
# itself is still an unfitted template; the trained model lives in
# `rf_grid.best_estimator_`.
# NOTE(review): assumes the Random Forest search above is the model meant to
# be shipped -- confirm.
with open('cognitive_distortion_detector_model.pkl', 'wb') as model_file:
    pickle.dump(rf_grid.best_estimator_, model_file)

KNN

In [None]:
# Pipeline: token counts -> TF-IDF weighting -> SMOTE oversampling ->
# k-nearest-neighbours classifier.
textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=0)),
    ('knn', KNeighborsClassifier()),
])

# 'knn__p' is limited to 1 (Manhattan) and 2 (Euclidean).  The vectorised
# text reaches the classifier as a sparse matrix, and scikit-learn's
# brute-force neighbour search does not support the generic Minkowski metric
# (p=3) on sparse input, so every p=3 candidate failed to fit and was scored
# NaN.
params = {
    'knn__n_neighbors': [5, 6, 7, 8, 9, 10],
    'knn__weights': ['uniform', 'distance'],
    'smote__k_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'knn__p': [1, 2],
}

knn_grid = GridSearchCV(estimator=textclassifier, param_grid=params, n_jobs=10, cv=10, verbose=5)
knn_grid.fit(X_train, y_train)

# Accuracy of the refitted best pipeline on both splits, then the CV summary.
print('Train Accuracy : %.3f'%knn_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%knn_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%knn_grid.best_score_)
print('Best Parameters : ',knn_grid.best_params_)

# Detailed evaluation on the held-out test set.
y_pred = knn_grid.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))