In [None]:
!pip install sentence_transformers
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# **1. Extracting SentenceBert Embeddings**

In [17]:
train_data = pd.read_csv('/content/train.csv')
train_data

Unnamed: 0.1,Unnamed: 0,Text,label
0,0,I can't shake off this constant sense of hopel...,Depression
1,1,I'm constantly second-guessing myself and my d...,Anxiety Disorder
2,2,"I'm feeling physically unwell, but I know it's...",Depression
3,3,I'm desperate to escape the overwhelming fear.,Panic Disorder
4,4,It's hard to describe the sensation of being t...,Panic Disorder
...,...,...,...
780,780,I'm not good enough for anything.,Depression
781,781,"Feeling like I'm a trailblazer, forging a path...",Narcissistic Disorder
782,782,Trying to maintain my self-assuredness without...,Narcissistic Disorder
783,783,Feeling like I have the potential to influence...,Narcissistic Disorder


In [None]:
# Load a pre-trained SBERT model (e.g., "paraphrase-MiniLM-L6-v2")
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Preprocess your sentences (if needed)
# For example, you can convert all Text to lowercase and remove punctuation
train_data['Text'] = train_data['Text'].str.lower().str.replace('[^\w\s]', '')

# Compute sentence embeddings
sentence_embeddings = model.encode(train_data['Text'].tolist(), show_progress_bar=True)

# Create a new column for sentence embeddings in your train_data
train_data['SentenceBertEmbeddings'] = [list(embedding) for embedding in sentence_embeddings]

# Save the updated train_data to a new CSV file
train_data.to_csv('SentenceBertEmbeddings.csv', index=False)  # Replace 'your_updated_train_data.csv' with your desired file name

In [None]:
# Define a mapping of text labels to integer labels
label_mapping = {
    'Depression': 0,
    'Anxiety Disorder': 1,
    'Panic Disorder': 2,
    'Narcissistic Disorder': 3,
    'Anger/ Intermittent Explosive Disorder': 4,
}

# Apply the mapping to the 'label' column
train_data['label'] = train_data['label'].map(label_mapping)

In [20]:
train_data = train_data.drop(columns=['Unnamed: 0'])
train_data.head()

Unnamed: 0,Text,label,SentenceBertEmbeddings
0,i cant shake off this constant sense of hopele...,0,"[-0.013776169, -0.28504047, 0.38752916, -0.044..."
1,im constantly secondguessing myself and my dec...,1,"[0.30716705, 0.10577666, 0.04163367, -0.106705..."
2,im feeling physically unwell but i know its no...,0,"[0.5268184, -0.59319836, 0.23147163, 0.3869896..."
3,im desperate to escape the overwhelming fear,2,"[0.52445537, 0.61214525, 0.069472775, 0.033327..."
4,its hard to describe the sensation of being tr...,2,"[0.26569507, 0.042906933, 0.075957276, 0.40910..."


#  **2. Applying ML and DL models on SentenceBert Embeddings.**

In [27]:
# Split train_data into features and labels
X = list(train_data['SentenceBertEmbeddings'])
y = train_data['label']

# Split train_data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Decision Trees
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("Decision Trees Accuracy:", dt_accuracy)

# 2. Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)

# 3. XGBoost
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
print("XGBoost Accuracy:", xgb_accuracy)

# 4. Neural Network (MLP)
nn_model = MLPClassifier(random_state=42)
nn_model.fit(X_train, y_train)
nn_pred = nn_model.predict(X_test)
nn_accuracy = accuracy_score(y_test, nn_pred)
print("Neural Network (MLP) Accuracy:", nn_accuracy)

# 5. Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)

# 6. K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print("K-Nearest Neighbors (KNN) Accuracy:", knn_accuracy)

Decision Trees Accuracy: 0.5668789808917197
Random Forest Accuracy: 0.7070063694267515
XGBoost Accuracy: 0.6496815286624203
Neural Network (MLP) Accuracy: 0.7388535031847133
Logistic Regression Accuracy: 0.7515923566878981
K-Nearest Neighbors (KNN) Accuracy: 0.7133757961783439


# **3. Doing GridSearch CV to find the Best Hyperparameters.**

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Define hyperparameter grids for each model
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    # Add other XGBoost hyperparameters
}

param_grid_nn = {
    'hidden_layer_sizes': [(64,), (128,), (256,)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01]
    # Add other MLP hyperparameters
}

param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear', 'sag', 'saga']
    # Add other Logistic Regression hyperparameters
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'p': [1, 2]
    # Add other KNN hyperparameters
}

# Create a list of models and their respective hyperparameter grids
models = [
    (DecisionTreeClassifier(), param_grid_dt),
    (RandomForestClassifier(), param_grid_rf),
    (xgb.XGBClassifier(), param_grid_xgb),
    (MLPClassifier(), param_grid_nn),
    (LogisticRegression(), param_grid_lr),
    (KNeighborsClassifier(), param_grid_knn)
]

# Perform hyperparameter tuning for each model
for model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters
    print(f"Best Hyperparameters for {model.__class__.__name__}:", grid_search.best_params_)

    # Use the best model for prediction
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy (Tuned):", accuracy)


Best Hyperparameters for DecisionTreeClassifier: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
DecisionTreeClassifier Accuracy (Tuned): 0.5859872611464968
Best Hyperparameters for RandomForestClassifier: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
RandomForestClassifier Accuracy (Tuned): 0.7197452229299363
Best Hyperparameters for XGBClassifier: {'learning_rate': 0.2, 'max_depth': 3, 'subsample': 0.8}
XGBClassifier Accuracy (Tuned): 0.7006369426751592




Best Hyperparameters for MLPClassifier: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (256,)}
MLPClassifier Accuracy (Tuned): 0.7261146496815286


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters for LogisticRegression: {'C': 0.1, 'solver': 'liblinear'}
LogisticRegression Accuracy (Tuned): 0.7388535031847133
Best Hyperparameters for KNeighborsClassifier: {'n_neighbors': 7, 'p': 2}
KNeighborsClassifier Accuracy (Tuned): 0.6878980891719745


# **3. Doing RandomSearch CV to find the Best Hyperparameters.**

In [29]:
from scipy.stats import randint, uniform

# Define hyperparameter distributions for each model
param_dist_dt = {
    'max_depth': randint(1, 30),  # Random integers from 1 to 30
    'min_samples_split': randint(2, 11),  # Random integers from 2 to 10
    'min_samples_leaf': randint(1, 5)  # Random integers from 1 to 4
}

param_dist_rf = {
    'n_estimators': randint(100, 500),  # Random integers from 100 to 499
    'max_depth': randint(1, 30),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

param_dist_xgb = {
    'learning_rate': uniform(0.01, 0.2),  # Random float between 0.01 and 0.2
    'max_depth': randint(3, 6),  # Random integers from 3 to 5
    'subsample': uniform(0.8, 0.2)  # Random float between 0.8 and 1.0
    # Add other XGBoost hyperparameters and distributions
}

param_dist_nn = {
    'hidden_layer_sizes': [(64,), (128,), (256,)],
    'activation': ['relu', 'tanh'],
    'alpha': uniform(0.0001, 0.01)  # Random float between 0.0001 and 0.01
    # Add other MLP hyperparameters and distributions
}

param_dist_lr = {
    'C': uniform(0.001, 10),  # Random float between 0.001 and 10
    'solver': ['lbfgs', 'liblinear', 'sag', 'saga']
    # Add other Logistic Regression hyperparameters and distributions
}

param_dist_knn = {
    'n_neighbors': randint(3, 10),  # Random integers from 3 to 9
    'p': [1, 2]
    # Add other KNN hyperparameters and distributions
}

# Create a list of models and their respective hyperparameter distributions
models = [
    (DecisionTreeClassifier(), param_dist_dt),
    (RandomForestClassifier(), param_dist_rf),
    (xgb.XGBClassifier(), param_dist_xgb),
    (MLPClassifier(), param_dist_nn),
    (LogisticRegression(), param_dist_lr),
    (KNeighborsClassifier(), param_dist_knn)
]

In [30]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np  # Import numpy for random seed

# Define the number of random samples
n_iter = 10  # Adjust as needed

# Create a list of models and their respective hyperparameter distributions
models = [
    (DecisionTreeClassifier(), param_grid_dt),
    (RandomForestClassifier(), param_grid_rf),
    (xgb.XGBClassifier(), param_grid_xgb),
    (MLPClassifier(), param_grid_nn),
    (LogisticRegression(), param_grid_lr),
    (KNeighborsClassifier(), param_grid_knn)
]

# Perform hyperparameter tuning with RandomizedSearchCV for each model
for model, param_dist in models:
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=5, scoring='accuracy', random_state=42)
    random_search.fit(X_train, y_train)

    # Print the best hyperparameters
    print(f"Best Hyperparameters for {model.__class__.__name__}:", random_search.best_params_)

    # Use the best model for prediction
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy (Tuned):", accuracy)


Best Hyperparameters for DecisionTreeClassifier: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 20}
DecisionTreeClassifier Accuracy (Tuned): 0.6114649681528662
Best Hyperparameters for RandomForestClassifier: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 30}
RandomForestClassifier Accuracy (Tuned): 0.7133757961783439
Best Hyperparameters for XGBClassifier: {'subsample': 0.9, 'max_depth': 4, 'learning_rate': 0.1}
XGBClassifier Accuracy (Tuned): 0.6878980891719745




Best Hyperparameters for MLPClassifier: {'hidden_layer_sizes': (256,), 'alpha': 0.001, 'activation': 'relu'}
MLPClassifier Accuracy (Tuned): 0.7133757961783439


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters for LogisticRegression: {'solver': 'lbfgs', 'C': 0.1}
LogisticRegression Accuracy (Tuned): 0.7261146496815286
Best Hyperparameters for KNeighborsClassifier: {'p': 2, 'n_neighbors': 7}
KNeighborsClassifier Accuracy (Tuned): 0.6878980891719745
