<a href="https://colab.research.google.com/github/moaaz12-web/Data-Science-topics/blob/main/Regret_detection_and_regret_domain_identification_using_ML_algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df_domain = pd.read_excel('/content/drive/MyDrive/regret/Testing_Regret_domain.xlsx')
df_label = pd.read_excel('/content/Testing_Regret_data_with_reddit12.xlsx')
df_data = pd.read_excel('/content/drive/MyDrive/regret/Testing_Regret_data.xlsx')

In [None]:
! pip install xgboost



# using ml models for detecting the domain only using tfidf

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from warnings import simplefilter

# Suppress the warning for simplicity
# simplefilter("ignore", category=UndefinedMetricWarning)

# Step 2: Preprocessing (Remove NaN values)
df_domain.dropna(inplace=True)

# Step 3: Text feature extraction and representation techniques
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df_domain['Hindi'])

# Encode the class labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_domain['Domain'])

# Step 4: Divide data into K folds (e.g., 5-fold cross-validation)
kf = StratifiedKFold(n_splits=3)

# Step 5: Initialize a list of machine learning models
models = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('XGBoost', XGBClassifier(
        n_estimators=100,  # Number of trees
        max_depth=3,       # Maximum tree depth
        learning_rate=0.1, # Learning rate
        subsample=0.8,     # Fraction of samples used for tree building
        colsample_bytree=0.8  # Fraction of features used for tree building
    )),
    ('AdaBoost', AdaBoostClassifier())
]

# Initialize lists to store evaluation metrics
for model_name, model in models:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Generate confusion matrix
        y_pred = model.predict(X_test)

        # Accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)

        # Calculate precision, recall, and F1 score
        # Calculate precision, recall, and F1 score
        precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Calculate and print the mean evaluation metrics for each model
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    mean_precision = sum(precision_scores) / len(precision_scores)
    mean_recall = sum(recall_scores) / len(recall_scores)
    mean_f1 = sum(f1_scores) / len(f1_scores)

    print(f"Model: {model_name}")
    print("Mean Accuracy:", mean_accuracy)
    print("Mean Precision:", mean_precision)
    print("Mean Recall:", mean_recall)
    print("Mean F1 Score:", mean_f1)
    print("\n")


Model: Multinomial Naive Bayes
Mean Accuracy: 0.3124693176239568
Mean Precision: 0.44002161762102115
Mean Recall: 0.3892966360856269
Mean F1 Score: 0.3135848539152429


Model: Support Vector Machine
Mean Accuracy: 0.36794305351006384
Mean Precision: 0.3909947118437814
Mean Recall: 0.36794305351006384
Mean F1 Score: 0.33800162802207484


Model: Random Forest
Mean Accuracy: 0.33038782523318605
Mean Precision: 0.3388123764590718
Mean Recall: 0.33038782523318605
Mean F1 Score: 0.29860318919317946


Model: Logistic Regression
Mean Accuracy: 0.37358861070201277
Mean Precision: 0.3708398830425774
Mean Recall: 0.37358861070201277
Mean F1 Score: 0.3453501090621169


Model: XGBoost
Mean Accuracy: 0.3321060382916053
Mean Precision: 0.3222844383395065
Mean Recall: 0.3321060382916053
Mean F1 Score: 0.3074411830176596


Model: AdaBoost
Mean Accuracy: 0.28252331860579283
Mean Precision: 0.31004852126929255
Mean Recall: 0.28252331860579283
Mean F1 Score: 0.26727894401729607




# using ml models for predicting domain using bag of words + ngram

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier  # Make sure you have XGBoost installed
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Step 2: Preprocessing (Remove NaN values)
df_domain.dropna(inplace=True)

# Step 3: Text feature extraction and representation techniques using n-grams with Tfidf
count_vectorizer = CountVectorizer(ngram_range=(1, 2))  # You can specify the n-gram range here
X = count_vectorizer.fit_transform(df_domain['Hindi'])

# Encode the class labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_domain['Domain'])

kf = StratifiedKFold(n_splits=3)

# Step 5: Initialize a list of machine learning models
models = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression(max_iter=1200)),
    ('XGBoost', XGBClassifier(
        n_estimators=100,  # Number of trees
        max_depth=3,       # Maximum tree depth
        learning_rate=0.1, # Learning rate
        subsample=0.8,     # Fraction of samples used for tree building
        colsample_bytree=0.8  # Fraction of features used for tree building
    )),
    ('AdaBoost', AdaBoostClassifier())
]

# Initialize lists to store evaluation metrics
for model_name, model in models:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Generate confusion matrix
        y_pred = model.predict(X_test)

        # Accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)

        # Calculate precision, recall, and F1 score
        # Calculate precision, recall, and F1 score
        precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Calculate and print the mean evaluation metrics for each model
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    mean_precision = sum(precision_scores) / len(precision_scores)
    mean_recall = sum(recall_scores) / len(recall_scores)
    mean_f1 = sum(f1_scores) / len(f1_scores)

    print(f"Model: {model_name}")
    print("Mean Accuracy:", mean_accuracy)
    print("Mean Precision:", mean_precision)
    print("Mean Recall:", mean_recall)
    print("Mean F1 Score:", mean_f1)
    print("\n")

Model: Multinomial Naive Bayes
Mean Accuracy: 0.32179675994108986
Mean Precision: 0.4153427361018864
Mean Recall: 0.3583109675267649
Mean F1 Score: 0.287996624442218


Model: Support Vector Machine
Mean Accuracy: 0.2960235640648012
Mean Precision: 0.37466954012418974
Mean Recall: 0.3242542850984402
Mean F1 Score: 0.27926427563605405


Model: Random Forest
Mean Accuracy: 0.3325969563082965
Mean Precision: 0.3560837113970496
Mean Recall: 0.3325969563082965
Mean F1 Score: 0.3001475295436605


Model: Logistic Regression
Mean Accuracy: 0.34756995581737843
Mean Precision: 0.35469034705696023
Mean Recall: 0.34756995581737843
Mean F1 Score: 0.3328403388679524


Model: XGBoost
Mean Accuracy: 0.3495336278841434
Mean Precision: 0.35206584153156495
Mean Recall: 0.3495336278841434
Mean F1 Score: 0.32689360746444013


Model: AdaBoost
Mean Accuracy: 0.2911143838978891
Mean Precision: 0.3188452768436723
Mean Recall: 0.3010776181933351
Mean F1 Score: 0.28076919201210154




# using ml models for predicting domain only using tfidf + ngram

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier  # Make sure you have XGBoost installed
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Step 2: Preprocessing (Remove NaN values)
df_domain.dropna(inplace=True)

# Step 3: Text feature extraction and representation techniques using n-grams with Tfidf
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # You can specify the n-gram range here
X = tfidf_vectorizer.fit_transform(df_domain['Hindi'])

# Encode the class labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_domain['Domain'])

kf = StratifiedKFold(n_splits=3)

# Step 5: Initialize a list of machine learning models
models = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression(max_iter=1200)),
    ('XGBoost', XGBClassifier(
        n_estimators=100,  # Number of trees
        max_depth=3,       # Maximum tree depth
        learning_rate=0.1, # Learning rate
        subsample=0.8,     # Fraction of samples used for tree building
        colsample_bytree=0.8  # Fraction of features used for tree building
    )),
    ('AdaBoost', AdaBoostClassifier())
]

# Initialize lists to store evaluation metrics
for model_name, model in models:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Generate confusion matrix
        y_pred = model.predict(X_test)

        # Accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)

        # Calculate precision, recall, and F1 score
        # Calculate precision, recall, and F1 score
        precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Calculate and print the mean evaluation metrics for each model
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    mean_precision = sum(precision_scores) / len(precision_scores)
    mean_recall = sum(recall_scores) / len(recall_scores)
    mean_f1 = sum(f1_scores) / len(f1_scores)

    print(f"Model: {model_name}")
    print("Mean Accuracy:", mean_accuracy)
    print("Mean Precision:", mean_precision)
    print("Mean Recall:", mean_recall)
    print("Mean F1 Score:", mean_f1)
    print("\n")

Model: Multinomial Naive Bayes
Mean Accuracy: 0.27810505645557193
Mean Precision: 0.4508276811509451
Mean Recall: 0.377478905486975
Mean F1 Score: 0.27148462881899565


Model: Support Vector Machine
Mean Accuracy: 0.3505154639175258
Mean Precision: 0.45516554171927986
Mean Recall: 0.3718764819421749
Mean F1 Score: 0.325065197492371


Model: Random Forest
Mean Accuracy: 0.3335787923416789
Mean Precision: 0.359335152041314
Mean Recall: 0.3335787923416789
Mean F1 Score: 0.2979203719974018


Model: Logistic Regression
Mean Accuracy: 0.3738340697103584
Mean Precision: 0.4092638858822564
Mean Recall: 0.37972205086881866
Mean F1 Score: 0.33999956787698377


Model: XGBoost
Mean Accuracy: 0.3335787923416789
Mean Precision: 0.32811628698535755
Mean Recall: 0.3335787923416789
Mean F1 Score: 0.3086798374925735


Model: AdaBoost
Mean Accuracy: 0.2707412862052037
Mean Precision: 0.2888219115953444
Mean Recall: 0.2707412862052037
Mean F1 Score: 0.24966774988472795




# using ml models for detecting the regret type using tfidf

In [None]:
from sklearn.model_selection import GridSearchCV
# HYPER PARAMETERS DEFINITION

# Define hyperparameters for each model
param_grid_nb = {'alpha': [0.1, 0.5, 1.0]}
param_grid_svc = {'C': [10, 100]}
param_grid_rf = {'n_estimators': [ 100, 200], 'max_depth': [10, 20]}
param_grid_lr = {'C': [0.01, 0.1, 1, 10]}
param_grid_xgb = {'n_estimators': [100, 200], 'max_depth': [5, 7], 'learning_rate': [0.05, 0.1]}
param_grid_ada = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier  # Make sure you have XGBoost installed
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Step 2: Preprocessing (Remove NaN values)
df_label.dropna(inplace=True)

# Step 3: Text feature extraction and representation techniques
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df_label['Hindi'])

# Encode the class labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_label['Regret'])

# Step 4: Divide data into K folds (e.g., 5-fold cross-validation)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
# Updated list of models with hyperparameters
models = [
    ('Multinomial Naive Bayes', GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)),
    ('Random Forest', GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=3)),
    ('Support Vector Machine', GridSearchCV(SVC(), param_grid_svc, cv=3)),
    ('Logistic Regression', GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)),
    ('XGBoost', GridSearchCV(XGBClassifier(), param_grid_xgb, cv=3)),
    ('AdaBoost', GridSearchCV(AdaBoostClassifier(), param_grid_ada, cv=3))
]

# Initialize lists to store evaluation metrics
for model_name, model in models:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model with hyperparameter tuning
        model.fit(X_train, y_train)

        # Generate confusion matrix
        y_pred = model.predict(X_test)

        # Accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)

        # Calculate precision, recall, and F1 score
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Calculate and print the mean evaluation metrics for each model
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    mean_precision = sum(precision_scores) / len(precision_scores)
    mean_recall = sum(recall_scores) / len(recall_scores)
    mean_f1 = sum(f1_scores) / len(f1_scores)

    print(f"Model: {model_name}")
    print("Mean Accuracy:", mean_accuracy)
    print("Mean Precision:", mean_precision)
    print("Mean Recall:", mean_recall)
    print("Mean F1 Score:", mean_f1)
    print("Best Hyperparameters:", model.best_params_)
    print("\n")

Model: Multinomial Naive Bayes
Mean Accuracy: 0.6317755039972194
Mean Precision: 0.6490687988701637
Mean Recall: 0.49409190371991246
Mean F1 Score: 0.561001122030949
Best Hyperparameters: {'alpha': 1.0}


Model: Random Forest
Mean Accuracy: 0.659929614181439
Mean Precision: 0.6848545951450625
Mean Recall: 0.5312910284463894
Mean F1 Score: 0.5981282054601428
Best Hyperparameters: {'max_depth': 20, 'n_estimators': 200}


Model: Support Vector Machine
Mean Accuracy: 0.6528382429614181
Mean Precision: 0.6440719599399942
Mean Recall: 0.6065645514223195
Mean F1 Score: 0.6246534197479063
Best Hyperparameters: {'C': 100}


Model: Logistic Regression
Mean Accuracy: 0.6497075947167188
Mean Precision: 0.6453316735042005
Mean Recall: 0.5886214442013129
Mean F1 Score: 0.6156038995573005
Best Hyperparameters: {'C': 1}


Model: XGBoost
Mean Accuracy: 0.6549250521376433
Mean Precision: 0.6533381663755409
Mean Recall: 0.5881838074398249
Mean F1 Score: 0.618935826483159
Best Hyperparameters: {'learning_

# using ml models for detecting the regret type using tfidf + ngram

In [None]:
del models

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier  # Make sure you have XGBoost installed
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Step 2: Preprocessing (Remove NaN values)
df_label.dropna(inplace=True)

# Step 3: Text feature extraction and representation techniques using n-grams with Tfidf
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # You can specify the n-gram range here
X = tfidf_vectorizer.fit_transform(df_label['Hindi'])


# Encode the class labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_label['Regret'])

# Step 4: Divide data into K folds (e.g., 5-fold cross-validation)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
# Updated list of models with hyperparameters
models = [
    ('Multinomial Naive Bayes', GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)),
    ('Random Forest', GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=3)),
    ('Support Vector Machine', GridSearchCV(SVC(), param_grid_svc, cv=3)),
    ('Logistic Regression', GridSearchCV(LogisticRegression(max_iter=1100), param_grid_lr, cv=3)),
    # ('XGBoost', GridSearchCV(XGBClassifier(), param_grid_xgb, cv=3)),
    # ('AdaBoost', GridSearchCV(AdaBoostClassifier(), param_grid_ada, cv=3))
]

# Initialize lists to store evaluation metrics
for model_name, model in models:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model with hyperparameter tuning
        model.fit(X_train, y_train)

        # Generate confusion matrix
        y_pred = model.predict(X_test)

        # Accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)

        # Calculate precision, recall, and F1 score
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Calculate and print the mean evaluation metrics for each model
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    mean_precision = sum(precision_scores) / len(precision_scores)
    mean_recall = sum(recall_scores) / len(recall_scores)
    mean_f1 = sum(f1_scores) / len(f1_scores)

    print(f"Model: {model_name}")
    print("Mean Accuracy:", mean_accuracy)
    print("Mean Precision:", mean_precision)
    print("Mean Recall:", mean_recall)
    print("Mean F1 Score:", mean_f1)
    print("Best Hyperparameters:", model.best_params_)
    print("\n")


Model: Multinomial Naive Bayes
Mean Accuracy: 0.6461619603199017
Mean Precision: 0.692341824880499
Mean Recall: 0.46565393189189064
Mean F1 Score: 0.5558071157845837
Best Hyperparameters: {'alpha': 0.5}


Model: Random Forest
Mean Accuracy: 0.6211428033217464
Mean Precision: 0.7218042825471144
Mean Recall: 0.3352199240535143
Mean F1 Score: 0.4572730726064059
Best Hyperparameters: {'max_depth': 20, 'n_estimators': 200}


Model: Support Vector Machine
Mean Accuracy: 0.665135802701052
Mean Precision: 0.6690240588652929
Mean Recall: 0.5881725362516282
Mean F1 Score: 0.6258482402331572
Best Hyperparameters: {'C': 10}


Model: Logistic Regression
Mean Accuracy: 0.6545012879607953
Mean Precision: 0.6612636737217518
Mean Recall: 0.5636583534811105
Mean F1 Score: 0.6081422504097751
Best Hyperparameters: {'C': 1}




In [None]:
# Define hyperparameters for XGBoost
param_grid_xgb = {'n_estimators': [100, 200],  'learning_rate': [0.01, 0.1]}

# Initialize XGBoost model
xgb_model = XGBClassifier()

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the XGBoost model with hyperparameter tuning
    xgb_grid = GridSearchCV(xgb_model, param_grid_xgb, cv=3)
    xgb_grid.fit(X_train, y_train)

    # Generate confusion matrix
    y_pred = xgb_grid.predict(X_test)

    # Accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    # Calculate precision, recall, and F1 score
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate and print the mean evaluation metrics for XGBoost model
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

print("XGBoost Model:")
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1)
print("Best Hyperparameters:", xgb_grid.best_params_)


XGBoost Model:
Mean Accuracy: 0.6555439974347755
Mean Precision: 0.6524239757561936
Mean Recall: 0.5938610498917596
Mean F1 Score: 0.6214741090130529
Best Hyperparameters: {'learning_rate': 0.1, 'n_estimators': 100}


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Define hyperparameters for AdaBoost
param_grid_ada = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}

# Initialize AdaBoost model
ada_model = AdaBoostClassifier()

# Initialize lists to store evaluation metrics
accuracy_scores_ada = []
precision_scores_ada = []
recall_scores_ada = []
f1_scores_ada = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the AdaBoost model with hyperparameter tuning
    ada_grid = GridSearchCV(ada_model, param_grid_ada, cv=3)
    ada_grid.fit(X_train, y_train)

    # Generate confusion matrix
    y_pred_ada = ada_grid.predict(X_test)

    # Accuracy of the model
    accuracy_ada = accuracy_score(y_test, y_pred_ada)
    accuracy_scores_ada.append(accuracy_ada)

    # Calculate precision, recall, and F1 score
    precision_ada = precision_score(y_test, y_pred_ada)
    recall_ada = recall_score(y_test, y_pred_ada)
    f1_ada = f1_score(y_test, y_pred_ada)

    precision_scores_ada.append(precision_ada)
    recall_scores_ada.append(recall_ada)
    f1_scores_ada.append(f1_ada)

# Calculate and print the mean evaluation metrics for AdaBoost model
mean_accuracy_ada = sum(accuracy_scores_ada) / len(accuracy_scores_ada)
mean_precision_ada = sum(precision_scores_ada) / len(precision_scores_ada)
mean_recall_ada = sum(recall_scores_ada) / len(recall_scores_ada)
mean_f1_ada = sum(f1_scores_ada) / len(f1_scores_ada)

print("AdaBoost Model:")
print("Mean Accuracy:", mean_accuracy_ada)
print("Mean Precision:", mean_precision_ada)
print("Mean Recall:", mean_recall_ada)
print("Mean F1 Score:", mean_f1_ada)
print("Best Hyperparameters:", ada_grid.best_params_)


AdaBoost Model:
Mean Accuracy: 0.6175997044460673
Mean Precision: 0.6092268380972398
Mean Recall: 0.5531619651354356
Mean F1 Score: 0.5794311612134995
Best Hyperparameters: {'learning_rate': 0.1, 'n_estimators': 50}


# using ml models for detecting the regret type using bag of words + ngram

In [None]:
del models

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier  # Make sure you have XGBoost installed
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Step 2: Preprocessing (Remove NaN values)
df_label.dropna(inplace=True)

# Step 3: Text feature extraction and representation techniques using n-grams with Tfidf
count_vectorizer = CountVectorizer(ngram_range=(1, 2))  # You can specify the n-gram range here
X = count_vectorizer.fit_transform(df_label['Hindi'])


# Encode the class labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_label['Regret'])

# Step 4: Divide data into K folds (e.g., 5-fold cross-validation)
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
# Updated list of models with hyperparameters
models = [
    ('Multinomial Naive Bayes', GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)),
    ('Random Forest', GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=3)),
    ('Support Vector Machine', GridSearchCV(SVC(), param_grid_svc, cv=3)),
    ('Logistic Regression', GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)),
    # ('XGBoost', GridSearchCV(XGBClassifier(), param_grid_xgb, cv=3)),
    # ('AdaBoost', GridSearchCV(AdaBoostClassifier(), param_grid_ada, cv=3))
]

# Initialize lists to store evaluation metrics
for model_name, model in models:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model with hyperparameter tuning
        model.fit(X_train, y_train)

        # Generate confusion matrix
        y_pred = model.predict(X_test)

        # Accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)

        # Calculate precision, recall, and F1 score
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Calculate and print the mean evaluation metrics for each model
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    mean_precision = sum(precision_scores) / len(precision_scores)
    mean_recall = sum(recall_scores) / len(recall_scores)
    mean_f1 = sum(f1_scores) / len(f1_scores)

    print(f"Model: {model_name}")
    print("Mean Accuracy:", mean_accuracy)
    print("Mean Precision:", mean_precision)
    print("Mean Recall:", mean_recall)
    print("Mean F1 Score:", mean_f1)
    print("Best Hyperparameters:", model.best_params_)
    print("\n")


Model: Multinomial Naive Bayes
Mean Accuracy: 0.6378226326790081
Mean Precision: 0.6168538189341004
Mean Recall: 0.6328229996217621
Mean F1 Score: 0.6246991519353613
Best Hyperparameters: {'alpha': 1.0}


Model: Random Forest
Mean Accuracy: 0.6071720878949427
Mean Precision: 0.6967544426838878
Mean Recall: 0.3142122017927785
Mean F1 Score: 0.43240033598242555
Best Hyperparameters: {'max_depth': 20, 'n_estimators': 200}


Model: Support Vector Machine
Mean Accuracy: 0.6472070179448305
Mean Precision: 0.6381405004819336
Mean Recall: 0.6017557595050947
Mean F1 Score: 0.6190562145055516
Best Hyperparameters: {'C': 10}




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model: Logistic Regression
Mean Accuracy: 0.6524170430883091
Mean Precision: 0.6452875071766568
Mean Recall: 0.6004273283185201
Mean F1 Score: 0.621944980703275
Best Hyperparameters: {'C': 0.1}




In [None]:
# Define hyperparameters for XGBoost
param_grid_xgb = {'n_estimators': [100, 200],  'learning_rate': [0.01, 0.1]}

# Initialize XGBoost model
xgb_model = XGBClassifier()

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the XGBoost model with hyperparameter tuning
    xgb_grid = GridSearchCV(xgb_model, param_grid_xgb, cv=3)
    xgb_grid.fit(X_train, y_train)

    # Generate confusion matrix
    y_pred = xgb_grid.predict(X_test)

    # Accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    # Calculate precision, recall, and F1 score
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate and print the mean evaluation metrics for XGBoost model
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

print("XGBoost Model:")
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1)
print("Best Hyperparameters:", xgb_grid.best_params_)


XGBoost Model:
Mean Accuracy: 0.6453279753746801
Mean Precision: 0.6358496442201127
Mean Recall: 0.5986706490860784
Mean F1 Score: 0.6163191205739308
Best Hyperparameters: {'learning_rate': 0.1, 'n_estimators': 100}


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Define hyperparameters for AdaBoost
param_grid_ada = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}

# Initialize AdaBoost model
ada_model = AdaBoostClassifier()

# Initialize lists to store evaluation metrics
accuracy_scores_ada = []
precision_scores_ada = []
recall_scores_ada = []
f1_scores_ada = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the AdaBoost model with hyperparameter tuning
    ada_grid = GridSearchCV(ada_model, param_grid_ada, cv=3)
    ada_grid.fit(X_train, y_train)

    # Generate confusion matrix
    y_pred_ada = ada_grid.predict(X_test)

    # Accuracy of the model
    accuracy_ada = accuracy_score(y_test, y_pred_ada)
    accuracy_scores_ada.append(accuracy_ada)

    # Calculate precision, recall, and F1 score
    precision_ada = precision_score(y_test, y_pred_ada)
    recall_ada = recall_score(y_test, y_pred_ada)
    f1_ada = f1_score(y_test, y_pred_ada)

    precision_scores_ada.append(precision_ada)
    recall_scores_ada.append(recall_ada)
    f1_scores_ada.append(f1_ada)

# Calculate and print the mean evaluation metrics for AdaBoost model
mean_accuracy_ada = sum(accuracy_scores_ada) / len(accuracy_scores_ada)
mean_precision_ada = sum(precision_scores_ada) / len(precision_scores_ada)
mean_recall_ada = sum(recall_scores_ada) / len(recall_scores_ada)
mean_f1_ada = sum(f1_scores_ada) / len(f1_scores_ada)

print("AdaBoost Model:")
print("Mean Accuracy:", mean_accuracy_ada)
print("Mean Precision:", mean_precision_ada)
print("Mean Recall:", mean_recall_ada)
print("Mean F1 Score:", mean_f1_ada)
print("Best Hyperparameters:", ada_grid.best_params_)


AdaBoost Model:
Mean Accuracy: 0.6092615508806479
Mean Precision: 0.580888884178001
Mean Recall: 0.6691246380010646
Mean F1 Score: 0.6197566990984554
Best Hyperparameters: {'learning_rate': 0.1, 'n_estimators': 100}
