# XGBoost Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Each dataset ships as a pre-processed train/test CSV pair; load the pairs together.

# Dataset 1
train_df, test_df = (
    pd.read_csv("processed_train.csv"),
    pd.read_csv("processed_test.csv"),
)

# Dataset 2
train_df1, test_df1 = (
    pd.read_csv("processed_train1.csv"),
    pd.read_csv("processed_test1.csv"),
)

# Combined dataset
train_df2, test_df2 = (
    pd.read_csv("combined_processed_train.csv"),
    pd.read_csv("combined_processed_test.csv"),
)

# Evaluation-only dataset (it has no train counterpart in this cell)
test_df3 = pd.read_csv("scam_dataset_eval_processed.csv")



# Turn the raw text into TF-IDF features.
# A single vectorizer is fitted on train_df1's text only.
vectorizer = TfidfVectorizer()
X_train1 = vectorizer.fit_transform(train_df1['text'])

# The fits on train_df['text'] (X_train) and train_df2['text'] (X_train2) are
# commented out in this cell, so neither X_train nor X_train2 is created here.
# Calling fit_transform again on `vectorizer` would refit its vocabulary.

# Every test split is projected with the vocabulary learned from train_df1.
X_test, X_test1, X_test2, X_test3 = (
    vectorizer.transform(frame['text'])
    for frame in (test_df, test_df1, test_df2, test_df3)
)



# Target vectors: the 'label' column of each frame, paired train/test per dataset.
y_train, y_test = train_df['label'], test_df['label']
y_train1, y_test1 = train_df1['label'], test_df1['label']
y_train2, y_test2 = train_df2['label'], test_df2['label']

# The evaluation dataset only has a test split.
y_test3 = test_df3['label']

In [None]:
# Disabled SVM baseline: every statement in this cell is commented out, so
# running the cell does nothing.
# NOTE(review): the code below refers to X_train_tfidf / X_test_tfidf, which no
# visible cell defines -- point them at the current feature matrices before
# re-enabling.

# # Train an SVM model
# svm_model = SVC()
# svm_model.fit(X_train_tfidf, train_df['label'])

# # Make predictions
# y_pred = svm_model.predict(X_test_tfidf)

# # Evaluate the model
# accuracy = accuracy_score(test_df['label'], y_pred)
# print(f'Accuracy: {accuracy:.2f}')
# print('Classification Report:')
# print(classification_report(test_df['label'], y_pred))

In [None]:
# Display the sparse TF-IDF matrix fitted on train_df1['text']
# (the repr reports dtype, number of stored elements and shape).
X_train1

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 36309 stored elements and shape (3200, 4049)>

In [None]:
# The setup cell leaves the X_train assignment commented out, so on a fresh
# kernel this name would not exist and the cell would raise NameError.
# Build it here with a dedicated vectorizer: reusing the shared `vectorizer`
# would refit the vocabulary that X_train1 and the X_test* matrices rely on.
X_train = TfidfVectorizer().fit_transform(train_df['text'])
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 113270 stored elements and shape (1280, 3205)>

In [None]:
import xgboost as xgb

# Fit an XGBoost classifier (library defaults, log-loss metric) on the
# train_df1 TF-IDF features; fit() returns the fitted estimator itself.
model = xgb.XGBClassifier(eval_metric='logloss').fit(X_train1, y_train1)

# Score the fitted model on the separate evaluation dataset (test_df3).
y_pred = model.predict(X_test3)
print("Accuracy:", accuracy_score(y_test3, y_pred))
print('Classification Report:', classification_report(y_test3, y_pred), sep='\n')

Accuracy: 0.9594594594594594
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        74
           1       1.00      0.92      0.96        74

    accuracy                           0.96       148
   macro avg       0.96      0.96      0.96       148
weighted avg       0.96      0.96      0.96       148



In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer


# Define objective function
def objective(trial):
    """Optuna objective: fit one XGBoost classifier and return its accuracy.

    Draws a hyperparameter configuration from ``trial``, fits an
    ``xgb.XGBClassifier`` on the notebook-level ``X_train1`` / ``y_train1`` and
    scores it on ``X_test1`` / ``y_test1``.

    Args:
        trial: Optuna trial object used to sample the hyperparameter values.

    Returns:
        The ``accuracy_score`` of the fitted model's predictions on ``X_test1``.

    Note:
        Hyperparameters are selected on the ``test_df1`` split, so that split
        serves as a validation set here; an accuracy reported on it after
        tuning is optimistically biased.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "verbosity": 0,
        "booster": "gbtree",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_categorical("learning_rate", [1e-2, 1e-3, 1e-4]),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        # L2 / L1 regularisation go through the estimator's documented
        # reg_lambda / reg_alpha arguments instead of raw booster kwargs.
        # The trial parameter names stay "lambda" / "alpha", so the keys of
        # study.best_trial.params are unchanged.
        "reg_lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
        "reg_alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
    }

    # `use_label_encoder` is deprecated and the other XGBClassifier in this
    # notebook does not pass it, so it is dropped here as well.
    model = xgb.XGBClassifier(**params)
    # No eval_set: without early stopping it only added per-round evaluation
    # work whose results were never read.
    model.fit(X_train1, y_train1)
    preds = model.predict(X_test1)
    return accuracy_score(y_test1, preds)

# Run study
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across sequential runs (assuming the objective
# itself is deterministic).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best result
print("Best trial:")
print(study.best_trial.params)

[I 2025-04-23 03:13:30,836] A new study created in memory with name: no-name-cc0c5160-cf42-47f4-8f54-c99126377e76
[I 2025-04-23 03:13:43,126] Trial 0 finished with value: 0.93125 and parameters: {'max_depth': 3, 'learning_rate': 0.0001, 'n_estimators': 95, 'subsample': 0.5563087480370892, 'colsample_bytree': 0.700920616837081, 'gamma': 0.11351597719353512, 'lambda': 0.01578407018891157, 'alpha': 0.019811284531855572}. Best is trial 0 with value: 0.93125.
[I 2025-04-23 03:13:56,112] Trial 1 finished with value: 0.9625 and parameters: {'max_depth': 7, 'learning_rate': 0.0001, 'n_estimators': 141, 'subsample': 0.9653861927047087, 'colsample_bytree': 0.5661610378030706, 'gamma': 2.2745840669727673, 'lambda': 0.1810908027164429, 'alpha': 0.4217654452881756}. Best is trial 1 with value: 0.9625.
[I 2025-04-23 03:14:13,062] Trial 2 finished with value: 0.96125 and parameters: {'max_depth': 7, 'learning_rate': 0.001, 'n_estimators': 195, 'subsample': 0.8924546737840724, 'colsample_bytree': 0.51

Best trial:
{'max_depth': 10, 'learning_rate': 0.01, 'n_estimators': 184, 'subsample': 0.8822221621816074, 'colsample_bytree': 0.5009989621026224, 'gamma': 1.5235015760732373, 'lambda': 0.02953267178303569, 'alpha': 0.054934964664353166}
