# **Pump-and-Dump Detection Model Notebook**

This notebook builds an XGBoost-based classifier for detecting pump-and-dump schemes. It covers data preparation, a training pipeline that standardises the features and oversamples with SMOTE, hyperparameter tuning with GridSearchCV, evaluation on a held-out test set, plotting, and saving the fitted scaler and model.


## 1. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


## 2. Data Loading and Preparation

In [None]:
# Read the raw dataset and discard the three columns that are not used as
# model inputs ('date', 'pump_index', 'symbol').
df = pd.read_csv('pump_dump_data.csv').drop(columns=['date', 'pump_index', 'symbol'])

# 'gt' is the target column; every other remaining column is a feature.
y = df['gt']
X = df.drop(columns='gt')

# Hold out 20% of the rows as a test set. The split is stratified on the
# label and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


## 3. Pipeline Definition

In [None]:
# Training pipeline; the steps run in the order listed:
#   'scaler' - StandardScaler applied to the features.
#   'smote'  - SMOTE resampling step (seeded with 42).
#   'xgb'    - XGBoost classifier with a binary logistic objective and
#              log-loss as its evaluation metric.
# Pipeline here is imblearn's Pipeline (see the imports cell), which is what
# allows a sampler such as SMOTE to appear as a step.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('xgb', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        # Pin the booster's seed explicitly, matching the seed used for the
        # train/test split, SMOTE and the CV splitter, instead of relying on
        # the library default. Row/column subsampling (subsample /
        # colsample_bytree below 1.0) draws on this seed.
        random_state=42,
    )),
])

## 4. Hyperparameter Tuning

In [None]:
# Search space for the classifier. Each key is '<step name>__<parameter>', so
# every entry targets the 'xgb' step of the pipeline.
# 3 * 3 * 3 * 2 * 2 = 108 candidate settings.
param_grid = dict(
    xgb__n_estimators=[100, 200, 300],
    xgb__max_depth=[3, 5, 7],
    xgb__learning_rate=[0.01, 0.1, 0.2],
    xgb__subsample=[0.8, 1.0],
    xgb__colsample_bytree=[0.8, 1.0],
)

# Shuffled, seeded 5-fold stratified splitter used to score each candidate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by ROC AUC, using all available
# cores (n_jobs=-1) and printing progress (verbose=1).
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score.
print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation ROC AUC: ", grid_search.best_score_)

## 5. Model Evaluation

In [None]:
# --- Predictions on the held-out test set --------------------------------
# Column 1 of predict_proba is taken as the score for the ROC computations;
# predict supplies the hard labels for the report and confusion matrix.
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
y_pred = grid_search.predict(X_test)

# --- Scalar metrics and text report ---------------------------------------
roc_auc = roc_auc_score(y_test, y_pred_prob)
print("Test ROC AUC: ", roc_auc)
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Confusion matrix heatmap (saved to confusion_matrix.png) -------------
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title("Confusion Matrix")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
fig_cm.tight_layout()
fig_cm.savefig("confusion_matrix.png")
plt.show()

# --- ROC curve (saved to roc_curve.png) -----------------------------------
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
fig_roc, ax_roc = plt.subplots(figsize=(6, 4))
ax_roc.plot(fpr, tpr, label="ROC curve (area = {:.2f})".format(roc_auc))
# Dashed grey diagonal from (0, 0) to (1, 1) drawn as a visual reference.
ax_roc.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("Receiver Operating Characteristic")
ax_roc.legend(loc="lower right")
fig_roc.tight_layout()
fig_roc.savefig("roc_curve.png")
plt.show()





## 6. Saving the Model

In [None]:
# Take the best pipeline found by the grid search and pull out the two steps
# that are written to disk. Only 'scaler' and 'xgb' are persisted; the
# 'smote' step is not saved.
best_pipeline = grid_search.best_estimator_
scaler = best_pipeline.named_steps['scaler']
model = best_pipeline.named_steps['xgb']

# Write each artefact to its own joblib file in the working directory.
for artifact, filename in ((scaler, "scaler.joblib"), (model, "model.joblib")):
    joblib.dump(artifact, filename)

print("Scaler and model saved as 'scaler.joblib' and 'model.joblib'.")
