# **Rug Pull Detection with a Stacking Ensemble**


1. **Setup** (Imports, Plot Style)  
2. **Data Loading**  
3. **Data Preprocessing** (Imputation, Scaling, SMOTE)  
4. **Model Building** (Stacking Classifier)  
5. **Hyperparameter Tuning** (GridSearchCV)  
6. **Evaluation** (Classification Report, ROC Curve, Confusion Matrix)  
7. **Model Saving** (for production use)


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

# Make plots look a bit nicer.
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old aliases were removed in later releases, so pick whichever name
# the installed matplotlib actually provides instead of hard-coding one.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break


## 2. Data Loading


In [None]:
# Location of the raw dataset; a relative path, so it is resolved against the
# notebook's working directory.
DATA_PATH = "rug_pull_dataset.csv"

# Read the whole CSV into a DataFrame that the later cells operate on.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Quick sanity check: report the (rows, columns) shape and preview the first
# rows. `display` is not imported here -- assumes an IPython/Jupyter session.
print(f"Data loaded successfully with shape: {df.shape}")
display(df.head())


## 3. Data Preprocessing


In [None]:

# Remove identifier / timestamp columns when present; the membership filter
# keeps this cell safe to re-run after the columns are already gone.
drop_cols = [col for col in ["token_id", "creation_time"] if col in df.columns]
df.drop(columns=drop_cols, inplace=True)

target = "is_rug_pull"
feature_columns = [col for col in df.columns if col != target]

X = df[feature_columns].copy()
y = df[target].copy()

# Split FIRST. Every step below learns statistics from the data (column means,
# scaling parameters, SMOTE neighbours); fitting them before the split would
# leak test-set information into training and, in the case of SMOTE, would
# place synthetic samples interpolated from training rows into the test set.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values: column means come from the training split only and
# are then re-used unchanged on the test split.
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)
print("Missing values imputed.")

# Scale the features: fit on the training split, apply to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)
print("Features scaled.")

# Handle class imbalance with SMOTE on the TRAINING split only. The test set
# keeps its original class distribution, so the evaluation metrics reflect
# real data rather than synthetic samples.
# NOTE(review): cross-validation run on this resampled training set will still
# be optimistic, because synthetic points can share neighbours across folds;
# an imblearn Pipeline with SMOTE as a step would resample inside each fold.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
print("SMOTE applied. Class distribution after resampling:")
print(y_resampled.value_counts())

# Final training data is the resampled training split.
X_train, y_train = X_resampled, y_resampled
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")


## 4. Model Building


In [None]:

# Base learners. Each one gets the same fixed seed so repeated runs of the
# notebook build identical models.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
rf_model = RandomForestClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# (name, estimator) pairs; the names become the prefixes used to address each
# base model's hyperparameters, e.g. 'xgb__max_depth'.
estimators = list(zip(
    ('xgb', 'rf', 'lr'),
    (xgb_model, rf_model, lr_model),
))

# Stack the base learners under a logistic-regression meta-learner. With cv=5
# the meta-learner is trained on 5-fold cross-validated predictions of the
# base models; n_jobs=-1 requests all available cores for fitting them.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    n_jobs=-1,
    cv=5,
)


## 5. Hyperparameter Tuning


In [None]:

# Candidate values for the XGBoost base learner, keyed by plain parameter name.
_xgb_space = {
    'n_estimators': [100, 250, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],
    'min_child_weight': [1, 3],
}

# Full grid: the meta-learner's regularisation strength plus the XGBoost
# values, prefixed with 'xgb__' so they are routed to that base estimator.
# Size check: 3*3*3*3*2*2*2*2 = 1,296 candidates, each fitted on
# 5 splits x 2 repeats = 10 folds, i.e. 12,960 stacking fits before the
# final refit -- expect a long run.
param_grid = {'final_estimator__C': [0.1, 1.0, 10.0]}
param_grid.update(
    {f'xgb__{name}': values for name, values in _xgb_space.items()}
)

# Stratified 5-fold CV repeated twice, seeded for reproducible splits.
rskf = RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=42)

# Exhaustive search over the grid, ranking candidates by ROC AUC.
grid_search = GridSearchCV(
    stacking_clf,
    param_grid,
    cv=rskf,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)

print("Starting hyperparameter tuning for the stacking classifier...")
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated ROC AUC of the winning candidate.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best ROC AUC score on training folds: {grid_search.best_score_}")


## 6. Evaluation


In [None]:
# The winning configuration from the grid search.
best_model = grid_search.best_estimator_

# Hard class predictions plus the probability assigned to class index 1
# (second column of predict_proba), which drives the ROC analysis.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# --- Text metrics -----------------------------------------------------------
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

roc_auc = roc_auc_score(y_test, y_proba)
print(f"Test ROC AUC Score: {roc_auc:.4f}")

# --- Confusion matrix heatmap -----------------------------------------------
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# --- ROC curve --------------------------------------------------------------
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
# Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc='lower right')
plt.show()


## 7. Model Saving


In [None]:
# Persist every fitted artefact needed to score raw data later. The model was
# trained on features that were first mean-imputed and then standardised, so
# inference must apply the same imputer and scaler, in that order, to columns
# arranged as in `feature_columns`. Saving the scaler without the imputer would
# leave no way to reproduce the preprocessing for rows with missing values.
joblib.dump(best_model, 'stacked_rug_pull_model.pkl')
joblib.dump(imputer, 'imputer.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Ensemble model, imputer and scaler saved successfully.")
