# **Rug Pull detection model**

## 1. Loading the dataset

In [None]:
import pandas as pd

# Path of the CSV file holding the dataset, resolved relative to the
# current working directory.
DATA_PATH = "rug_pull_dataset.csv"

# Load the whole file into a DataFrame, then print its (rows, columns)
# shape and its first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"Data loaded successfully with shape: {df.shape}")
print(df.head(n=5))


## 2. Data preprocessing


### 2.1. Removing missing data 

In [None]:
# Remove the identifier/timestamp columns when they are present
# (presumably metadata rather than model features -- confirm against the
# dataset description). The membership test keeps this cell safe to re-run
# after the columns have already been dropped.
drop_cols = [col for col in ["token_id", "creation_time"] if col in df.columns]
df.drop(columns=drop_cols, inplace=True)

# Everything except the label column is treated as a feature.
target = "is_rug_pull"
feature_columns = [col for col in df.columns if col != target]

# Count missing values per feature and report them *before* the affected
# rows are discarded, so the amount of data lost to dropna() is visible.
missing_vals = df[feature_columns].isnull().sum()
missing_report = missing_vals[missing_vals > 0]
if missing_report.empty:
    print("No missing feature values found.")
else:
    print("Missing values per feature:")
    print(missing_report)

# Drop every row that has a missing value in any remaining column
# (features or target).
df.dropna(inplace=True)
print(f"After dropping missing values, new shape: {df.shape}")

# Independent copies, so later transformations cannot modify df.
X = df[feature_columns].copy()
y = df[target].copy()


### 2.2. Scaling features

In [None]:
from sklearn.preprocessing import StandardScaler


# Standardise every feature column: fit() learns the per-column mean and
# standard deviation from X, transform() applies them. The fitted scaler
# object is kept because it is persisted to disk at the end of the notebook.
# NOTE(review): the statistics are learned from all rows, i.e. before the
# train/test split performed in section 2.4.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("Data scaled successfully")


### 2.3. Using SMOTE to address class imbalance in the dataset
Rows labeled "1" (i.e. rows that constitute a rug pull event) represent only 15% of the dataset, so we use SMOTE to address this class imbalance.

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE sampler with a fixed seed so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

# Generate synthetic minority-class samples from the scaled features.
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Show how many samples each class has once resampling is done.
print("SMOTE applied, here is the class distribution after resampling:")
class_counts = pd.Series(y_resampled).value_counts()
print(class_counts)


### 2.4. Splitting the data with 80% for training and 20% for test

In [None]:
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV

# Split the scaled, *un-resampled* data first so the held-out 20% contains
# only real observations. Splitting after oversampling would place synthetic
# samples in the test set and let points interpolated from test rows leak
# into training, which inflates every evaluation metric.
# stratify=y keeps the original class ratio in both partitions.
X_train_real, X_test, y_train_real, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class on the training portion only.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_real, y_train_real)

print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print("Training class distribution after SMOTE:")
print(pd.Series(y_train).value_counts())

## 3. Setting up the XGBoost Classifier

In [None]:
from sklearn.model_selection import  RepeatedStratifiedKFold, GridSearchCV
import xgboost as xgb

# Base estimator that the grid search clones for every candidate; the seed
# keeps the boosting runs reproducible.
xgb_clf = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# Search space: 3 * 3 * 3 * 2 * 2 * 2 * 2 = 432 candidate combinations.
param_grid = dict(
    n_estimators=[100, 250, 500],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 1],
    min_child_weight=[1, 3],
)

# 5 stratified folds repeated twice -> 10 fits per candidate, which gives a
# steadier score estimate than a single 5-fold pass.
rskf = RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=42)

# Exhaustive search ranked by ROC AUC, parallelised over all available cores.
grid_search = GridSearchCV(
    xgb_clf,
    param_grid,
    cv=rskf,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)

# Run the search; with the default refit, the best candidate is retrained
# on the full training set once the search finishes.
print("Starting hyperparameter tuning using GridSearchCV with repeated stratified folds...")
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated ROC AUC of the winning candidate.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best ROC AUC score on training folds: {grid_search.best_score_}")

## 4. Model Evaluation

### 4.1. Evaluating the model on the testing set

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Estimator refitted by GridSearchCV with the best hyperparameters found.
best_model = grid_search.best_estimator_

# Hard class predictions, plus the predicted probability of the second
# class (column 1) -- the rug-pull class, assuming labels are 0/1.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1 on the held-out test set.
# Printed once, under its heading (it was previously emitted twice).
report = classification_report(y_test, y_pred)
print("Classification Report:\n" + report)

# ROC AUC is computed from the probabilities, not the hard predictions,
# so it is independent of the decision threshold.
roc_auc = roc_auc_score(y_test, y_proba)
print(f"Test ROC AUC Score: {roc_auc:.4f}")


### 4.2. Plotting the confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of test-set predictions: rows are the actual classes,
# columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap with integer cell counts, then label
# the axes through the Axes object that seaborn returns.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

### 4.3. Plotting the ROC curve

In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates at every decision threshold, derived from the
# predicted probabilities on the test set.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

ax = plt.gca()
# Model curve, with its AUC shown in the legend.
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
)
ax.legend(loc='lower right')
plt.show()

### 4.4. Plotting feature importance

In [None]:
# Plot the ten features with the highest 'gain' importance, then replace
# the plot's default title through the Axes object that XGBoost returns.
importance_ax = xgb.plot_importance(
    best_model,
    importance_type='gain',
    max_num_features=10,
)
importance_ax.set_title("Top 10 Feature Importances")
plt.show()

## 5. Saving the trained model and the scaler to disk

In [None]:
import joblib

# Persist the tuned model together with the fitted scaler, so the same
# scaling can be applied to new inputs before the saved model scores them.
artifacts = {
    'xgb_rug_pull_model_improved.pkl': best_model,
    'scaler_improved.pkl': scaler,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)
print("Model and scaler saved successfully.")