# Creating a Master Function for Data Science Project

In [44]:
# Import libraries
import pandas as pd
import numpy as np
from datasets import load_dataset
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go 
import warnings 
warnings.filterwarnings('ignore')


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score

In [45]:
# Load only the first 10,000 rows of the credit-card CSV into a DataFrame
cc_df = pd.read_csv(
    'creditcard.csv',
    nrows=10000,
)

In [53]:
# Define function name and variables
def build_and_evaluate_model(model_name, test_size, balancing_technique, scaling_technique, data):
    """Train one classifier on a rebalanced, scaled split of ``data`` and print its evaluation.

    The function splits ``data`` into train and test parts, rebalances the
    training part only, scales the ``Amount`` and ``Time`` columns, runs
    5-fold cross-validation on the rebalanced training data, fits the model
    and prints a confusion matrix and classification report for the test part.

    Parameters
    ----------
    model_name : str
        ``'rf'`` (random forest), ``'svm'`` (support vector machine) or
        ``'xgb'`` (XGBoost). Case-insensitive.
    test_size : float or str
        Converted with ``float()`` and passed to ``train_test_split``.
    balancing_technique : str
        ``'smote'`` or ``'nearmiss'``. Case-insensitive.
    scaling_technique : str
        ``'standardscaler'``, ``'minmaxscaler'`` or ``'robustscaler'``.
        Case-insensitive.
    data : pandas.DataFrame
        Must contain a ``Class`` target column plus ``Amount`` and ``Time``
        feature columns. It is not modified.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the balancing technique, scaling technique or model name is not
        recognised. All three are checked before any data is processed.
    """
    balancing_key = balancing_technique.lower()
    scaling_key = scaling_technique.lower()
    model_key = model_name.lower()

    # Lookup tables hold zero-argument factories so that only the selected
    # estimator is ever constructed.
    sampler_factories = {
        # Seeded so repeated runs produce the same synthetic samples.
        'smote': lambda: SMOTE(k_neighbors=5, random_state=42),
        'nearmiss': lambda: NearMiss(),
    }
    scaler_factories = {
        'standardscaler': lambda: StandardScaler(),
        'minmaxscaler': lambda: MinMaxScaler(),
        'robustscaler': lambda: RobustScaler(),
    }
    model_factories = {
        'rf': lambda: RandomForestClassifier(random_state=42),
        'svm': lambda: SVC(random_state=42),
        'xgb': lambda: XGBClassifier(random_state=42),
    }

    # Fail fast on bad options instead of after the split and resampling.
    if balancing_key not in sampler_factories:
        raise ValueError('Invalid balance technique. Use smote or nearmiss.')
    if scaling_key not in scaler_factories:
        raise ValueError('Invalid scaling technique. Use standardscaler, minmaxscaler, or robustscaler.')
    if model_key not in model_factories:
        raise ValueError("Invalid model name. Use rf for Random Forest, svm for Support Vector Machine, or xgb for XGBoost")

    scaled_columns = ['Amount', 'Time']

    # Data Preprocessing
    ## Split data
    X = data.drop('Class', axis=1)
    y = data['Class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=float(test_size), random_state=42)

    ## Balance data (training split only; the test split keeps its original class ratio)
    sampler = sampler_factories[balancing_key]()
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    ## Scale data
    # The scaler is fitted on the training data only and then re-used for the
    # test data, so both splits share the same scaling parameters.
    scaler = scaler_factories[scaling_key]()
    X_resampled[scaled_columns] = scaler.fit_transform(X_resampled[scaled_columns])
    # Work on a copy so the column assignment never writes through to ``data``.
    X_test = X_test.copy()
    X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

    # Build Models
    model = model_factories[model_key]()

    # Cross Validation
    # NOTE: the folds are drawn from the already rebalanced training data, so
    # these scores are not comparable to the hold-out metrics printed below.
    cv_scores = cross_val_score(model, X_resampled, y_resampled, cv=5)

    # Fit Models
    model.fit(X_resampled, y_resampled)

    # Evaluate Models
    y_pred = model.predict(X_test)

    # Print Results
    print(f"Model: {model_name}")
    print(f"Balancing Technique: {balancing_technique}")
    print(f"Test Size: {test_size}")
    print(f"Scaling Technique: {scaling_technique}")
    print("-" * 50)
    print(f'Cross Validations Scores: {cv_scores}')
    print(f'Mean Cross Validations Scores: {np.mean(cv_scores)}')
    print("-" * 50)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [56]:
# Run the pipeline: random forest, 30% hold-out, SMOTE balancing, standard scaling
build_and_evaluate_model(
    model_name='rf',
    test_size='0.3',
    balancing_technique='smote',
    scaling_technique='standardscaler',
    data=cc_df,
)

Model: rf
Balancing Technique: smote
Test Size: 0.3
Scaling Technique: standardscaler
--------------------------------------------------
Cross Validations Scores: [1. 1. 1. 1. 1.]
Mean Cross Validations Scores: 1.0
--------------------------------------------------
Confusion Matrix:
[[2985    0]
 [   2   13]]
--------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2985
           1       1.00      0.87      0.93        15

    accuracy                           1.00      3000
   macro avg       1.00      0.93      0.96      3000
weighted avg       1.00      1.00      1.00      3000

