In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

fraud_data = pd.read_csv('C:/Users/nejat/AIM Projects/week8 data/Fraud_Data.csv')
ip_data = pd.read_csv('C:/Users/nejat/AIM Projects/week8 data/IpAddress_to_Country.csv')

fraud_data_clean = pd.read_csv('C:/Users/nejat/AIM Projects/week8 data/Fraud_Data_cleaned.csv')
ip_clean = pd.read_csv('C:/Users/nejat/AIM Projects/week8 data/ip_cleaned.csv')


In [13]:
X_fraud = fraud_data_clean.drop(columns=['class'])
y_fraud = fraud_data_clean['class']

X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=42)



In [12]:
print(ip_clean.columns)


Index(['lower_bound_ip_address', 'upper_bound_ip_address', 'country'], dtype='object')


In [15]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc}")
    print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
    print(f"Classification Report: \n{classification_report(y_test, y_pred)}")
    
    return acc

In [21]:
def preprocess_data(X_train, X_test):
    date_cols = [col for col in X_train.columns if 'date' in col.lower() or pd.api.types.is_datetime64_any_dtype(X_train[col])]
    for col in date_cols:
        X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
        X_test[col] = pd.to_datetime(X_test[col], errors='coerce')

        X_train[f'{col}_year'] = X_train[col].dt.year
        X_train[f'{col}_month'] = X_train[col].dt.month
        X_test[f'{col}_year'] = X_test[col].dt.year
        X_test[f'{col}_month'] = X_test[col].dt.month

        X_train.drop(columns=[col], inplace=True)
        X_test.drop(columns=[col], inplace=True)

    for col in X_train.select_dtypes(include=['object']).columns:
        X_train[col], uniques = pd.factorize(X_train[col])
        X_test[col] = pd.Categorical(X_test[col], categories=uniques).codes


    X_train.fillna(0, inplace=True)
    X_test.fillna(0, inplace=True)

    X_train = X_train.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.apply(pd.to_numeric, errors='coerce')

    return X_train, X_test


X_fraud_train, X_fraud_test = preprocess_data(X_fraud_train, X_fraud_test)


In [22]:
X_fraud_train, X_fraud_test = preprocess_data(X_fraud_train, X_fraud_test)

for name, model in models:
    print(f"Training and evaluating {name}")
    train_and_evaluate(model, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)


Training and evaluating Logistic Regression
Accuracy: 0.9069572506286673
Confusion Matrix: 
[[41116     1]
 [ 4217     0]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     41117
           1       0.00      0.00      0.00      4217

    accuracy                           0.91     45334
   macro avg       0.45      0.50      0.48     45334
weighted avg       0.82      0.91      0.86     45334

Training and evaluating Decision Tree
Accuracy: 0.0944986103145542
Confusion Matrix: 
[[   73 41044]
 [    6  4211]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.92      0.00      0.00     41117
           1       0.09      1.00      0.17      4217

    accuracy                           0.09     45334
   macro avg       0.51      0.50      0.09     45334
weighted avg       0.85      0.09      0.02     45334

Training and evaluating Random Forest
Accuracy: 0.32642167

In [30]:
!pip install mlflow --timeout=100



Collecting mlflow
  Using cached mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.0 (from mlflow)
  Using cached mlflow_skinny-2.17.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting waitress<4 (from mlflow)
  Using cached waitress-3.0.0-py3-none-any.whl.metadata (4.2 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.17.0->mlflow)
  Using cached cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.0->mlflow)
  Using cached databricks_sdk-0.35.0-py3-none-any.whl.metadata (38 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.17.0->mlflow)
  Using cached opentelemetr

In [31]:
import mlflow
import mlflow.sklearn

def mlflow_tracking(model_name, model, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        
        # Log parameters, metrics, and model
        mlflow.log_param('Model', model_name)
        mlflow.log_metric('Accuracy', acc)
        mlflow.sklearn.log_model(model, model_name)
        
        print(f"Model {model_name} logged with accuracy: {acc}")


In [32]:
mlflow_tracking('Logistic Regression', LogisticRegression(max_iter=1000), X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)
mlflow_tracking('Random Forest', RandomForestClassifier(n_estimators=100), X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)




Model Logistic Regression logged with accuracy: 0.9069572506286673




Model Random Forest logged with accuracy: 0.4509639564124057
