In [1]:
#Task 3 - Feature Engineering
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load your dataset
df = pd.read_csv('../Data/data.csv')

In [4]:
#Task 3 - Feature Engineering

# 1. Create Aggregate Features
df['Total_Transaction_Amount'] = df.groupby('CustomerId')['Amount'].transform('sum')
df['Average_Transaction_Amount'] = df.groupby('CustomerId')['Amount'].transform('mean')
df['Transaction_Count'] = df.groupby('CustomerId')['TransactionId'].transform('count')
df['Std_Dev_Transaction_Amount'] = df.groupby('CustomerId')['Amount'].transform('std')

# 2. Extract Features from TransactionStartTime
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
df['Transaction_Hour'] = df['TransactionStartTime'].dt.hour
df['Transaction_Day'] = df['TransactionStartTime'].dt.day
df['Transaction_Month'] = df['TransactionStartTime'].dt.month
df['Transaction_Year'] = df['TransactionStartTime'].dt.year

# 3. Encode Categorical Variables
# Check if these columns exist in your DataFrame
categorical_features_list = ['ProductCategory', 'CurrencyCode', 'FraudResult', 'TransactionId'] 
# Only include 'MerchantName' if it's in the DataFrame's columns and hasn't been one-hot encoded yet
if 'MerchantName' in df.columns and 'MerchantName_1' not in df.columns:  # Check for one-hot encoded version
    categorical_features_list.append('MerchantName')

# Create a copy of the DataFrame for categorical features BEFORE one-hot encoding
categorical_features = df[categorical_features_list].copy() if all(col in df.columns for col in categorical_features_list) else pd.DataFrame()

# Now apply one-hot encoding to the original DataFrame, but only if the columns haven't been encoded already
columns_to_encode = [col for col in categorical_features.drop('TransactionId', axis=1, errors='ignore').columns if col in df.columns and f'{col}_1' not in df.columns]
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

# 4. Handle Missing Values
# Impute numerical columns with mean, EXCLUDING non-numeric columns
numeric_columns = df.select_dtypes(include=np.number).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())  
# Alternatively, drop if few
df.dropna(inplace=True)

# 5. Normalize/Standardize Numerical Features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Redefine numerical_features to include only the current numerical columns
numerical_features = df.select_dtypes(include=np.number).columns  
# Exclude the one-hot encoded and other engineered features
numerical_features = [f for f in numerical_features if f in ['CountryCode', 'Amount', 'Value']]  
# Now apply scaling 
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [3]:
def calculate_rfms(df):
    rfm = df.groupby('CustomerId').agg({
        'Amount': ['sum', 'count'],  # Total Revenue, Count of Orders
        'TransactionStartTime': lambda x: (pd.Timestamp.now().tz_localize(None) - x.max().tz_localize(None)).days  # Recency as days since last transaction, ensuring both are tz-naive
    })

    rfm.columns = ['Total_Spent', 'Total_Transactions', 'Recency']
    # Score assignment could be expanded based on business logic
    rfm['RFMS_Score'] = (rfm['Recency'] + rfm['Total_Spent'] + rfm['Total_Transactions']).mean()
    return rfm

rfms = calculate_rfms(df)
print(rfms)

                 Total_Spent  Total_Transactions  Recency   RFMS_Score
CustomerId                                                            
CustomerId_1       -0.135580                   1     2260  2232.578568
CustomerId_10      -0.135580                   1     2260  2232.578568
CustomerId_1001    -0.110207                   5     2265  2232.578568
CustomerId_1002    -0.565027                  11     2202  2232.578568
CustomerId_1003    -0.164688                   6     2188  2232.578568
...                      ...                 ...      ...          ...
CustomerId_992     -0.164688                   6     2181  2232.578568
CustomerId_993     -0.110207                   5     2202  2232.578568
CustomerId_994     -1.091831                 101     2177  2232.578568
CustomerId_996      0.201098                  17     2244  2232.578568
CustomerId_998     -0.387593                  22     2176  2232.578568

[3742 rows x 4 columns]


In [5]:
#WoE Binning
#To perform Weight of Evidence (WoE) binning, use a library or apply manually:

#pip install category_encoders
from category_encoders import WOEEncoder

# Assuming 'FraudResult' was your target variable and was one-hot encoded
# Find the one-hot encoded column related to 'FraudResult'
fraud_result_col = [col for col in df.columns if col.startswith('FraudResult_')]

# Check if 'FraudResult' was one-hot encoded
if fraud_result_col:
    # Use the one-hot encoded column for the target
    target_column = fraud_result_col[0] 
else:
    # If not one-hot encoded, check if it still exists
    if 'FraudResult' in df.columns:
        target_column = 'FraudResult'
    else:
        # If 'FraudResult' doesn't exist, provide feedback and handle appropriately
        raise KeyError("The 'FraudResult' column is not found in the DataFrame. Please check your data and preprocessing steps.")

# Find the one-hot encoded columns related to 'ProductCategory'
product_category_cols = [col for col in df.columns if col.startswith('ProductCategory_')]

# If there are one-hot encoded columns, use them for WOE encoding
if product_category_cols:
    woe = WOEEncoder(cols=product_category_cols)  
    # Use the target_column for fitting
    woe_bin = woe.fit_transform(df[product_category_cols], df[target_column])
    # Create a new column for the combined WOE values
    df['Binned_Feature'] = woe_bin.sum(axis=1)  
else:
    # If 'ProductCategory' still exists as a single column, use the original code
    woe = WOEEncoder(cols=['ProductCategory']) 
    # Use the target_column for fitting 
    woe_bin = woe.fit_transform(df['ProductCategory'], df[target_column])
    df['Binned_Feature'] = woe_bin

In [6]:
#Task 4 - Modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Splitting the Data
# Assuming 'FraudResult' was one-hot encoded as 'FraudResult_1'
# Adjust this if the encoded column has a different name
# Drop 'TransactionId' and all columns starting with 'BatchId'
X = df.drop(['FraudResult_1', 'TransactionId'] + [col for col in df.columns if col.startswith('BatchId')], axis=1) 

# If you need the original 'FraudResult' values for 'y', you'll need to
# reconstruct them from the one-hot encoded columns if they were encoded.
# For example, if 'FraudResult_1' represents the positive class:
y = df['FraudResult_1'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 4. Handle Missing Values
# Impute numerical columns with mean, EXCLUDING non-numeric columns
numeric_columns = df.select_dtypes(include=np.number).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())  
# Alternatively, drop if few
df.dropna(inplace=True)

# ----> Convert all columns to numeric before model fitting
X = X.apply(pd.to_numeric, errors='coerce').fillna(0) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a model (Random Forest and Logistic Regression)
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Evaluation Metrics
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"F1 Score: {f1_score(y_test, y_pred)}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred)}")
    print("\n")
    
# Example Hyperparameter Tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

Model: Logistic Regression
Accuracy: 0.9981184341190613
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
ROC AUC: 0.5




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Random Forest
Accuracy: 0.99989546856217
Precision: 0.9722222222222222
Recall: 0.9722222222222222
F1 Score: 0.9722222222222222
ROC AUC: 0.9860849289882647


Best parameters found:  {'max_depth': None, 'n_estimators': 100}


In [9]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Initialize Flask app
app = Flask(__name__)

# Attempt to load model, encoder, and features
try:
    model, le, categorical_features = joblib.load('random_forest_model.pkl')
    print("Model and dependencies loaded successfully.")
except FileNotFoundError:
    print("Model file not found. Creating a new model...")
    # Example training data (replace with actual training data)
    X_train = pd.DataFrame({'feature1': ['A', 'B', 'C'], 'feature2': [4, 5, 6]})
    y_train = [0, 1, 0]
    categorical_features = ['feature1']  # Define your categorical features

    # Initialize and fit LabelEncoder
    le = LabelEncoder()
    X_train['feature1'] = le.fit_transform(X_train['feature1'])  # Encode categorical data

    # Train RandomForestClassifier
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    # Save the model, encoder, and features together
    joblib.dump((model, le, categorical_features), 'random_forest_model.pkl')
    print("New model saved as random_forest_model.pkl.")

# Define API endpoint
@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Parse JSON input
        data = request.json
        data_df = pd.DataFrame(data, index=[0])  # Convert to DataFrame

        # Encode categorical features
        for col in categorical_features:
            if col in data_df.columns:
                data_df[col] = le.transform(data_df[col])

        # Make prediction
        prediction = model.predict(data_df)
        return jsonify({'prediction': int(prediction[0])})
    except Exception as e:
        return jsonify({'error': str(e)})

# Run the Flask app
if __name__ == '__main__':
    app.run(debug=True)



ValueError: too many values to unpack (expected 3)