In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
file_path = 'credit_card_fraud_dataset.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Selecting fraud and non-fraud transactions
fraud_data = df[df['IsFraud'] == 1]
non_fraud_data = df[df['IsFraud'] == 0].sample(n=2000, random_state=42)

# Combine fraud and sampled non-fraud data
balanced_data = pd.concat([fraud_data, non_fraud_data])

# Drop irrelevant columns
balanced_data = balanced_data.drop(columns=['TransactionID', 'TransactionDate'])

# Convert categorical columns to numerical using one-hot encoding
balanced_data = pd.get_dummies(balanced_data, columns=['TransactionType', 'Location'], drop_first=True)

# Separate features and target variable
X = balanced_data.drop(columns=['IsFraud'])
y = balanced_data['IsFraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train a RandomForest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("Classification Report:\n", report)

# Save the trained model
model_filename = 'fraud_detection_model.pkl'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")


Model Accuracy: 0.6116666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.83      0.74       400
           1       0.34      0.17      0.23       200

    accuracy                           0.61       600
   macro avg       0.50      0.50      0.48       600
weighted avg       0.56      0.61      0.57       600

Model saved to fraud_detection_model.pkl


In [14]:
import pandas as pd
import joblib

# Load the trained model
model_filename = 'fraud_detection_model.pkl'
model = joblib.load(model_filename)

# Define the function to preprocess input data and make predictions
def preprocess_and_predict(input_data):
    # Convert input data to DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Drop irrelevant columns if present
    if 'TransactionID' in input_df.columns:
        input_df = input_df.drop(columns=['TransactionID'])
    if 'TransactionDate' in input_df.columns:
        input_df = input_df.drop(columns=['TransactionDate'])
    
    # Convert categorical columns to numerical using one-hot encoding
    input_df = pd.get_dummies(input_df, columns=['TransactionType', 'Location'], drop_first=True)
    
    # Ensure the input data has the same columns as the training data
    # Create a DataFrame with the same columns as the training data
    training_columns = model.feature_names_in_
    input_df = input_df.reindex(columns=training_columns, fill_value=0)
    
    # Make prediction
    prediction = model.predict(input_df)
    probability = model.predict_proba(input_df)
    
    return prediction[0], probability[0][1]


# Example usage
input_data = {    
    'Amount': 20001.32,
    'MerchantID': 826,
    'TransactionType': 'purchase',
    'Location': 'San Jose'
}

prediction, fraud_probability = preprocess_and_predict(input_data)
print("Prediction (1 for fraud, 0 for non-fraud):", prediction)
print("Probability of being fraud:", fraud_probability)

Prediction (1 for fraud, 0 for non-fraud): 1
Probability of being fraud: 0.58


In [17]:
input_data = {    
    'Amount': 20001.32,
    'MerchantID': 826,
    'TransactionType': 'purchase',
    'Location': 'San Jose'
}

print(type(input_data['Amount']))

<class 'float'>
