In [2]:
#Fraud Detection with Logistic Regression and Feature Engineering

#You are a data scientist at a financial institution, and your primary task is to develop a fraud detection model using logistic regression. The dataset you have is highly imbalanced, with only a small fraction of transactions being fraudulent. Your objective is to create an effective model by implementing logistic regression and employing various feature engineering techniques to improve the model's performance:

# 1. Data Preparation:

#a. Load the dataset, and provide an overview of the available features, including transaction details, customer information, and labels (fraudulent or non-fraudulent).

#b. Describe the class distribution of fraudulent and non-fraudulent transactions and discuss the imbalance issue.

# Read the card-transaction records from the CSV in the working directory
import pandas as pd

data = pd.read_csv(filepath_or_buffer='card_transdata.csv')

# Bare last expression so the notebook renders the frame as the cell output
data


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0


In [3]:
# Tally how many rows carry each value of the 'fraud' label
class_distribution = data.loc[:, 'fraud'].value_counts()
print(class_distribution)

fraud
0.0    912597
1.0     87403
Name: count, dtype: int64


In [None]:
# 2. Initial Logistic Regression Model:

#a. Implement a basic logistic regression model using the raw dataset.

#b. Evaluate the model's performance using standard metrics like accuracy, precision, recall, and F1-score.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into features (X) and the target variable (y)
X = data.drop(columns=['fraud'])
y = data['fraud']

# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the fraud / non-fraud ratio identical in both splits, which
# matters here because the fraud class is only a small fraction of the rows.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create and train a logistic regression model.
# max_iter is raised above the library default to give the solver more
# iterations to converge on the unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

In [9]:
# Score the test-set predictions with the four standard classification metrics
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Report every score on its own line, rounded to two decimals
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)

Accuracy: 0.96
Precision: 0.89
Recall: 0.60
F1 Score: 0.72


In [7]:
#3. Feature Engineering:
# Example feature: for each row, the number of non-null
# 'distance_from_last_transaction' values among the rows that share its exact
# 'distance_from_home' value.
# NOTE(review): 'distance_from_home' looks continuous in the data preview, so
# most groups presumably hold a single row - confirm this is a meaningful
# "frequency" before relying on it.
data['transaction_frequency'] = (
    data
    .groupby('distance_from_home')['distance_from_last_transaction']
    .transform('count')
)


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise 'distance_from_home' and store the result as a new column.
# NOTE(review): the new column is named 'normalized_amount' although it is
# derived from a distance, not an amount - confirm the intended source column.
scaler = StandardScaler()
data['normalized_amount'] = scaler.fit_transform(
    data['distance_from_home'].to_numpy()[:, None]  # 2-D column vector, as the scaler expects
)


In [17]:
# Check the column names in your DataFrame
print(data.columns)

# Column whose missing values are replaced by the column mean
column_name = 'repeat_retailer'

# Assign the filled Series back to the frame instead of calling
# data[column_name].fillna(..., inplace=True): that form mutates an
# intermediate Series, which triggers a FutureWarning in pandas 2.x and leaves
# the frame unchanged once Copy-on-Write is enabled.
data[column_name] = data[column_name].fillna(data[column_name].mean())



Index(['distance_from_home', 'distance_from_last_transaction',
       'ratio_to_median_purchase_price', 'repeat_retailer', 'used_pin_number',
       'online_order', 'fraud', 'transaction_frequency', 'normalized_amount',
       'used_chip_1.0'],
      dtype='object')


In [15]:
# Example: One-hot encoding a categorical variable.
# drop_first=True drops the first dummy level so the encoded columns are not
# redundant with each other.
# get_dummies replaces 'used_chip' with its dummy column(s), so the guard below
# makes the cell safe to re-run: without it a second execution raises a
# KeyError because 'used_chip' is no longer in the frame.
if 'used_chip' in data.columns:
    data = pd.get_dummies(data, columns=['used_chip'], drop_first=True)


In [20]:
#4. Handle class imbalance: oversample the minority class of the training split with SMOTE
# NOTE(review): the recorded run of this cell failed on the import below with an
# ImportError raised from sklearn internals - presumably the installed
# imbalanced-learn and scikit-learn versions are incompatible; confirm and align them.
from imblearn.over_sampling import SMOTE

# sampling_strategy=0.5 asks for a minority/majority sample ratio of 0.5 after
# resampling. random_state is fixed (42, like the other seeded steps in this
# notebook) so the synthetic samples are reproducible between runs.
oversampler = SMOTE(sampling_strategy=0.5, random_state=42)

# Only the training split is resampled; the test split stays untouched.
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\mevis\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_param_validation.py)

In [22]:
#Q5.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE  # Import SMOTE for oversampling

# Assuming you have already performed the feature engineering steps mentioned earlier.

# Load the dataset with the feature-engineered data
# NOTE(review): no visible cell in this notebook writes
# 'feature_engineered_data.csv' - confirm where this file is produced.
df = pd.read_csv('feature_engineered_data.csv')

# Split the data into features (X) and the target variable (y)
X = df.drop(columns=['fraud'])
y = df['fraud']

# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the fraud / non-fraud ratio identical in both splits, so the
# test set reflects the real class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance the dataset.
# Only the training split is resampled; the test split is left as-is.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Create and train a logistic regression model on the balanced training data
model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Evaluate on the untouched test split with the metrics imported above, so the
# effect of the resampling can be compared with the baseline model.
y_pred_resampled = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_resampled):.2f}")
print(f"Precision: {precision_score(y_test, y_pred_resampled):.2f}")
print(f"Recall: {recall_score(y_test, y_pred_resampled):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred_resampled):.2f}")
print(confusion_matrix(y_test, y_pred_resampled))

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\mevis\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_param_validation.py)

In [21]:
#6. Model Interpretation:

#a. Interpret the coefficients of the logistic regression model and discuss which features have the most influence on fraud detection.

# Pair each feature column of X with the fitted model's coefficient for it
feature_names, coefficients = X.columns, model.coef_

# coefficients[0] is the first row of the coefficient array; the table is
# ordered from the largest (most positive) to the smallest (most negative) value.
coef_df = pd.DataFrame(
    {'Feature': feature_names, 'Coefficient': coefficients[0]}
).sort_values(by='Coefficient', ascending=False)

print(coef_df)

                          Feature  Coefficient
6                    online_order     6.629331
2  ratio_to_median_purchase_price     0.858475
1  distance_from_last_transaction     0.025413
0              distance_from_home     0.015102
3                 repeat_retailer    -0.613539
4                       used_chip    -1.040337
7           transaction_frequency    -5.149107
5                 used_pin_number   -13.203596
