In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report

# Step 1: Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Step 2: Feature Engineering
train_copy = train.copy()
test_copy = test.copy()

# Extract day, month, and hour from date/time columns
for df in [train_copy, test_copy]:
    df['trans_date'] = pd.to_datetime(df['trans_date'], errors='coerce')
    df['trans_time'] = pd.to_datetime(df['trans_time'], errors='coerce', format='%H:%M:%S').dt.hour
    df['day'] = df['trans_date'].dt.day
    df['month'] = df['trans_date'].dt.month
    df['year'] = df['trans_date'].dt.year

# Calculate age
for df in [train_copy, test_copy]:
    df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
    df['age'] = df['year'] - df['dob'].dt.year

# Calculate distance between cardholder and merchant
for df in [train_copy, test_copy]:
    df['distance'] = df.apply(
        lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).km, axis=1
    )

# Encode categorical variables
label_encoders = {}
for col in ['category', 'state', 'gender', 'job']:
    le = LabelEncoder()
    le.fit(train_copy[col].fillna('Unknown'))
    train_copy[col] = le.transform(train_copy[col].fillna('Unknown'))
    test_copy[col] = le.transform(test_copy[col].fillna('Unknown'))
    label_encoders[col] = le

# Drop unnecessary columns
columns_to_drop = ['trans_num', 'trans_date', 'cc_num', 'first', 'last', 'street', 'city', 'zip', 'dob', 'merchant']
train_copy = train_copy.drop(columns=columns_to_drop)
test_copy = test_copy.drop(columns=columns_to_drop)

# Step 3: Train-Test Split
X = train_copy.drop(columns=['is_fraud'])
y = train_copy['is_fraud']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Feature Scaling
scaler = StandardScaler()
numerical_cols = ['amt', 'distance', 'age']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])
test_copy[numerical_cols] = scaler.transform(test_copy[numerical_cols])

# Step 5: Add Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = pd.DataFrame(poly.fit_transform(X_train[numerical_cols]), columns=poly.get_feature_names_out(numerical_cols))
X_valid_poly = pd.DataFrame(poly.transform(X_valid[numerical_cols]), columns=poly.get_feature_names_out(numerical_cols))
test_poly = pd.DataFrame(poly.transform(test_copy[numerical_cols]), columns=poly.get_feature_names_out(numerical_cols))

# Drop original numerical columns to avoid duplication
X_train = X_train.drop(columns=numerical_cols)
X_valid = X_valid.drop(columns=numerical_cols)
test_copy = test_copy.drop(columns=numerical_cols)

# Merge polynomial features with remaining features
X_train = pd.concat([X_train.reset_index(drop=True), X_train_poly.reset_index(drop=True)], axis=1)
X_valid = pd.concat([X_valid.reset_index(drop=True), X_valid_poly.reset_index(drop=True)], axis=1)
test_copy = pd.concat([test_copy.reset_index(drop=True), test_poly.reset_index(drop=True)], axis=1)

# Step 6: Model Training with LightGBM
model = LGBMClassifier(
    num_leaves=31,
    max_depth=7,
    learning_rate=0.05,
    n_estimators=200,
    subsample=0.8,
    random_state=42
)

# Evaluate with cross-validation
cv_scores = cross_val_score(model, X, y, scoring='f1', cv=5)
print("Cross-validated F1 Score:", np.mean(cv_scores))

# Train the model
model.fit(X_train, y_train)

# Step 7: Validate the Model
y_pred = model.predict(X_valid)
print("F1 Score:", f1_score(y_valid, y_pred))
print("Classification Report:")
print(classification_report(y_valid, y_pred))

# Step 8: Analyze Feature Importance
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances)

# Drop features with low importance
low_importance_features = feature_importances[feature_importances['Importance'] < 1]['Feature']
X_train = X_train.drop(columns=low_importance_features)
X_valid = X_valid.drop(columns=low_importance_features)
test_copy = test_copy.drop(columns=low_importance_features)

# Retrain the model with reduced features
model.fit(X_train, y_train)

# Step 9: Make Predictions on Test Data
test_predictions = model.predict(test_copy)

# Step 10: Create Submission File
submission = sample_submission.copy()
submission['is_fraud'] = test_predictions
submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' created successfully!")

[LightGBM] [Info] Number of positive: 33839, number of negative: 262723
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2738
[LightGBM] [Info] Number of data points in the train set: 296562, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114104 -> initscore=-2.049486
[LightGBM] [Info] Start training from score -2.049486
[LightGBM] [Info] Number of positive: 33839, number of negative: 262723
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2738
[LightGBM] [Info] Number of data points in the train set: 296562, number of used features: 16
[LightGBM] [In