In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import lightgbm as lgb

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are latitudes/longitudes in decimal degrees. Scalars,
    NumPy arrays and pandas Series are accepted; array-like inputs are
    combined element-wise by the NumPy ufuncs used below.

    Args:
        lat1, lon1: Coordinates of the first point(s), in degrees.
        lat2, lon2: Coordinates of the second point(s), in degrees.

    Returns:
        Distance(s) in kilometres on a sphere of radius 6371 km.
    """
    R = 6371.0  # Earth radius in km
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)

    a = (np.sin(dlat / 2) ** 2
         + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * (np.sin(dlon / 2) ** 2))
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def preprocess_data(df):
    """Add calendar, distance and label-encoded columns to ``df`` in place.

    Columns added: ``trans_date_dt``, ``trans_hour``, ``trans_day``,
    ``trans_month``, ``trans_year``, ``day_of_week`` and ``distance``.
    The columns ``category``, ``gender``, ``state`` and ``job`` are replaced
    by integer codes when they are present.

    NOTE(review): a fresh LabelEncoder is fit on every call, so the integer
    codes of two frames only agree when both contain the same set of values
    for a column -- confirm this holds for the train/test pair.

    Args:
        df: DataFrame with ``trans_date``, ``trans_time``, ``lat``, ``long``,
            ``merch_lat`` and ``merch_long`` columns.

    Returns:
        The same DataFrame object, mutated.
    """
    # Values that cannot be parsed become NaT (errors='coerce'), so the
    # derived calendar fields are NaN for those rows.
    df['trans_date_dt'] = pd.to_datetime(df['trans_date'], errors='coerce')
    parsed_times = pd.to_datetime(df['trans_time'], format='%H:%M:%S', errors='coerce')
    df['trans_hour'] = parsed_times.dt.hour

    calendar_fields = (
        ('trans_day', 'day'),
        ('trans_month', 'month'),
        ('trans_year', 'year'),
        ('day_of_week', 'dayofweek'),
    )
    for column, accessor_attr in calendar_fields:
        df[column] = getattr(df['trans_date_dt'].dt, accessor_attr)

    # Straight-line distance in coordinate degrees between cardholder and
    # merchant. This is a planar approximation, not kilometres;
    # haversine_distance gives the great-circle figure instead.
    lat_gap = df['lat'] - df['merch_lat']
    lon_gap = df['long'] - df['merch_long']
    df['distance'] = np.sqrt(lat_gap ** 2 + lon_gap ** 2)

    # Integer-encode whichever categorical columns this frame carries.
    for name in ('category', 'gender', 'state', 'job'):
        if name not in df.columns:
            continue
        df[name] = LabelEncoder().fit_transform(df[name].astype(str))

    return df

def add_aggregates(train_df, test_df):
    """Attach per-card amount statistics and amount ratios to both frames.

    Statistics of ``amt`` (mean, median, std, count, max, min) are computed
    per ``cc_num`` from ``train_df`` only and left-joined onto both frames,
    so cards that do not occur in ``train_df`` get NaN statistics. Three
    ratio columns (``amt_to_mean``, ``amt_to_median``, ``amt_to_max``) are
    then derived from them.

    Args:
        train_df: Frame with ``cc_num`` and ``amt``; source of the statistics.
        test_df: Frame with ``cc_num`` and ``amt``.

    Returns:
        ``(train_df, test_df)`` as new, merged DataFrames; the inputs are not
        modified.
    """
    stat_to_column = {
        'mean': 'cc_mean',
        'median': 'cc_median',
        'std': 'cc_std',
        'count': 'cc_count',
        'max': 'cc_max',
        'min': 'cc_min',
    }
    ratio_to_stat = {
        'amt_to_mean': 'cc_mean',
        'amt_to_median': 'cc_median',
        'amt_to_max': 'cc_max',
    }

    card_stats = (
        train_df.groupby('cc_num')['amt']
        .agg(list(stat_to_column))
        .rename(columns=stat_to_column)
        .reset_index()
    )

    enriched = []
    for frame in (train_df, test_df):
        merged = frame.merge(card_stats, on='cc_num', how='left')
        for ratio_name, stat_name in ratio_to_stat.items():
            # The small constant keeps the ratio finite when the statistic is 0.
            merged[ratio_name] = merged['amt'] / (merged[stat_name] + 1e-5)
        enriched.append(merged)

    return enriched[0], enriched[1]

def final_cleanup(df, is_train=True):
    """Drop non-feature columns and mean-fill missing numeric features.

    The frame is modified in place. When ``is_train`` is true the target
    column ``is_fraud`` is removed from the frame *before* imputation, so
    labels are never mean-imputed or cast to float.

    Missing values are filled per column with that column's own mean. Columns
    without missing values keep their dtype, and a column that is entirely
    NaN is left as-is because it has no mean to fill with.

    Args:
        df: Frame to clean; mutated in place.
        is_train: If true, split off and return the ``is_fraud`` target.

    Returns:
        ``(df, y)`` when ``is_train`` is true, otherwise ``df``.

    Raises:
        KeyError: If ``is_train`` is true and ``df`` has no ``is_fraud`` column.
    """
    drop_columns = ['trans_num', 'trans_date', 'trans_time', 'first', 'last', 'dob', 'merchant',
                    'street', 'city', 'state', 'zip', 'job', 'trans_date_dt', 'cc_num']
    df.drop(columns=[c for c in drop_columns if c in df.columns], inplace=True)

    # Separate the target first so it is excluded from feature imputation.
    y = df.pop('is_fraud') if is_train else None

    # Means are computed ignoring NaN; all-NaN columns yield NaN and are skipped.
    column_means = df.select_dtypes(include=[np.number]).mean().dropna()
    for col, mean_value in column_means.items():
        if df[col].isna().any():
            df[col] = df[col].fillna(mean_value)

    if is_train:
        return df, y
    return df

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Feature engineering: calendar/distance/encoded columns, then per-card
# amount statistics computed from the training rows.
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

train_df, test_df = add_aggregates(train_df, test_df)
X, y = final_cleanup(train_df, is_train=True)
X_test = final_cleanup(test_df, is_train=False)

# `id` is used further down as the submission key, i.e. it identifies a row
# rather than describing a transaction, so keep it out of the model inputs.
# drop() returns new frames here, leaving test_df['id'] available.
X = X.drop(columns=['id'], errors='ignore')
X_test = X_test.drop(columns=['id'], errors='ignore')

# Feature Scaling (statistics are fit on the training features only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# LightGBM Cross-Validation
# Stratified 5-fold CV: each fold's model predicts its held-out rows, giving
# one out-of-fold (OOF) probability per training row.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
models = []

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'verbose': -1
}

# Upper bound on boosting rounds. With a cap of 1000 the logged runs ended
# with "Did not meet early stopping" while validation loss was still
# falling, so the cap is set high enough for early stopping to be what
# actually ends training.
MAX_BOOST_ROUNDS = 5000
EARLY_STOPPING_ROUNDS = 100

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_scaled, y)):
    X_train_fold, X_val_fold = X_scaled[trn_idx], X_scaled[val_idx]
    y_train_fold, y_val_fold = y.iloc[trn_idx], y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train_fold, label=y_train_fold)
    dval = lgb.Dataset(X_val_fold, label=y_val_fold, reference=dtrain)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_BOOST_ROUNDS,
        valid_sets=[dval],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(100)
        ]
    )

    # Score the held-out rows with the best iteration found on this fold.
    val_preds = model.predict(X_val_fold, num_iteration=model.best_iteration)
    oof_preds[val_idx] = val_preds
    models.append(model)

# Find the best threshold using OOF predictions
# Evaluate F1 on a 0.01 grid; argmax returns the first maximum, so ties go to
# the lowest threshold. If no threshold scores above zero, fall back to 0.5.
thresholds = np.linspace(0, 1, 101)
f1_per_threshold = [
    f1_score(y, (oof_preds > candidate).astype(int)) for candidate in thresholds
]
top_idx = int(np.argmax(f1_per_threshold))
if f1_per_threshold[top_idx] > 0:
    best_f1 = f1_per_threshold[top_idx]
    best_thresh = thresholds[top_idx]
else:
    best_f1 = 0
    best_thresh = 0.5

print("Best Threshold from OOF:", best_thresh)
print("OOF F1 Score:", best_f1)

# Evaluate on OOF data
y_oof_pred = (oof_preds > best_thresh).astype(int)
oof_report = classification_report(y, y_oof_pred)
oof_confusion = confusion_matrix(y, y_oof_pred)
print("Classification Report (OOF):\n", oof_report)
print("Confusion Matrix (OOF):\n", oof_confusion)

# Predict on test data by averaging predictions from all folds
test_preds = np.zeros(len(X_test_scaled))
for m in models:
    test_preds += m.predict(X_test_scaled, num_iteration=m.best_iteration) / len(models)

# Apply the threshold tuned on the out-of-fold predictions.
y_test_pred = (test_preds > best_thresh).astype(int)

# Create submission
# Take the ids straight from the source file: by this point test_df has been
# through numeric preprocessing, which can change the dtype of `id`
# (e.g. int -> float, so ids would be written as "12.0").
submission_ids = pd.read_csv('test.csv', usecols=['id'])['id']
if len(submission_ids) != len(y_test_pred):
    raise ValueError(
        f"Row count mismatch: {len(submission_ids)} ids in test.csv vs "
        f"{len(y_test_pred)} predictions"
    )

submission = pd.DataFrame({'id': submission_ids.to_numpy(), 'is_fraud': y_test_pred})
submission.to_csv('submission.csv', index=False)
print("Submission file created with tuned threshold from OOF predictions.")

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.0417091
[200]	valid's binary_logloss: 0.0257679
[300]	valid's binary_logloss: 0.0225499
[400]	valid's binary_logloss: 0.0209275
[500]	valid's binary_logloss: 0.0200807
[600]	valid's binary_logloss: 0.0196075
[700]	valid's binary_logloss: 0.0192866
[800]	valid's binary_logloss: 0.0189679
[900]	valid's binary_logloss: 0.0187957
[1000]	valid's binary_logloss: 0.0187086
Did not meet early stopping. Best iteration is:
[1000]	valid's binary_logloss: 0.0187086
Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.0406628
[200]	valid's binary_logloss: 0.0251633
[300]	valid's binary_logloss: 0.0224185
[400]	valid's binary_logloss: 0.0209055
[500]	valid's binary_logloss: 0.0202688
[600]	valid's binary_logloss: 0.019621
[700]	valid's binary_logloss: 0.0193418
[800]	valid's binary_logloss: 0.0190651
[900]	valid's binary_logloss: 0.0188436
[1000]	valid's binary_loglo