# In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

def create_features(df, is_training=True, *, categories=None):
    """
    Build the model feature matrix from a raw transactions frame.

    The input frame is left untouched: every derived value is computed on
    the side and assembled into a new DataFrame, so the function can be
    called repeatedly on the same frame with identical results.

    Args:
        df: DataFrame providing the columns 'trans_date', 'trans_time',
            'dob', 'category', 'gender' and 'amt'.
        is_training: Unused; kept so existing callers keep working.
        categories: Optional ordered collection of unique category labels.
            A label's position in this collection is its integer code.
            Pass the same collection for train and test data so both
            share one mapping. When omitted, the sorted unique labels
            found in ``df['category']`` are used.

    Returns:
        A new DataFrame indexed like ``df`` with the columns 'amt',
        'trans_hour', 'trans_day', 'age', 'category_code' and 'gender'
        (1 for 'M', 0 otherwise). Missing category values, and labels
        absent from ``categories``, are coded as -1.
    """
    # Time features
    trans_datetime = pd.to_datetime(df['trans_date'] + ' ' + df['trans_time'])
    trans_month = trans_datetime.dt.month
    trans_day = trans_datetime.dt.day

    # Age in completed years: the year difference, minus one when the
    # birthday has not yet been reached in the transaction year.
    dob = pd.to_datetime(df['dob'])
    birthday_pending = (
        (trans_month < dob.dt.month)
        | ((trans_month == dob.dt.month) & (trans_day < dob.dt.day))
    )
    age = trans_datetime.dt.year - dob.dt.year - birthday_pending.astype(int)

    # Category encoding: code = position of the label in `categories`.
    if categories is None:
        categories = sorted(df['category'].dropna().unique())
    category_code = pd.Categorical(
        df['category'], categories=list(categories)
    ).codes.astype('int64')

    # Gender encoding
    gender = (df['gender'] == 'M').astype(int)

    # Assemble from raw arrays so no index alignment takes place.
    return pd.DataFrame(
        {
            'amt': df['amt'].to_numpy(),
            'trans_hour': trans_datetime.dt.hour.to_numpy(),
            'trans_day': trans_day.to_numpy(),
            'age': age.to_numpy(),
            'category_code': category_code,
            'gender': gender.to_numpy(),
        },
        index=df.index,
    )

# End-to-end script: load the CSVs, build features, cross-validate a
# scaler + SMOTE + RandomForest pipeline, fit it on all training rows,
# then write test-set predictions and print feature importances.

# Load data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Prepare features
print("Preparing features...")
# Build features for train and test from one concatenated frame so that
# the category label -> integer code mapping is shared by both sets.
# Encoding each frame separately yields different codes whenever the two
# frames do not contain exactly the same set of categories.
# NOTE: this temporarily holds a second copy of both frames in memory.
n_train = len(train)
all_features = create_features(pd.concat([train, test], ignore_index=True))
X = all_features.iloc[:n_train]
X_test = all_features.iloc[n_train:]
y = train['is_fraud']

# Create pipeline with scaling, SMOTE and RandomForest.
# The scaler is a pipeline step (rather than being fit up front on all
# of X) so that during cross-validation it is re-fit on each training
# fold only; otherwise validation-fold statistics leak into the scaling
# and into SMOTE's nearest-neighbour search. SMOTE is likewise applied
# by the imblearn pipeline during fitting only.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.3)),
    ('classifier', RandomForestClassifier(
        n_estimators=220,
        random_state=42,
        class_weight=None,
        n_jobs=-1
    ))
])

# Perform cross-validation
print("Performing cross-validation...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, scoring='f1', cv=cv)

print("Cross-validation F1 scores:", cv_scores)
print("Average F1 score: {:.4f} (+/- {:.4f})".format(
    cv_scores.mean(), cv_scores.std() * 2))

# Train final model on every training row
print("Training final model...")
pipeline.fit(X, y)

# Make predictions; the fitted pipeline applies the scaler itself
print("Processing test data and making predictions...")
test_predictions = pipeline.predict(X_test)

# Create submission file
print("Creating submission file...")
submission = pd.DataFrame({
    'id': test['id'],
    'is_fraud': test_predictions
})
submission.to_csv('submission_rf_smote.csv', index=False)

# Print prediction distribution
print("\nPrediction distribution in test set:")
print(pd.Series(test_predictions).value_counts(normalize=True))

# Print feature importance (order follows the feature columns of X)
print("\nFeature importance:")
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': pipeline.named_steps['classifier'].feature_importances_
}).sort_values('importance', ascending=False)
print(importance_df.to_string(index=False))
