In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform, randint

### Loading data

In [2]:
train_data = pd.read_csv("C:/Users/johne/Downloads/playground-series-s4e7/train.csv")
test_data = pd.read_csv("C:/Users/johne/Downloads/playground-series-s4e7/test.csv")

# Display the first few rows of each dataframe
print(train_data.head())
print(test_data.head())

   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
0   0    Male   21                1         35.0                   0   
1   1    Male   43                1         28.0                   0   
2   2  Female   25                1         14.0                   1   
3   3  Female   35                1          1.0                   0   
4   4  Female   36                1         15.0                   1   

  Vehicle_Age Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
0    1-2 Year            Yes         65101.0                 124.0      187   
1   > 2 Years            Yes         58911.0                  26.0      288   
2    < 1 Year             No         38043.0                 152.0      254   
3    1-2 Year            Yes          2630.0                 156.0       76   
4    1-2 Year             No         31951.0                 152.0      294   

   Response  
0         0  
1         1  
2         0  
3         0  
4         0  
        

In [7]:


# Split features and target
X = train_data.drop(['Response', 'id'], axis=1)
y = train_data['Response']
test_ids = test_data['id']
X_test = test_data.drop(['id'], axis=1)

# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include='object').columns

# Feature Engineering: Create interaction terms or modify existing features
X['Age_Vintage_Interaction'] = X['Age'] * X['Vintage']
X_test['Age_Vintage_Interaction'] = X_test['Age'] * X_test['Vintage']

# Preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Fit the preprocessor and transform the data
X_transformed = preprocessor.fit_transform(X)
X_test_transformed = preprocessor.transform(X_test)

# Sample a subset of the data for hyperparameter tuning
X_sample, _, y_sample, _ = train_test_split(X_transformed, y, train_size=0.1, stratify=y, random_state=42)

# Define the model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)

# Hyperparameter Tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.7, 0.3)
}

random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=3, scoring='roc_auc', n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_sample, y_sample)
print(f'Best parameters: {random_search.best_params_}')
print(f'Best cross-validation ROC AUC: {random_search.best_score_}')

# Train the final model with best parameters
best_model = random_search.best_estimator_
best_model.fit(X_transformed, y)

# Predict on validation set and evaluate
X_train, X_valid, y_train, y_valid = train_test_split(X_transformed, y, test_size=0.2, random_state=42, stratify=y)
y_valid_pred = best_model.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, y_valid_pred)
print(f'Validation ROC AUC: {roc_auc}')

# Error Analysis
y_valid_pred_labels = best_model.predict(X_valid)
print(classification_report(y_valid, y_valid_pred_labels))
conf_matrix = confusion_matrix(y_valid, y_valid_pred_labels)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Response', 'Response'], yticklabels=['No Response', 'Response'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Predict on the test set
test_predictions = best_model.predict_proba(X_test_transformed)[:, 1]

# Prepare the submission file
submission = pd.DataFrame({'id': test_ids, 'Response': test_predictions})
submission_path = "C:/Users/johne/Downloads/submission.csv"
submission.to_csv(submission_path, index=False)
print(f'Submission file created successfully at {submission_path}')

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters: {'learning_rate': 0.1323306320976562, 'max_depth': 7, 'n_estimators': 138, 'subsample': 0.7873687420594125}
Best cross-validation ROC AUC: 0.8756945223640852


Parameters: { "use_label_encoder" } are not used.

