In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
import numpy as np


### Loading data

In [64]:
# Input CSVs for the machine currently in use ("Comp 2").
# Alternative locations used on "Comp 1":
#   C:/Users/johne/Downloads/playground-series-s4e7/train.csv
#   C:/Users/johne/Downloads/playground-series-s4e7/test.csv
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/matt/Downloads/train.csv/train.csv",
        "C:/Users/matt/Downloads/test.csv/test.csv",
    )
)

# Preview the first rows of both frames (training data first)
print(train_data.head(), test_data.head(), sep="\n")

   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
0   0    Male   21                1         35.0                   0   
1   1    Male   43                1         28.0                   0   
2   2  Female   25                1         14.0                   1   
3   3  Female   35                1          1.0                   0   
4   4  Female   36                1         15.0                   1   

  Vehicle_Age Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
0    1-2 Year            Yes         65101.0                 124.0      187   
1   > 2 Years            Yes         58911.0                  26.0      288   
2    < 1 Year             No         38043.0                 152.0      254   
3    1-2 Year            Yes          2630.0                 156.0       76   
4    1-2 Year             No         31951.0                 152.0      294   

   Response  
0         0  
1         1  
2         0  
3         0  
4         0  
        

In [65]:

# Separate the target from the predictors; 'id' is excluded from the features
y = train_data['Response']
X = train_data.drop(columns=['Response', 'id'])

# Keep the test ids aside for the submission file and drop them from the features
test_ids = test_data['id']
X_test = test_data.drop(columns=['id'])

# Group the feature columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [66]:
# Confirm that Region_Code and Age have the same dtypes in the train and test frames
print(
    train_data.dtypes[['Region_Code', 'Age']],
    test_data.dtypes[['Region_Code', 'Age']],
    sep="\n",
)

Region_Code    float64
Age              int64
dtype: object
Region_Code    float64
Age              int64
dtype: object


In [67]:
# Feature Engineering
# Shared min-max scaler. normalize_and_combine re-fits it for every column it
# processes, so no state is carried over from one column to the next.
scaler = MinMaxScaler()


def normalize_and_combine(df, reference=None):
    """Add min-max normalised 'Age'/'Vintage' columns and their sum to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Age' and 'Vintage' columns. It is modified in place.
    reference : pandas.DataFrame, optional
        Frame whose 'Age' and 'Vintage' columns provide the min/max used for
        scaling. Defaults to ``df`` itself, which reproduces the original
        single-argument behaviour. Pass the training frame when transforming
        test data so both frames share one scale; values outside the
        reference range then map outside [0, 1].

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'Age_Normalized', 'Vintage_Normalized'
        and 'Age_Vintage' (sum of the two normalised columns) added.
    """
    if reference is None:
        reference = df

    for col in ('Age', 'Vintage'):
        # Learn min/max from the reference frame, then apply them to df.
        fitted = scaler.fit(reference[[col]])
        # transform() returns an (n, 1) array; flatten it to a plain column.
        df[f'{col}_Normalized'] = fitted.transform(df[[col]]).ravel()

    df['Age_Vintage'] = df['Age_Normalized'] + df['Vintage_Normalized']

    return df


# Training data defines the scale
train_data = normalize_and_combine(train_data)

# Test data reuses the training min/max, so the same Age/Vintage value maps to
# the same normalised value in both frames (previously the scaler was re-fitted
# on the test frame, giving it a different scale).
test_data = normalize_and_combine(test_data, reference=train_data)

# NOTE(review): X, X_test and num_cols were derived from train_data/test_data
# in an earlier cell, before these columns were added, so the model trained
# below does not see Age_Normalized, Vintage_Normalized or Age_Vintage.
# Rebuild X / X_test / num_cols after this cell if the features should be used.

# Only the last expression of a cell is rendered: preview the transformed test data
test_data.head()




Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Age_Normalized,Vintage_Normalized,Age_Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228,0.0,0.754325,0.754325
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123,0.415385,0.391003,0.806388
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271,0.415385,0.903114,1.318499
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115,0.030769,0.363322,0.394091
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148,0.476923,0.477509,0.954432


### Preprocessing

In [68]:
# Preprocessing
# Numerical columns: fill missing values with the column mean, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])


### Defining Model

In [69]:
# Define the model: the shared preprocessor followed by an XGBoost classifier.
# `use_label_encoder` is deliberately not passed: the installed XGBoost ignores
# it and only emits 'Parameters: { "use_label_encoder" } are not used.' on fit.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(eval_metric='logloss', random_state=42))
])


### Training 

In [70]:
# Hold out 25% of the labelled rows for validation. Stratifying on y keeps the
# Response class ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [71]:
# Train the model
# Fits the whole pipeline (preprocessing steps and classifier) on the training
# split only; the validation split is not passed to fit.
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



### Validation and Test Predictions

In [72]:
# Score the held-out split: take column 1 of predict_proba (the probability of
# the second class) and evaluate it with ROC AUC against the true labels
y_valid_pred = model.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_pred)
print(f'Validation ROC AUC: {roc_auc}')

Validation ROC AUC: 0.8783662809895941


In [73]:
# Predict on the test set
# Keeps only column 1 of predict_proba, i.e. the probability of the second
# class (Response == 1, given the 0/1 labels seen in the training data).
test_predictions = model.predict_proba(X_test)[:, 1]

### Preparing Submission

In [74]:
# Build the submission table (test ids alongside their predicted probabilities)
# and write it to disk without the index column
submission = test_ids.to_frame(name='id').assign(Response=test_predictions)
submission.to_csv('C:/Users/matt/Downloads/submission4.csv', index=False)
print('Submission file created successfully!')

Submission file created successfully!


## Final Submission Score:
### 0.87866 - Top 40%