In [458]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [459]:
# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train['Policy Start Date'] = pd.to_datetime(train['Policy Start Date'])
test['Policy Start Date'] = pd.to_datetime(test['Policy Start Date'])

In [460]:
# Extract date features
for df in [train, test]:
    # Extract date components
    df['Policy_Month'] = df['Policy Start Date'].dt.month
    df['Policy_Year'] = df['Policy Start Date'].dt.year
    df['Policy_Day'] = df['Policy Start Date'].dt.day
    df['Policy_DayOfWeek'] = df['Policy Start Date'].dt.dayofweek

    # Apply sine and cosine transformations
    df['Policy_Month_sin'] = np.sin(2 * np.pi * df['Policy_Month'] / 12)
    df['Policy_Month_cos'] = np.cos(2 * np.pi * df['Policy_Month'] / 12)

    df['Policy_Year_sin'] = np.sin(2 * np.pi * (df['Policy_Year'] - df['Policy_Year'].min()) / (df['Policy_Year'].max() - df['Policy_Year'].min()))
    df['Policy_Year_cos'] = np.cos(2 * np.pi * (df['Policy_Year'] - df['Policy_Year'].min()) / (df['Policy_Year'].max() - df['Policy_Year'].min()))


    df['Policy_DayOfWeek_sin'] = np.sin(2 * np.pi * df['Policy_DayOfWeek'] / 7)
    df['Policy_DayOfWeek_cos'] = np.cos(2 * np.pi * df['Policy_DayOfWeek'] / 7)

In [461]:
# 2. Handle categorical variables
categorical_columns = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 
                      'Location', 'Policy Type', 'Customer Feedback', 'Smoking Status', 
                      'Exercise Frequency', 'Property Type']

# Label encode categorical variables
le_dict = {}
for col in categorical_columns:
    le_dict[col] = LabelEncoder()
    if col in train.columns:
        train[col] = train[col].fillna('Missing')
        test[col] = test[col].fillna('Missing')
        le_dict[col].fit(list(train[col].unique()) + list(test[col].unique()))
        train[col] = le_dict[col].transform(train[col])
        test[col] = le_dict[col].transform(test[col])


In [462]:
# 3. Handle numerical variables
numerical_columns = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score',
                    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']

# Fill missing values with median
for col in numerical_columns:
    median_val = train[col].median()
    train[col] = train[col].fillna(median_val)
    test[col] = test[col].fillna(median_val)
    
# Scale numerical features
scaler = StandardScaler()
train[numerical_columns] = scaler.fit_transform(train[numerical_columns])
test[numerical_columns] = scaler.transform(test[numerical_columns])

In [463]:
# Add new features to the training dataset
train['Previous Claims / Age'] = train['Previous Claims'] / (train['Age'] + 1)  # Avoid division by zero
train['Previous Claims^2'] = train['Previous Claims']**2
train['Age * Previous Claims'] = train['Age'] * train['Previous Claims']
train['Previous Claims / Number of Dependents'] = train['Previous Claims'] / (train['Number of Dependents'] + 1)  # Avoid division by zero
train['Number of Dependents * Previous Claims'] = train['Number of Dependents'] * train['Previous Claims']
train['Health Score * Previous Claims'] = train['Health Score'] * train['Previous Claims']
train['Previous Claims / Health Score'] = train['Previous Claims'] / (train['Health Score'] + 1)  # Avoid division by zero
train['Credit Score / Previous Claims'] = train['Credit Score'] / (train['Previous Claims'] + 1)  # Avoid division by zero
train['Insurance Duration * Previous Claims'] = train['Insurance Duration'] * train['Previous Claims']
train['Health Score * Credit Score * Previous Claims'] = train['Health Score'] * train['Credit Score'] * train['Previous Claims']
train['Annual Income * Previous Claims'] = train['Annual Income'] * train['Previous Claims']
train['Smoking_Health_Interaction'] = (train['Smoking Status']+1) * train['Health Score']**2
train['Smoking_Age_Health_Interaction'] = (train['Smoking Status']+1) * train['Health Score']**2 / train['Age']**(1/2)
 
# Add new features to the testing dataset
test['Previous Claims / Age'] = test['Previous Claims'] / (test['Age'] + 1)  # Avoid division by zero
test['Previous Claims^2'] = test['Previous Claims']**2
test['Age * Previous Claims'] = test['Age'] * test['Previous Claims']
test['Previous Claims / Number of Dependents'] = test['Previous Claims'] / (test['Number of Dependents'] + 1)  # Avoid division by zero
test['Number of Dependents * Previous Claims'] = test['Number of Dependents'] * test['Previous Claims']
test['Health Score * Previous Claims'] = test['Health Score'] * test['Previous Claims']
test['Previous Claims / Health Score'] = test['Previous Claims'] / (test['Health Score'] + 1)  # Avoid division by zero
test['Credit Score / Previous Claims'] = test['Credit Score'] / (test['Previous Claims'] + 1)  # Avoid division by zero
test['Insurance Duration * Previous Claims'] = test['Insurance Duration'] * test['Previous Claims']
test['Health Score * Credit Score * Previous Claims'] = test['Health Score'] * test['Credit Score'] * test['Previous Claims']
test['Annual Income * Previous Claims'] = test['Annual Income'] * test['Previous Claims']
test['Smoking_Health_Interaction'] = (test['Smoking Status']+1) * test['Health Score']**2
test['Smoking_Age_Health_Interaction'] = (test['Smoking Status']+1) * test['Health Score']**2 / test['Age']**(1/2)

In [464]:
# Prepare features and target
# Add all new features to the features list
new_features = [
    'Previous Claims / Age',
    'Previous Claims^2',
    'Age * Previous Claims',
    'Previous Claims / Number of Dependents',
    'Number of Dependents * Previous Claims',
    'Health Score * Previous Claims',
    'Previous Claims / Health Score',
    'Credit Score / Previous Claims',
    'Insurance Duration * Previous Claims',
    'Health Score * Credit Score * Previous Claims',
    'Annual Income * Previous Claims',
    'Smoking_Health_Interaction',
    'Smoking_Age_Health_Interaction',
]

chronological_features = ['Policy_Month', 'Policy_Month_sin', 'Policy_Month_cos',
    'Policy_Year', 'Policy_Year_sin', 'Policy_Year_cos',
    'Policy_Day',
    'Policy_DayOfWeek', 'Policy_DayOfWeek_sin', 'Policy_DayOfWeek_cos']

features = numerical_columns + categorical_columns + chronological_features + new_features
X = train[features]
y = np.log1p(train['Premium Amount'])  # Log transform target for better distribution

In [465]:
X

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Gender,Marital Status,...,Previous Claims / Number of Dependents,Number of Dependents * Previous Claims,Health Score * Previous Claims,Previous Claims / Health Score,Credit Score / Previous Claims,Insurance Duration * Previous Claims,Health Score * Credit Score * Previous Claims,Annual Income * Previous Claims,Smoking_Health_Interaction,Smoking_Age_Health_Interaction
0,-1.648301,-0.707414,-0.746862,-0.249610,1.216739,1.286338,-1.567375,-0.007023,0,1,...,4.806619,-0.908736,-0.303710,1.621475,-0.707063,-0.008545,0.476027,-0.860738,0.062305,
1,-0.159542,-0.023289,0.733500,-0.844110,-0.002284,0.420713,0.714630,-1.163391,0,0,...,-0.001317,-0.001675,0.001928,-0.014649,0.716266,0.002657,0.001378,0.000053,1.425044,
2,-1.350549,-0.215473,0.733500,1.829212,-0.002284,0.766963,0.013020,-0.777935,1,0,...,-0.001317,-0.001675,-0.004177,-0.000807,0.013050,0.001777,-0.000054,0.000492,6.692034,
3,-1.499425,3.461605,-0.006681,-1.235840,-0.002284,-1.656788,-1.602810,-1.548847,1,1,...,-0.002299,0.000015,0.002822,0.009683,-1.606478,0.003537,-0.004523,-0.007905,3.054600,
4,-1.499425,0.228896,-0.746862,-0.437598,-1.221307,-0.271787,0.034281,-0.392479,1,3,...,-4.824661,0.912147,0.534441,-2.171591,-0.154902,0.479337,0.018321,-0.279552,0.382984,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,-0.382856,-0.161259,-1.487043,-0.996082,-0.002284,-0.791163,-1.567375,-0.777935,0,1,...,0.004689,0.003396,0.002275,-0.582804,-1.570962,0.001777,-0.003565,0.000368,0.992179,
1199996,0.957026,0.106646,-0.006681,-1.189716,-0.002284,0.074463,0.027194,-0.392479,1,0,...,-0.002299,0.000015,0.002717,0.012037,0.027256,0.000896,0.000074,-0.000244,1.415425,1.446855
1199997,-1.648301,0.615826,-1.487043,-0.915601,-1.221307,1.632588,0.013020,0.378433,1,0,...,2.507597,1.816135,1.118229,-14.470555,-0.058832,-0.462183,0.014559,-0.752112,0.838324,
1199998,1.031464,-0.268959,-0.746862,-0.592267,-0.002284,-0.444912,-1.319331,-0.392479,1,3,...,-0.009021,0.001706,0.001353,-0.005601,-1.322351,0.000896,-0.001784,0.000614,0.350780,0.345388


In [None]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Define the parameter distribution
param_distributions = {
    'max_depth': [3, 5, 6, 7, 9],              # Tree depth
    'learning_rate': np.linspace(0.01, 0.2, 10),  # Learning rate
    'n_estimators': [100, 200, 500, 1000],    # Number of trees
    'min_child_weight': [1, 3, 5],            # Minimum child weight
    'subsample': [0.6, 0.8, 1.0],             # Fraction of samples used for each tree
    'colsample_bytree': [0.6, 0.8, 1.0],      # Fraction of features used for each tree
    'reg_alpha': [0, 0.1, 0.5, 1.0],          # L1 regularization term
    'reg_lambda': [0.5, 1.0, 2.0]             # L2 regularization term
}

# Initialize the XGBoost model
xgb_model = XGBRegressor(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=50,  # Number of random combinations to try
    scoring='r2',  # Metric for evaluation
    cv=5,  # 5-fold cross-validation
    verbose=1,  # Print progress
    random_state=42,  # Reproducibility
    n_jobs=-1  # Use all available CPU cores
)

# Perform the search
print("Starting Randomized Search...")
random_search.fit(X_train, y_train)
print("Randomized Search completed.")

# Best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("\nBest Hyperparameters:")
print(best_params)

# Evaluate the best model on the validation set
y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("\nValidation Results:")
print('Mean Squared Error:', mse)
print('R² Score:', r2)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Use the best parameters from the search
best_params = {
    'subsample': 1.0,
    'reg_lambda': 2.0,
    'reg_alpha': 0.5,
    'n_estimators': 100,
    'min_child_weight': 3,
    'max_depth': 9,
    'learning_rate': 0.03111111111111111,
    'colsample_bytree': 0.8,
}

# Initialize the final model
final_model = XGBRegressor(**best_params, random_state=42)

# Train the model on the training data
final_model.fit(X_train, y_train)

# Evaluate on validation set
y_val_pred = final_model.predict(X_val)

# Calculate performance metrics
final_mse = mean_squared_error(y_val, y_val_pred)
final_r2 = r2_score(y_val, y_val_pred)

# Print the results
print("Final Model Performance on Validation Set:")
print(f"Mean Squared Error (MSE): {final_mse}")
print(f"R² Score: {final_r2}")

In [None]:
X_test = test[features]
predictions = final_model.predict(X_test)

In [None]:
df = pd.DataFrame({
    'id':test['id'],
    'premium amount': np.expm1(predictions)
})

In [None]:
df

In [None]:
df.to_csv('predictions.csv', index = False)