In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


In [2]:
# Load the competition training set into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s4e12/train.csv',
)

In [4]:
# Feature engineering implementation

# 1. Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_log_error
from sklearn.cluster import KMeans
from datetime import datetime

# 2. Create new features
# `assign` returns a new DataFrame, so `train_data` itself is left untouched.
# The keyword order below fixes the order in which the columns are appended.
data = train_data.assign(
    # (1) Annual income divided by (number of dependents + 1).
    Income_per_Dependent=lambda df: df['Annual Income'] / (df['Number of Dependents'] + 1),
    # (2) Income bands: (0, 30000] -> Low, (30000, 60000] -> Medium, above -> High.
    Income_Category=lambda df: pd.cut(
        df['Annual Income'],
        bins=[0, 30000, 60000, np.inf],
        labels=['Low', 'Medium', 'High'],
    ),
    # (3) Health score relative to age.
    Health_per_Age=lambda df: df['Health Score'] / df['Age'],
    # (4) Marital status joined with the dependent count (missing count -> 0).
    Family_Type=lambda df: (
        df['Marital Status']
        + "_"
        + df['Number of Dependents'].fillna(0).astype(int).astype(str)
    ),
    # (5) Duration bands: (0, 3] -> Short, (3, 7] -> Medium, above -> Long.
    Insurance_Duration_Category=lambda df: pd.cut(
        df['Insurance Duration'],
        bins=[0, 3, 7, np.inf],
        labels=['Short', 'Medium', 'Long'],
    ),
    # (6) Exercise level (1-3) kept for non-smokers, zeroed for everyone else.
    # NOTE(review): any 'Exercise Frequency' value absent from the mapping
    # becomes NaN -- confirm the mapping covers every category in the data.
    Lifestyle_Index=lambda df: (
        df['Exercise Frequency'].map({'Rarely': 1, 'Monthly': 2, 'Daily': 3})
        * (df['Smoking Status'] == 'No').astype(int)
    ),
)

# (7) Regional indicators: replace 'Location' with one dummy column per value.
data = pd.get_dummies(data, columns=['Location'], prefix='Location')

# (8) Extract time of policy start from 'Policy Start Date'
# Extract the hour from the policy start date to analyze contract timing.
def extract_hour(date_str):
    """Return the hour component of a 'DATE HH:MM[:SS...]' string.

    Parameters
    ----------
    date_str : object
        Expected to be a string whose second space-separated token starts
        with the hour followed by ':'.

    Returns
    -------
    int
        The parsed hour, or -1 when the value is not a string (e.g. NaN or
        None for a missing date) or does not match the expected layout.
    """
    # Missing dates reach this function as float NaN / None, which have no
    # .split(); treat them like any other unparseable value.
    if not isinstance(date_str, str):
        return -1
    try:
        # Second token is the time of day; its first ':'-separated field is the hour.
        return int(date_str.split(' ')[1].split(':')[0])
    except (IndexError, ValueError):
        # No time part, or a non-numeric hour field.
        return -1

# Apply the function to extract hour
data['Policy_Hour'] = data['Policy Start Date'].apply(extract_hour)

# (9) Customer grouping using K-means clustering
# Cluster on customer attributes only. 'id' is just a row identifier, and
# 'Premium Amount' is the prediction target: clustering on it would leak the
# target into the 'Customer_Group' feature the model is later trained on.
numeric_cols = (
    data.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(['id', 'Premium Amount'], errors='ignore')
)
scaler = StandardScaler()
# Missing values are replaced with 0 before scaling so K-means gets a dense matrix.
scaled_data = scaler.fit_transform(data[numeric_cols].fillna(0))

# Apply K-means clustering to group customers into five clusters.
kmeans = KMeans(n_clusters=5, random_state=42)
data['Customer_Group'] = kmeans.fit_predict(scaled_data)

# (10) Interaction terms for health and exercise
# Health score weighted by an ordinal exercise level (1-3).
# NOTE(review): any 'Exercise Frequency' value absent from the mapping becomes
# NaN -- confirm the mapping covers every category present in the data.
data['Health_Exercise_Interaction'] = data['Health Score'] * data['Exercise Frequency'].map({'Rarely': 1, 'Monthly': 2, 'Daily': 3})

# 3. Prepare data for CatBoost
# Rows without a target cannot be used for supervised training.
data = data.dropna(subset=['Premium Amount'])
y = data['Premium Amount']
# Features: everything except the identifier, the target and the raw date string.
X = data.drop(columns=['id', 'Premium Amount', 'Policy Start Date'])

# Missing values in text columns become the explicit label 'Unknown'.
X = X.fillna({name: 'Unknown' for name in X.select_dtypes(include=['object']).columns})

# Binned (category) and text columns are all stored as plain strings.
X = X.astype({name: str for name in X.select_dtypes(include=['category', 'object']).columns})

# Every object-typed column is passed to CatBoost as a categorical feature.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits in CatBoost Pool objects.
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)

# 4. Train CatBoost Regressor
catboost_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=100  # log training progress every 100 iterations
)

catboost_model.fit(train_pool)

# 5. Evaluate the model using RMSLE
# The regressor's output is not constrained to be non-negative, while
# mean_squared_log_error rejects negative values; clip at 0 so the metric can
# always be computed.
y_pred = np.clip(catboost_model.predict(test_pool), 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(f"RMSLE: {rmsle}")

# Collect feature importances into a table, most important first
feature_importances = catboost_model.get_feature_importance()
important_features = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)




0:	learn: 863.8426139	total: 966ms	remaining: 16m 5s
100:	learn: 845.4080936	total: 1m 24s	remaining: 12m 28s


KeyboardInterrupt: 

In [5]:
# Display the first ten rows of the feature-importance table.
important_features.head(10)

NameError: name 'important_features' is not defined

In [9]:
# Load the competition test set into a DataFrame.
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s4e12/test.csv',
)

In [8]:
# Print column names, non-null counts and dtypes of the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    800000 non-null  int64  
 1   Age                   787511 non-null  float64
 2   Gender                800000 non-null  object 
 3   Annual Income         770140 non-null  float64
 4   Marital Status        787664 non-null  object 
 5   Number of Dependents  726870 non-null  float64
 6   Education Level       800000 non-null  object 
 7   Occupation            560875 non-null  object 
 8   Health Score          750551 non-null  float64
 9   Location              800000 non-null  object 
 10  Policy Type           800000 non-null  object 
 11  Previous Claims       557198 non-null  float64
 12  Vehicle Age           799997 non-null  float64
 13  Credit Score          708549 non-null  float64
 14  Insurance Duration    799998 non-null  float64
 15  

In [16]:
# Add the engineered columns to the test frame. The keyword order below fixes
# the order in which the columns are appended.
test = test.assign(
    # (1) Annual income divided by (number of dependents + 1).
    Income_per_Dependent=lambda df: df['Annual Income'] / (df['Number of Dependents'] + 1),
    # (2) Income bands: (0, 30000] -> Low, (30000, 60000] -> Medium, above -> High.
    Income_Category=lambda df: pd.cut(
        df['Annual Income'],
        bins=[0, 30000, 60000, np.inf],
        labels=['Low', 'Medium', 'High'],
    ),
    # (3) Health score relative to age.
    Health_per_Age=lambda df: df['Health Score'] / df['Age'],
    # (4) Marital status joined with the dependent count (missing count -> 0).
    Family_Type=lambda df: (
        df['Marital Status']
        + "_"
        + df['Number of Dependents'].fillna(0).astype(int).astype(str)
    ),
    # (5) Duration bands: (0, 3] -> Short, (3, 7] -> Medium, above -> Long.
    Insurance_Duration_Category=lambda df: pd.cut(
        df['Insurance Duration'],
        bins=[0, 3, 7, np.inf],
        labels=['Short', 'Medium', 'Long'],
    ),
    # (6) Exercise level (1-3) kept for non-smokers, zeroed for everyone else.
    # NOTE(review): any 'Exercise Frequency' value absent from the mapping
    # becomes NaN -- confirm the mapping covers every category in the data.
    Lifestyle_Index=lambda df: (
        df['Exercise Frequency'].map({'Rarely': 1, 'Monthly': 2, 'Daily': 3})
        * (df['Smoking Status'] == 'No').astype(int)
    ),
)

# (7) Regional indicators: replace 'Location' with one dummy column per value.
test = pd.get_dummies(test, columns=['Location'], prefix='Location')

# (8) Extract time of policy start from 'Policy Start Date'
# Extract the hour from the policy start date to analyze contract timing.
def extract_hour(date_str):
    """Return the hour component of a 'DATE HH:MM[:SS...]' string.

    Parameters
    ----------
    date_str : object
        Expected to be a string whose second space-separated token starts
        with the hour followed by ':'.

    Returns
    -------
    int
        The parsed hour, or -1 when the value is not a string (e.g. NaN or
        None for a missing date) or does not match the expected layout.
    """
    # Missing dates reach this function as float NaN / None, which have no
    # .split(); treat them like any other unparseable value.
    if not isinstance(date_str, str):
        return -1
    try:
        # Second token is the time of day; its first ':'-separated field is the hour.
        return int(date_str.split(' ')[1].split(':')[0])
    except (IndexError, ValueError):
        # No time part, or a non-numeric hour field.
        return -1

# Apply the function to extract hour
test['Policy_Hour'] = test['Policy Start Date'].apply(extract_hour)

# (9) Customer grouping using K-means clustering
# Cluster on customer attributes only: 'id' is just a row identifier, and a
# target column (if one is ever present) must not drive the grouping.
numeric_cols = (
    test.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(['id', 'Premium Amount'], errors='ignore')
)
scaler = StandardScaler()
# Missing values are replaced with 0 before scaling so K-means gets a dense matrix.
scaled_data = scaler.fit_transform(test[numeric_cols].fillna(0))

# Apply K-means clustering to group customers into five clusters.
# NOTE(review): this scaler and K-means are fitted on the test rows only (and
# rebind the names `scaler` / `kmeans`), so the resulting cluster ids are not
# guaranteed to mean the same thing as ids produced by a fit on other rows,
# e.g. the training data. Consider reusing the training-time fit via
# transform()/predict() -- confirm the intended behaviour.
kmeans = KMeans(n_clusters=5, random_state=42)
test['Customer_Group'] = kmeans.fit_predict(scaled_data)

# (10) Interaction terms for health and exercise
# Health score weighted by an ordinal exercise level (1-3).
# NOTE(review): any 'Exercise Frequency' value absent from the mapping becomes
# NaN -- confirm the mapping covers every category present in the data.
test['Health_Exercise_Interaction'] = test['Health Score'] * test['Exercise Frequency'].map({'Rarely': 1, 'Monthly': 2, 'Daily': 3})



In [15]:
# Exclude unnecessary columns
test_X = test.drop(columns=['id', 'Policy Start Date'], errors='ignore')

# Handle missing values in categorical features by replacing NaN with 'Unknown'
for col in test_X.select_dtypes(include=['object']).columns:
    test_X[col] = test_X[col].fillna('Unknown').astype(str)

# Numeric NaNs are deliberately left as they are: the training features keep
# their numeric NaNs (only text/category columns are filled there), so filling
# with 0 here would encode "missing" differently from what the model saw.

# Ensure test_X has the same features, in the same order, as the training set.
# Columns the test data lacks (e.g. a dummy column for a value that never
# occurs in it) are added with the default value 0.
for col in X_train.columns:
    if col not in test_X.columns:
        test_X[col] = 0
test_X = test_X[X_train.columns]

# Categorical columns must be strings, exactly as in the training features
test_X = test_X.astype({col: str for col in categorical_features})

# Predict, declaring the categorical columns explicitly through a Pool
premium_predictions = catboost_model.predict(
    Pool(test_X, cat_features=categorical_features)
)

# Add predictions to the test dataset
test['Predicted_Premium_Amount'] = premium_predictions

Error during prediction: There is no trained model to use predict(). Use fit() to train model. Then use this method.


CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.

In [10]:
# Load the sample submission file as the template for the final submission.
submission = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s4e12/sample_submission.csv',
)

In [11]:
# Preview the first rows of the submission template.
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,1102.545
1,1200001,1102.545
2,1200002,1102.545
3,1200003,1102.545
4,1200004,1102.545


In [12]:
# Predictions are written by position, so first make sure the submission rows
# are in the same id order as the test rows the predictions were made for.
assert np.array_equal(submission['id'].to_numpy(), test['id'].to_numpy()), \
    "submission ids do not line up with test ids"
submission['Premium Amount'] = premium_predictions

NameError: name 'premium_predictions' is not defined

In [14]:
# Write the submission to the Kaggle working directory without the index column.
submission.to_csv(
    path_or_buf='/kaggle/working/submission.csv',
    index=False,
)