# Imports

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor

# Read Data

In [5]:
test_data = pd.read_csv(filepath_or_buffer='data/test.csv', nrows=100)
train_data = pd.read_csv(filepath_or_buffer='data/train.csv', nrows=100)

# Data Preprocessing/Transformations

## Explore

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    100 non-null    int64  
 1   Age                   99 non-null     float64
 2   Gender                100 non-null    object 
 3   Annual Income         94 non-null     float64
 4   Marital Status        99 non-null     object 
 5   Number of Dependents  92 non-null     float64
 6   Education Level       100 non-null    object 
 7   Occupation            65 non-null     object 
 8   Health Score          94 non-null     float64
 9   Location              100 non-null    object 
 10  Policy Type           100 non-null    object 
 11  Previous Claims       71 non-null     float64
 12  Vehicle Age           100 non-null    float64
 13  Credit Score          86 non-null     float64
 14  Insurance Duration    100 non-null    float64
 15  Policy Start Date     10

In [13]:
def cleanse_column_names(data):
    new_cols = []
    for col in data.columns:
        new_cols.append(col.lower().replace(' ', '_'))
    data.columns = new_cols
    return data.columns

In [41]:
cleanse_column_names(test_data)

Index(['id', 'age', 'gender', 'annual_income', 'marital_status',
       'number_of_dependents', 'education_level', 'occupation', 'health_score',
       'location', 'policy_type', 'previous_claims', 'vehicle_age',
       'credit_score', 'insurance_duration', 'policy_start_date',
       'customer_feedback', 'smoking_status', 'exercise_frequency',
       'property_type'],
      dtype='object')

In [78]:
def create_pipeline(numeric_features=None, ordinal_features=None, ordinal_categories=None, category_features=None):
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())   
    ])

    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OrdinalEncoder(categories=[ordinal_categories['exercise_frequency']], handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    category_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

    # date_transformer = Pipeline(steps=[
        
    # ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_features', numeric_transformer, numeric_features),
            ('ordinal_features', ordinal_transformer, ordinal_features),
            ('category_features', category_transformer, category_features)
        ])

    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor())
    ])
    
    return full_pipeline

In [60]:
train_data.drop('id', axis=1, inplace=True)
# test_data.drop('id', axis=1, inplace=True)

target = 'premium_amount'
X = train_data.drop(target, axis=1)
y = train_data[target]

y_log = np.log1p(y)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.20, random_state=42)

In [79]:
numeric_features = X.select_dtypes(include='number').columns

category_features = [
    'marital_status',
    'occupation',
    'location',
    'property_type',
    'education_level',
    'policy_type',
    'customer_feedback',
    'gender',
    'smoking_status'
]

ordinal_features = ['exercise_frequency']

ordinal_categories = {
    'exercise_frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily']
}

# date_features = ['policy_start_date']

## Predict

In [96]:
pipeline = create_pipeline(
    numeric_features=numeric_features, 
    ordinal_features=ordinal_features,
    ordinal_categories=ordinal_categories,
    category_features=category_features
    )

pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 24
[LightGBM] [Info] Start training from score 6.728474




In [97]:
y_pred

array([1180.82803303, 1343.24117961,  719.01016449,  602.09315287,
       1147.34713159, 1786.7824873 , 1864.15753803, 1297.62551329,
       1902.16561026, 1430.9672096 ,  928.46568673,  668.1018238 ,
       1293.96045544, 2061.61557342, 1318.09752964, 2998.66998756,
       2355.83954114,  742.77799442, 1440.53719903,  488.69475879])

# Submission

In [82]:
sample_submission = pd.read_csv('data/sample_submission.csv')

In [98]:
sample_submission['Premium Amount'][:20] = y_pred
sample_submission.to_csv('data/submission.csv', index=False)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  sample_submission['Premium Amount'][:20] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_submis