In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


# 1) Data Cleaning

## Configuring training and test data

In [2]:
# Training set: pull the target column out as `y`, then discard the row identifier
train_data = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')
y = train_data.pop('Premium Amount')
train_data = train_data.drop(columns='id')

# Test set: keep the ids aside for the submission file, then discard them from the features
test_data = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv')
test_ids = test_data['id']
test_data = test_data.drop(columns='id')

# helper to inspect the training rows that are missing a value in one column
def checknan(colname):
    """Return the rows of the global ``train_data`` whose ``colname`` value is NaN."""
    missing_mask = train_data[colname].isna()
    return train_data.loc[missing_mask]

# function to fillna with either mean, median, mode, 0, or another column
def fillNaN(colnamelist, type, othercol=None, datasetlist=None):
    """Fill missing values in the given columns of every dataset, in place.

    Parameters
    ----------
    colnamelist : str or iterable of str
        Column name(s) whose NaNs are filled. A single string is treated as
        one column name rather than being iterated character by character.
    type : {'mean', 'median', 'mode', 0, 'othercol'}
        Fill strategy. 'mean', 'median' and 'mode' use the statistic of the
        global ``train_data`` column, so every dataset receives the value
        learned from the training data. ``0`` fills with the constant 0.
        'othercol' fills from the column ``othercol`` of the dataset that is
        being filled.
    othercol : str, optional
        Source column for the 'othercol' strategy.
    datasetlist : list of DataFrame, optional
        Frames to fill. When omitted, the *current* global ``train_data`` and
        ``test_data`` are used. They are looked up at call time, so frames
        that were re-assigned after this function was defined are honoured.

    Raises
    ------
    ValueError
        If ``type`` is not a supported strategy, or 'othercol' is requested
        without ``othercol``.
    """
    # Resolve the default here instead of in the signature: a default list is
    # evaluated once at definition time and would keep pointing at stale frames.
    if datasetlist is None:
        datasetlist = [train_data, test_data]

    if isinstance(colnamelist, str):
        colnamelist = [colnamelist]
    columns = list(colnamelist)

    # Compute each fill value once, before any dataset is modified.
    if type == 'mean':
        fill_values = {col: train_data[col].mean() for col in columns}
    elif type == 'median':
        fill_values = {col: train_data[col].median() for col in columns}
    elif type == 'mode':
        fill_values = {col: train_data[col].mode()[0] for col in columns}
    elif type == 0:
        fill_values = {col: 0 for col in columns}
    elif type == 'othercol':
        if othercol is None:
            raise ValueError("fillNaN: type 'othercol' requires the othercol argument")
        fill_values = None  # values come from each dataset's own column, see below
    else:
        raise ValueError(f'fillNaN: unsupported fill type {type!r}')

    for dataset in datasetlist:
        if fill_values is None:
            # Use the dataset's own `othercol`, not train_data's, so rows of
            # other frames are not filled with unrelated training rows.
            for colname in columns:
                dataset.fillna({colname: dataset[othercol]}, inplace=True)
        else:
            dataset.fillna(fill_values, inplace=True)
    print(f'fillNaN [{type}] complete')

# nan_cols = train_data.columns[train_data.isna().any()]
# for col in nan_cols:
#     print(f'- {col}')

# columns grouped by the statistic used to impute their missing values
meanfillna = ['Age', 'Number of Dependents', 'Health Score']
medianfillna = ['Annual Income', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
modefillna = ['Marital Status', 'Occupation', 'Previous Claims', 'Customer Feedback']

# run the three imputations in the order mean -> median -> mode
imputation_plan = (
    (meanfillna, 'mean'),
    (medianfillna, 'median'),
    (modefillna, 'mode'),
)
for columns_to_fill, strategy in imputation_plan:
    fillNaN(columns_to_fill, strategy)

# processing the values in the 'Policy Start Date', 'Gender', 'Customer Feedback', and 'Smoking Status' columns
# starting with 'Policy Start Date': parse it, keep the year and the month, then drop the raw column
def _split_policy_start_date(frame):
    """Add 'start year' and 'start month' from 'Policy Start Date', then return the frame without that column."""
    frame['Policy Start Date'] = pd.to_datetime(frame['Policy Start Date'])
    start_dates = frame['Policy Start Date'].dt
    frame['start year'] = start_dates.year
    frame['start month'] = start_dates.month
    return frame.drop(columns=['Policy Start Date'])

train_data = _split_policy_start_date(train_data)
test_data = _split_policy_start_date(test_data)

# month number (1-12) -> three-letter abbreviation
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthmap = dict(enumerate(month_labels, start=1))

# replace the numeric month with its label in both datasets
for frame in (train_data, test_data):
    frame['start month'] = frame['start month'].map(monthmap)

# on to 'Gender'
gendermap = {'Male': 1, 'Female': 0}
# Customer Feedback
feedbackmap = {'Poor': 1, 'Average': 2, 'Good': 3}
# Smoking Status
smokingmap = {'Yes': 1, 'No': 0}

# apply each label -> number mapping to the same column of both datasets
column_encodings = {
    'Gender': gendermap,
    'Customer Feedback': feedbackmap,
    'Smoking Status': smokingmap,
}
for column, encoding in column_encodings.items():
    for frame in (train_data, test_data):
        frame[column] = frame[column].map(encoding)

# row counts of the training and test sets
for frame in (train_data, test_data):
    print(len(frame))

# prepping data to go into machine learning model
from sklearn.model_selection import train_test_split

# one-hot encode the categorical columns of the training features
train_data = pd.get_dummies(train_data)

# splitting train_data into train and mock test data (40% held out, fixed seed)
mock_split = train_test_split(train_data, y, test_size=0.4, random_state=0)
X_train, X_mocktest, y_train, y_mocktest = mock_split
print(X_train.head())
print(len(X_train))

# number of distinct values per column, training set next to test set
unique_counts = [frame.nunique() for frame in (train_data, test_data)]
print(pd.concat(unique_counts, axis=1))

fillNaN [mean] complete
fillNaN [median] complete
fillNaN [mode] complete
1200000
800000
          Age  Gender  Annual Income  Number of Dependents  Health Score  \
920871   57.0       1        58090.0              2.009934     19.937223   
1037581  64.0       1        19768.0              1.000000     12.836510   
841096   38.0       1          617.0              1.000000     41.612676   
640872   25.0       0        16155.0              2.009934     16.098472   
295534   20.0       1        31028.0              0.000000     20.848113   

         Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \
920871               0.0          5.0         469.0                 7.0   
1037581              2.0         18.0         335.0                 8.0   
841096               0.0         18.0         495.0                 8.0   
640872               2.0         12.0         823.0                 2.0   
295534               0.0         19.0         748.0                 5.0   

   

# 2) Model Configuration

## 1. Linear Regressor

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error as msle

# fit an ordinary least-squares baseline on the training split (fit returns the estimator itself)
lrmodel = LinearRegression().fit(X_train, y_train)
# predictions for the held-out mock test split
mockpreds = lrmodel.predict(X_mocktest)

def rmsle(actual_y, predicted_y):
    """Return the square root of ``msle(actual_y, predicted_y)``.

    ``msle`` is sklearn's ``mean_squared_log_error``, so the result is the
    root mean squared logarithmic error of the predictions.
    """
    squared_log_error = msle(actual_y, predicted_y)
    return np.sqrt(squared_log_error)

# score of the baseline on the mock test split
mock_score = rmsle(y_mocktest, mockpreds)
print(mock_score)

# deleting objects after use to prevent ram overload
del X_train, y_train
del X_mocktest, y_mocktest, mockpreds

1.1691549763145526


## 2. Deep Neural Network with TensorFlow (tbc)

In [4]:
import tensorflow as tf

# Submission

In [5]:
# refit the model on the full training set before predicting the competition test set
lrmodel.fit(train_data, y)

# prepping test_data for the model
test_data = pd.get_dummies(test_data)
# train and test were one-hot encoded separately, so their dummy columns are
# not guaranteed to match. Align the test frame to the exact columns (and
# order) the model was fitted on: categories missing from the test set become
# all-zero columns, and columns unseen during training are dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)
preds = lrmodel.predict(test_data)

# creating submission Dataframe
output = pd.DataFrame({'id': test_ids, 'Premium Amount': preds})
print(output)
output.to_csv('submission.csv', index=False)
print('success yahuu')

             id  Premium Amount
0       1200000     1068.111236
1       1200001     1035.583344
2       1200002     1024.444304
3       1200003     1064.946453
4       1200004     1052.928415
...         ...             ...
799995  1999995     1099.988177
799996  1999996     1110.076971
799997  1999997     1082.989758
799998  1999998     1153.321974
799999  1999999     1069.412769

[800000 rows x 2 columns]
success yahuu
