## Table of contents:

1. [Importing libraries](#Libraries)
2. [Loading data](#Data)
3. [Feature engineering](#Engineering)
4. [Cross validation](#CrossValidation)
5. [Modelling](#Modelling)
6. [Making predictions](#Predictions)
7. [What I didn't do](#mistakes)
8. [What hindered me](#hinder)

<a name="Libraries"></a>
## 1. Importing libraries


In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
# Show floats with five decimal places when pandas objects are displayed
pd.options.display.float_format = '{:.5f}'.format

# Render matplotlib figures inline in the notebook output
%matplotlib inline
import warnings
# Suppress all warnings for the rest of the session (this also hides
# deprecation and performance warnings raised by the cells below)
warnings.filterwarnings('ignore')

<a name="Data"></a>
## 2. Loading data

In [2]:
# Read the four competition CSV files from the working directory
train, policies, test, samplesubmission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'policies.csv', 'test.csv', 'SampleSubmission.csv')
)

# Show the first rows of the training data
train.head()

Unnamed: 0,claim_id,policy_number,claim_number,sum_insured,product,agent,class_of_business,risk_type,loss_date,client_type,renewal_frequency,primary_cause,secondary_cause,branch,target
0,ID_JZ7QWRHH57RO,pol000009680,claim0011359,300000.0,prod00027,ag00068,cob00031,rt00006,2021-05-13,ct0003,rf0001,pc0007,sc00022,br00006,82150.0
1,ID_KQ5TJMVLJ4EP,pol000006869,claim0005272,16000.0,prod00005,ag00226,cob00010,rt00032,2018-05-13,ct0003,rf0001,pc0007,sc00023,br00002,3780.24
2,ID_NYHI7WJGGIAE,pol000006135,claim0004147,3000.0,prod00029,ag00037,cob00010,rt00026,2018-08-18,ct0003,rf0001,pc0007,sc00024,br00002,2196.5
3,ID_ROMMDCXYUXN5,pol000008991,claim0009962,6000.0,prod00005,ag00037,cob00009,rt00032,2018-12-20,ct0003,rf0001,pc0007,sc00021,br00001,105.67
4,ID_2OTD9NX8L73D,pol000007768,claim0007346,26850.0,prod00015,ag00226,cob00034,rt00023,2018-11-07,ct0001,rf0001,pc0007,sc00021,br00001,1605.64


<a name="Engineering"></a>
## 3. Feature engineering

In [3]:
# Stack train and test so feature engineering is applied to both at once.
# 'source' records where each row came from (0 = train, 1 = test) so the
# combined frame can be split apart again afterwards.
train['source'], test['source'] = 0, 1
all_data = pd.concat(objs=[train, test])

In [4]:
# Names of the categorical columns that get target-encoded and one-hot encoded
cat_columns = [
    'policy_number',
    'product',
    'agent',
    'class_of_business',
    'risk_type',
    'client_type',
    'renewal_frequency',
    'primary_cause',
    'secondary_cause',
    'branch',
]

In [5]:
# Target (mean) encoding: for every categorical column add an
# 'encoded<column>' feature holding the mean training target of that category.
# Categories that never occur in train get NaN from the left merge.
# NOTE(review): the means are computed on the full training set, so the
# cross-validation folds built later are scored on rows whose own targets
# contributed to their encoding; CV scores are therefore optimistic.
for cat_col in cat_columns:
    # Select 'target' before aggregating so that only the target is averaged;
    # averaging the whole frame fails on non-numeric columns in pandas >= 2.0.
    means = train.groupby(cat_col)['target'].mean()
    encoding = means.rename('encoded' + cat_col).reset_index()
    all_data = pd.merge(all_data, encoding, how='left', on=[cat_col])

In [6]:
# One-hot encode the categorical columns: each column listed in cat_columns
# is replaced by one indicator column per distinct value
all_data = pd.get_dummies(data=all_data, columns=cat_columns)


### Generating Datetime features

In [7]:
# Parse loss_date into datetimes; values that cannot be parsed become NaT
all_data['loss_date'] = pd.to_datetime(all_data['loss_date'], errors='coerce')

# Derive calendar features from the parsed date
loss_dt = all_data['loss_date'].dt
all_data['loss_date_day_of_week'] = loss_dt.dayofweek
all_data['loss_date_month'] = loss_dt.month
all_data['loss_date_year'] = loss_dt.year

all_data.shape

(13395, 5484)

##### As we can see above, our data now has 5,400+ columns, which is far too many. This is a result of the one-hot encoding: the categorical columns have too many unique values. That is why I also used target encoding for all `cat_columns`, and I will now drop the columns with a low standard deviation.

In [8]:
# Collect near-constant feature columns (standard deviation below 0.1).
# 'source' is left out of the candidates: it is the train/test marker that a
# later cell uses to split all_data again, so it must never be dropped here.
train_feats = all_data.columns.difference(
    ['claim_id', 'loss_date', 'target', 'claim_number', 'source']).tolist()
to_be_dropped = [col for col in train_feats if all_data[col].std() < 0.1]

In [9]:
# Remove the low-variance columns. Reassigning (instead of inplace=True) and
# ignoring already-missing labels keeps this cell safe to run more than once.
all_data = all_data.drop(columns=to_be_dropped, errors='ignore')
# Fill missing data with the sentinel value -999
all_data = all_data.fillna(-999)

In [10]:
# Model inputs: every column except identifiers, the raw date, the target
# and the train/test marker
non_feature_cols = ['claim_id', 'loss_date', 'target', 'claim_number', 'source']
train_feats = all_data.columns.difference(non_feature_cols).tolist()

# Undo the earlier concat: source == 0 marks train rows, source == 1 test rows
train_df = all_data.loc[all_data['source'] == 0]
test_df = all_data.loc[all_data['source'] == 1, train_feats]

In [11]:
# Feature matrix and target vector for the labelled (train) rows
X, y = train_df[train_feats], train_df['target']
# Hold out 30% of the labelled rows, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

<a name="CrossValidation"></a>
## 4.  Cross validation 

In [12]:
# my cross validation function
from sklearn.model_selection import KFold,GroupKFold
metric=mean_absolute_error
def cv(model,n_splits=5):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y`` and predict ``test_df``.

    For each shuffled K-fold split (seed 12) the model is refit on the
    training part, scored on the held-out part with ``metric`` and used to
    predict ``test_df``. The fold score and the running average are printed
    per fold, followed by the score of the out-of-fold predictions.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``test_df``: the mean of the per-fold
        test predictions.
    """
    # Convert both frames once, to the same float matrix type, so the model is
    # fitted and asked to predict on the same kind of input in every call.
    features = X.to_numpy(dtype=float)
    test_features = test_df.to_numpy(dtype=float)

    test_preds = np.zeros([test_features.shape[0], n_splits])
    error = np.zeros(n_splits)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=12)
    oof_val_preds = np.zeros(features.shape[0])
    for index, (train_indices, test_indices) in enumerate(kf.split(features, y)):
        train_x, test_x = features[train_indices], features[test_indices]
        train_y, test_y = y.iloc[train_indices], y.iloc[test_indices]
        model.fit(train_x, train_y)
        # Predict the held-out fold once and reuse it for the score and the
        # out-of-fold vector.
        fold_preds = model.predict(test_x)
        score = metric(test_y, fold_preds)
        error[index] = score
        oof_val_preds[test_indices] = fold_preds
        print(str(index+1)+'fold :'+str(score))
        print(str(index+1)+'average_fold :'+str(np.mean(error[:index+1])))
        test_preds[:, index] = model.predict(test_features)
    predictions = np.mean(test_preds, axis=1)
    print('oof_score :  '+str(metric(y, oof_val_preds)))
    return predictions

<a name="Modelling"></a>
## 5.  Modelling

In [13]:
from xgboost import XGBRegressor
# XGBoost regressor settings, kept in one dict so they are easy to read and tune
xgb_params = dict(
    n_estimators=300,
    max_depth=3,
    random_state=12,
    learning_rate=0.08,
    colsample_bytree=0.8,
    subsample=0.8,
)
xgb = XGBRegressor(**xgb_params)
# Cross-validate and keep the fold-averaged test predictions
xgb_preds = cv(xgb)

1fold :32804.88222118231
1average_fold :32804.88222118231
2fold :31581.77449265084
2average_fold :32193.328356916576
3fold :39344.767291681805
3average_fold :34577.141335171655
4fold :35917.20620992742
4average_fold :34912.1575538606
5fold :40439.9914099728
5average_fold :36017.724325083036
oof_score :  36017.381658476384


In [14]:
from catboost import CatBoostRegressor
# CatBoost regressor settings; the loss is MAE, the same metric cv() reports
cat_params = {
    'loss_function': 'MAE',
    'learning_rate': 0.08,
    'iterations': 650,
    'depth': 3,
    'subsample': 0.8,
    'bootstrap_type': 'Bernoulli',
    'colsample_bylevel': 0.8,
    'random_seed': 42,
    'verbose': 200,
}
cat = CatBoostRegressor(**cat_params)
# Cross-validate and keep the fold-averaged test predictions
cat_preds = cv(cat)

0:	learn: 41560.3072617	total: 89ms	remaining: 57.8s
200:	learn: 30972.9730829	total: 823ms	remaining: 1.84s
400:	learn: 28298.7821963	total: 1.5s	remaining: 930ms
600:	learn: 27632.4601833	total: 2.3s	remaining: 187ms
649:	learn: 27545.3269995	total: 2.45s	remaining: 0us
1fold :27543.29831117191
1average_fold :27543.29831117191
0:	learn: 42588.6884071	total: 9.37ms	remaining: 6.08s
200:	learn: 31214.8770063	total: 741ms	remaining: 1.66s
400:	learn: 29206.0403945	total: 1.41s	remaining: 877ms
600:	learn: 28625.1906629	total: 2.19s	remaining: 179ms
649:	learn: 28583.3124837	total: 2.34s	remaining: 0us
2fold :23483.30051791994
2average_fold :25513.299414545923
0:	learn: 39308.9529647	total: 9.08ms	remaining: 5.89s
200:	learn: 29365.1063138	total: 721ms	remaining: 1.61s
400:	learn: 26990.7928336	total: 1.49s	remaining: 926ms
600:	learn: 26437.8380619	total: 2.13s	remaining: 173ms
649:	learn: 26367.6222036	total: 2.27s	remaining: 0us
3fold :32554.079854610805
3average_fold :27860.226227900

<a name="Predictions"></a>
## 6. Making predictions of the test set and creating a submission file

In [15]:
# Ensemble the two models with a weighted average (60% CatBoost, 40% XGBoost)
predictions = np.asarray(cat_preds) * 0.6 + np.asarray(xgb_preds) * 0.4
# Build the submission from a copy of the sample-submission template
sub_file = samplesubmission.copy()
# Clip the predictions to [0, 10,000,000] so no negative or extreme value is
# submitted. Item assignment is used because attribute assignment
# (sub_file.target = ...) cannot create the column if it is missing.
sub_file['target'] = np.clip(predictions, 0, 10000000)
# Write the submission file
sub_file.to_csv("ZimnatInsurance.csv", index=False)

<a name="mistakes"></a>
## 7. What I didn't do ##
 - Use more data - policies data
 - Thorough EDA and domain knowledge sourcing
 - trust my local validation rather than the public leaderboard
 - division of labour amongst my teammates


<a name="hinder"></a>
## 8. What hindered me  ##
 - erratic power supply
 - erratic power supply
 - poor internet connectivity

### Thanks for your time. You can contact me if you have any questions or suggestions.