<a name="Libraries"></a>
## 1. Importing libraries


In [None]:
# Install the category_encoders package into the notebook runtime.
# NOTE(review): no visible cell imports category_encoders — confirm it is still needed.
!pip install category_encoders



In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
pd.options.display.float_format = '{:.5f}'.format

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used when loading data
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<a name="Data"></a>
## 2. Loading data

In [None]:
# Load files
# Every competition CSV sits in the same Drive folder, so the folder is named
# once and each path is built from it instead of repeating the absolute prefix.
DATA_DIR = '/content/drive/MyDrive/machine learning competitions/umojahack-africa-2022-intermediate-challenge'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
policies = pd.read_csv(f'{DATA_DIR}/policies.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
samplesubmission = pd.read_csv(f'{DATA_DIR}/SampleSubmission.csv')

# Preview train dataset
train.head()

Unnamed: 0,claim_id,policy_number,claim_number,sum_insured,product,agent,class_of_business,risk_type,loss_date,client_type,renewal_frequency,primary_cause,secondary_cause,branch,target
0,ID_JZ7QWRHH57RO,pol000009680,claim0011359,300000.0,prod00027,ag00068,cob00031,rt00006,2021-05-13,ct0003,rf0001,pc0007,sc00022,br00006,82150.0
1,ID_KQ5TJMVLJ4EP,pol000006869,claim0005272,16000.0,prod00005,ag00226,cob00010,rt00032,2018-05-13,ct0003,rf0001,pc0007,sc00023,br00002,3780.24
2,ID_NYHI7WJGGIAE,pol000006135,claim0004147,3000.0,prod00029,ag00037,cob00010,rt00026,2018-08-18,ct0003,rf0001,pc0007,sc00024,br00002,2196.5
3,ID_ROMMDCXYUXN5,pol000008991,claim0009962,6000.0,prod00005,ag00037,cob00009,rt00032,2018-12-20,ct0003,rf0001,pc0007,sc00021,br00001,105.67
4,ID_2OTD9NX8L73D,pol000007768,claim0007346,26850.0,prod00015,ag00226,cob00034,rt00023,2018-11-07,ct0001,rf0001,pc0007,sc00021,br00001,1605.64


In [None]:
# Tag each frame with its origin (0 = train, 1 = test) so the stacked frame
# can be split back into its two parts after feature engineering.
for frame, origin_flag in ((train, 0), (test, 1)):
    frame['source'] = origin_flag
all_data = pd.concat([train, test])

In [None]:
# Categorical columns that are target-encoded and then one-hot encoded below.
cat_columns = [
    'policy_number',
    'product',
    'agent',
    'class_of_business',
    'risk_type',
    'client_type',
    'renewal_frequency',
    'primary_cause',
    'secondary_cause',
    'branch',
]

In [None]:
# Target-mean encoding: for every categorical column add an 'encoded<column>'
# feature holding the mean training target of that row's category.
# Categories that never occur in train get NaN from the left merge.
# NOTE(review): the means use all training rows, so each training row's own
# target contributes to its encoded value (target leakage) — consider
# computing the encoding out-of-fold.
for col in cat_columns:
    # Select the target *before* aggregating: averaging the whole frame fails
    # on the string columns under pandas >= 2.0 (numeric_only defaults to False).
    target_means = train.groupby(col)['target'].mean()
    encoding = target_means.rename('encoded' + col).reset_index()
    all_data = all_data.merge(encoding, how='left', on=col)

In [None]:
# One-hot encode the categorical columns.
# dtype is pinned to uint8 (the default before pandas 2.0): newer pandas emits
# bool dummies, and a frame mixing bool and float columns turns into an
# object-dtype array when `.values` is taken for model fitting.
all_data = pd.get_dummies(all_data, columns=cat_columns, dtype='uint8')

Outliers are data points that differ significantly from the other observations in a given dataset.

Suggestions on how to handle outliers:
 - Transforming the outliers by scaling - log transformation, box-cox transformation ...
 - Dropping outliers
 - Imputation by replacing outliers with mean, median ...


<a name="Datatypes"></a>
## 6. Dataset datatypes

In [None]:
# Parse loss_date into datetime values; entries that cannot be parsed are coerced to NaT
all_data['loss_date'] = pd.to_datetime(all_data['loss_date'], errors='coerce')

In [None]:
# Derive calendar features (day of week, month, year) from the loss date column
loss_date_parts = all_data['loss_date'].dt
all_data['loss_date_day_of_week'] = loss_date_parts.dayofweek
all_data['loss_date_month'] = loss_date_parts.month
all_data['loss_date_year'] = loss_date_parts.year

In [None]:
# Show the columns that are still object-typed after encoding
all_data.select_dtypes(include=object)

Unnamed: 0,claim_id,claim_number
0,ID_JZ7QWRHH57RO,claim0011359
1,ID_KQ5TJMVLJ4EP,claim0005272
2,ID_NYHI7WJGGIAE,claim0004147
3,ID_ROMMDCXYUXN5,claim0009962
4,ID_2OTD9NX8L73D,claim0007346
...,...,...
13390,ID_F08UFUIVVMR8,claim0004124
13391,ID_KODD3CK5SV0X,claim0014092
13392,ID_59003DPSZQ16,claim0003914
13393,ID_B0C9XF5SIHGO,claim0010854


In [None]:
# Collect near-constant feature columns (standard deviation below 0.1) for removal.
# Identifier, date and target columns are excluded from the scan.
train_feats = all_data.columns.difference(['claim_id', 'loss_date', 'target','claim_number']).tolist()
to_be_dropped = [feature for feature in train_feats if all_data[feature].std() < 0.1]

In [None]:
# Remove the low-variance columns, then replace every remaining missing value
# with the sentinel -999.
all_data = all_data.drop(columns=to_be_dropped).fillna(-999)

In [None]:
# Pick the model feature columns, then split the combined frame back into
# its train (source == 0) and test (source == 1) rows.
non_feature_columns = ['claim_id', 'loss_date', 'target','claim_number','source']
train_feats = all_data.columns.difference(non_feature_columns).tolist()
source_flag = all_data['source']
train = all_data[source_flag == 0]
test = all_data[source_flag == 1]

<a name="Dates"></a>
## 8. Date features EDA

<a name="Modelling"></a>
## 12.  Modelling

In [None]:
# Feature matrix and target vector for the labelled rows
X = train[train_feats]
y = train['target']

# Hold out 30% of the labelled rows (fixed seed) for a quick validation check
holdout_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = holdout_parts

In [None]:
from sklearn.model_selection import KFold,GroupKFold
import statistics
def cv(model, n_splits=5, test_features=None):
    """Cross-validate ``model`` and average its per-fold test-set predictions.

    For each fold of a shuffled K-fold split over the notebook-level ``X`` and
    ``y``, the model is refit on the training part, scored with mean absolute
    error on the held-out part, and used to predict the test rows. Fold
    scores, the running average and the overall out-of-fold MAE are printed.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        Refit in place on every fold, so after the call it holds the fit of
        the last fold.
    n_splits : int, default 5
        Number of folds.
    test_features : pandas.DataFrame, optional
        Rows to predict with each fold's model; must contain every column in
        ``train_feats``. Defaults to the notebook-level ``test`` frame.

    Returns
    -------
    list of float
        For every test row, the mean of the predictions of the fold models.
    """
    if test_features is None:
        # The original read the global `test_df`, which is only created in the
        # submission cell *after* cv() has been called. `test` and
        # `train_feats` exist before this function is defined.
        test_features = test
    test_matrix = test_features[train_feats].values
    feature_matrix = X.values

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=12)
    fold_errors = []
    fold_test_preds = []
    oof_preds = np.zeros(X.shape[0])
    oof_y = np.zeros(X.shape[0])

    for fold_no, (train_indices, test_indices) in enumerate(folds.split(X, y), start=1):
        train_x, valid_x = feature_matrix[train_indices], feature_matrix[test_indices]
        train_y, valid_y = y.iloc[train_indices], y.iloc[test_indices]
        model.fit(train_x, train_y)

        # Predict the held-out part once and reuse it for the score and the OOF buffer.
        valid_preds = model.predict(valid_x)
        score = mean_absolute_error(valid_y, valid_preds)
        fold_errors.append(score)
        oof_preds[test_indices] = valid_preds
        oof_y[test_indices] = valid_y
        print(str(fold_no)+'fold :'+str(score))
        print(str(fold_no)+'average_fold :'+str(np.mean(fold_errors)))

        fold_test_preds.append(model.predict(test_matrix))

    # Average across folds (axis 0) to get one prediction per test row.
    predictions = np.mean(np.asarray(fold_test_preds, dtype=float), axis=0).tolist()
    print('oof_score'+str(mean_absolute_error(oof_y, oof_preds)))
    return predictions

In [None]:
from xgboost import XGBRegressor

# Shallow gradient-boosted trees with row and column subsampling
xgb = XGBRegressor(n_estimators=300, max_depth=3, random_state=12,
                   learning_rate=0.08, colsample_bytree=0.8, subsample=0.8)
xgb.fit(X_train, y_train)

# Make predictions on the hold-out split
y_pred = xgb.predict(X_test)

# cv() refits `xgb` on every fold; y_pred above was taken from the hold-out fit first
xgb_preds = cv(xgb)

# Check the MAE score of the model (the label previously said LinearRegression)
print(f'XGBRegressor MAE score on the X_test is: {mean_absolute_error(y_test, y_pred)}')

1fold :32553.492109781524
1average_fold :32553.492109781524


KeyError: ignored

In [None]:
# Install catboost (imported in the next cell) into the notebook runtime.
!pip install catboost --q

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor trained directly on the MAE loss, with Bernoulli row
# subsampling and per-level column subsampling; progress is logged every 100 iterations.
cat = CatBoostRegressor(
    loss_function='MAE',
    learning_rate=0.08,
    iterations=650,
    depth=3,
    subsample=0.8,
    bootstrap_type='Bernoulli',
    colsample_bylevel=0.8,
    random_seed=42,
    verbose=100,
)
cat.fit(X_train, y_train)

# Make predictions on the hold-out split
y_pred = cat.predict(X_test)

# cv() refits `cat` on every fold; y_pred above was taken from the hold-out fit first
cat_preds = cv(cat)

# Check the MAE score of the model (the label previously said LinearRegression)
print(f'CatBoostRegressor MAE score on the X_test is: {mean_absolute_error(y_test, y_pred)}')

0:	learn: 41808.0317886	total: 2.36ms	remaining: 1.53s
100:	learn: 34880.3241174	total: 182ms	remaining: 989ms
200:	learn: 30530.5733730	total: 358ms	remaining: 799ms
300:	learn: 29326.9674776	total: 546ms	remaining: 633ms
400:	learn: 28748.4208453	total: 717ms	remaining: 445ms
500:	learn: 28329.9024956	total: 903ms	remaining: 268ms
600:	learn: 28011.4193925	total: 1.07s	remaining: 87.7ms
649:	learn: 27759.5066907	total: 1.16s	remaining: 0us
0:	learn: 41560.3072617	total: 2.1ms	remaining: 1.36s
100:	learn: 34647.7941780	total: 191ms	remaining: 1.04s
200:	learn: 30972.9730829	total: 397ms	remaining: 886ms
300:	learn: 29810.3169984	total: 588ms	remaining: 681ms
400:	learn: 28298.7821963	total: 780ms	remaining: 484ms
500:	learn: 27911.5797815	total: 970ms	remaining: 289ms
600:	learn: 27632.4601833	total: 1.16s	remaining: 94.2ms
649:	learn: 27545.3269995	total: 1.24s	remaining: 0us
1fold :27543.29831117191
1average_fold :27543.29831117191


KeyError: ignored

<a name="Predictions"></a>
## 13. Making predictions of the test set and creating a submission file

In [None]:
# Make prediction on the test set
test_df = test[train_feats]

# Weighted blend of the fold-averaged predictions of the two models
CAT_WEIGHT, XGB_WEIGHT = 0.6, 0.4
predictions = np.array(cat_preds)*CAT_WEIGHT + np.array(xgb_preds)*XGB_WEIGHT

# Create a submission file
sub_file = samplesubmission.copy()
# Item assignment always writes the column. Attribute assignment
# (`sub_file.target = ...`) only does so when 'target' already exists;
# otherwise it sets a plain Python attribute and the column is never written.
# Predictions are clipped to the range [0, 10000000].
sub_file['target'] = np.clip(predictions, 0, 10000000)

 - There are some outliers in our predictions. 
 - More inspection/data preprocessing is needed

In [None]:
# Save the submission (without the index column) as a CSV file for upload to Zindi
sub_file.to_csv(path_or_buf="0.25blendkfold.csv", index=False)

<a name="Tips"></a>
## 1. More Tips
- Use more data - policies data
- Thorough EDA and domain knowledge sourcing
- Re-group Categorical features 
- More Feature Engineering 
- Ensembling of models 
- Cross-validation: Group folds, Stratified...

# ******************* HAPPY HACKING ***************************