In [1]:
import pandas as pd
X = pd.read_csv('train_data.csv')
X_test = pd.read_csv('test_data.csv')

In [2]:
X.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [3]:
print(X.shape)
print(X_test.shape)

(7160, 14)
(3069, 13)


In [4]:

# Number of missing values in each column of training data
missing_val_count_by_column = (X.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Garden                  7
Building Dimension    106
Date_of_Occupancy     508
Geo_Code              102
dtype: int64


In [5]:
from sklearn.impute import SimpleImputer


In [6]:
X.groupby('Claim').mean()

Unnamed: 0_level_0,YearOfObservation,Insured_Period,Residential,Building Dimension,Building_Type,Date_of_Occupancy
Claim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2013.681868,0.898193,0.28954,1514.948152,2.128664,1964.078906
1,2013.627907,0.94887,0.359241,3125.703406,2.380049,1965.718016


In [7]:
y = X['Claim']
X.drop(['Claim'], axis=1, inplace=True)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [9]:
categorical_cols = ['Building_Painted','Building_Fenced','Garden','Settlement']

In [10]:
numerical_cols = ['Insured_Period','Residential','Building Dimension','Building_Type']

In [11]:
numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [12]:
from xgboost import XGBClassifier

In [13]:
model = XGBClassifier(n_estimators=800, learning_rate=0.5)

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [14]:
from sklearn.model_selection import train_test_split


In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

#fitting the model and using it to predict X_test
clf.fit(X_train, y_train)
pred_test = clf.predict(X_valid)



In [16]:
from sklearn.metrics import roc_auc_score

In [17]:
 print("Model Accuracy: ", roc_auc_score(y_valid, pred_test))

Model Accuracy:  0.6012413687572923


In [18]:
clf.fit(X, y)
predictions = clf.predict(X_test)

In [19]:
output = pd.DataFrame()
output['Customer Id'] = X_test['Customer Id']
output['Claim'] = predictions

output.to_csv('submission8.csv', index=False)
output.head()

Unnamed: 0,Customer Id,Claim
0,H11920,0
1,H11921,0
2,H9805,0
3,H7493,0
4,H7494,0
