In [1]:
# Importing libraries
#
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("train_data.csv")
data.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [3]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YearOfObservation,7160.0,2013.669553,1.383769,2012.0,2012.0,2013.0,2015.0,2016.0
Insured_Period,7160.0,0.909758,0.239756,0.0,0.997268,1.0,1.0,1.0
Residential,7160.0,0.305447,0.460629,0.0,0.0,0.0,1.0,1.0
Building Dimension,7054.0,1883.72753,2278.157745,1.0,528.0,1083.0,2289.75,20940.0
Building_Type,7160.0,2.186034,0.940632,1.0,2.0,2.0,3.0,4.0
Date_of_Occupancy,6652.0,1964.456404,36.002014,1545.0,1960.0,1970.0,1980.0,2016.0
Claim,7160.0,0.228212,0.419709,0.0,0.0,0.0,0.0,1.0


In [4]:
data.isnull().sum()

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Geo_Code              102
Claim                   0
dtype: int64

In [5]:
data = data.drop('NumberOfWindows', axis = 1)

In [6]:
empty = ['Garden', 'Building Dimension', 'Date_of_Occupancy', 'Geo_Code']
for col in empty:
    maxx = data[col].value_counts().argmax()
    emp = data[data[col].isnull()]
    for index, row in emp.iterrows():
        data.at[index, col] = maxx

In [7]:
data.columns

Index(['Customer Id', 'YearOfObservation', 'Insured_Period', 'Residential',
       'Building_Painted', 'Building_Fenced', 'Garden', 'Settlement',
       'Building Dimension', 'Building_Type', 'Date_of_Occupancy', 'Geo_Code',
       'Claim'],
      dtype='object')

In [8]:
categorical_features = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Geo_Code', 'Claim']

data['Building_Painted'] = data.Building_Painted.astype('category')
data['Building_Fenced'] = data.Building_Fenced.astype('category')
data['Garden'] = data.Garden.astype('category')
data['Settlement'] = data.Settlement.astype('category')
data['Geo_Code'] = data.Geo_Code.astype('category')
data['Claim'] = data.Claim.astype('category')

In [9]:
for feature in categorical_features:
    data[feature] = data[feature].cat.codes

In [10]:
data.dtypes

Customer Id            object
YearOfObservation       int64
Insured_Period        float64
Residential             int64
Building_Painted         int8
Building_Fenced          int8
Garden                   int8
Settlement               int8
Building Dimension    float64
Building_Type           int64
Date_of_Occupancy     float64
Geo_Code                int16
Claim                    int8
dtype: object

In [22]:
data.head(10)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,Geo_Code,Claim
0,H14663,2013,1.0,0,0,1,1,1,290.0,1,1960.0,7,0
1,H2037,2015,1.0,0,1,0,0,0,490.0,1,1850.0,7,0
2,H3802,2014,1.0,0,0,1,1,1,595.0,1,1960.0,7,0
3,H3834,2013,1.0,0,1,1,1,1,2840.0,1,1960.0,7,0
4,H5053,2014,1.0,0,1,0,0,0,680.0,1,1800.0,7,0
5,H4977,2012,1.0,0,1,0,0,0,535.0,1,1980.0,14,0
6,H7390,2012,1.0,0,0,1,1,1,2830.0,1,1988.0,14,0
7,H14488,2015,1.0,0,0,1,1,1,4952.0,1,1988.0,16,0
8,H19355,2014,1.0,0,1,0,0,0,2735.0,1,2013.0,17,1
9,H18601,2015,1.0,0,1,0,0,0,520.0,1,2011.0,26,0


In [23]:
# Using different models to find the optimal model
#
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Creating a list of regressor algorithms to compare with
#
models = [SVC(), DecisionTreeClassifier(), MLPClassifier(), KNeighborsClassifier(),\
          LogisticRegression()]


# Creating lists of the algorithms, to store the accuracy scores of each fold
#
svc, dt, mlp, knn, linear= ([] for x in range(5))


# Creating a list containig the list of each algorithm. Created for easy iteration
#
model_list = [svc, dt, mlp, knn, linear]


# Spliting the data into features and the target variable
#
X = data.drop(['Customer Id', 'Claim'], axis = 1)
y = data['Claim']


# Creating a cross validation of 10 folds
#
kfold  = KFold(n_splits=10, random_state=101)


# Iterating through each model and appending the scores of each fold to the appriopriate list
#
for i, j in zip(models, model_list):
  j.extend(list(cross_val_score(i, X, y, scoring = 'neg_mean_squared_error', cv = kfold)))

  
# Creating a function to convert neg_mean_squared_error to a square root
#
def sq(lis):
  new_lis = []
  lis = np.array(lis)
  for i in lis:
    i = np.sqrt(i*-1)
    new_lis.append(i)
  return new_lis


# Creating a dataframe of all the rmses from the iterations for each model
#
rmses = pd.DataFrame({'Fold': np.arange(1, 11), 'svc': sq(svc), 'dt': sq(dt), 'mlp': sq(mlp), 'knn': sq(knn),\
                       'linear': sq(linear)})

# Setting the index
#
rmses.set_index('Fold', inplace = True)


# Calculating the mean and standard deviation rmse of each algorithm
#
rmses.loc['mean'] = rmses.mean()
rmses.loc['std'] = rmses.std()


# Previewing the rmses dataframe
#
rmses

Unnamed: 0_level_0,svc,dt,mlp,knn,linear
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.524538,0.601441,0.528516,0.538983,0.478592
2,0.448461,0.542856,0.529836,0.472719,0.445336
3,0.445336,0.546702,0.45618,0.462263,0.442189
4,0.469756,0.532466,0.46075,0.453108,0.450016
5,0.437425,0.527193,0.451565,0.46075,0.419497
6,0.454647,0.542856,0.457709,0.506935,0.454647
7,0.469756,0.582568,0.468267,0.551787,0.451565
8,0.541568,0.61066,0.54925,0.54925,0.525867
9,0.459232,0.555571,0.833985,0.492967,0.450016
10,0.515134,0.638609,0.513777,0.521868,0.512416


In [24]:
rand = KNeighborsClassifier()
rand.fit(X, y)

aa = data[ :5 ]
aa = aa.drop(['Customer Id', 'Claim'], axis=1)

In [25]:
aa.head()

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,Geo_Code
0,2013,1.0,0,0,1,1,1,290.0,1,1960.0,7
1,2015,1.0,0,1,0,0,0,490.0,1,1850.0,7
2,2014,1.0,0,0,1,1,1,595.0,1,1960.0,7
3,2013,1.0,0,1,1,1,1,2840.0,1,1960.0,7
4,2014,1.0,0,1,0,0,0,680.0,1,1800.0,7


In [26]:
rand.predict(aa)

array([0, 0, 0, 0, 0], dtype=int8)

In [15]:
rmses.loc['mean'].idxmin(), rmses.loc['mean'].min()

('GradientBoosting', 0.39562848618420454)

In [16]:
rmses.loc['mean'].sort_values()

GradientBoosting    0.395628
Linear              0.396909
Adaboost            0.399739
SVR                 0.422080
RandomForest        0.429911
Kneighbors          0.437646
DecisionTree        0.567980
Name: mean, dtype: float64

In [17]:
test = pd.read_csv("train_data.csv")
data.isnull().sum()

Customer Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
Geo_Code              0
Claim                 0
dtype: int64

In [18]:
test = test.drop('NumberOfWindows', axis = 1)

In [19]:
categorical_features = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Geo_Code']

test['Building_Painted'] = test.Building_Painted.astype('category')
test['Building_Fenced'] = test.Building_Fenced.astype('category')
test['Garden'] = test.Garden.astype('category')
test['Settlement'] = test.Settlement.astype('category')
test['Geo_Code'] = test.Geo_Code.astype('category')

In [20]:
for feature in categorical_features:
    test[feature] = test[feature].cat.codes

In [21]:
# Using the top three models; XGBoost, Catboost and Gradientboost to train and make predictions
# Creating a list of models to use
models = [SVC(), LogisticRegression()]
model_names = ['svc', 'linear']


# Submission dataset
#
sub = test.drop('Customer Id', axis = 1)


# Using a for loop to create a submission file for each model
#
for model, model_name in zip(models, model_names):
  regressor = model                      # instantiating the model
  regressor.fit(X, y)                    # Training the model
  predictions  = regressor.predict(sub)  # Making predictions
  submission_df = pd.DataFrame({'Customer Id': test['Customer Id'], 'target_output': predictions}) # Creating a submission file
  submission_df.to_csv(model_name + '_baseline.csv', index = False)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').