In [None]:
import pandas as pd

# Load the dataset from the CSV file at data_path
data_path = '/content/insurance.csv'
data = pd.read_csv(data_path)

# Summarize column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() would emit a stray "None" line.
data.info()

# Preview the first 5 rows
data.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per categorical column, so every mapping stays separately inspectable
le_sex, le_smoker, le_region = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Fit each encoder on its column and store the integer codes in a new '<column>_encoded' column
data['sex_encoded'] = le_sex.fit_transform(data['sex'])
data['smoker_encoded'] = le_smoker.fit_transform(data['smoker'])
data['region_encoded'] = le_region.fit_transform(data['region'])

# Show the learned mappings
# (a category's position in the printed list is its integer code)
print('sex column encoding mapping : %s' % list(le_sex.classes_))
print('smoker column encoding mapping : %s' % list(le_smoker.classes_))
print('region column encoding mapping : %s' % list(le_region.classes_))

# Preview the frame with the encoded columns appended
data.head(5)

sex column encoding mapping : ['female', 'male']
smoker column encoding mapping : ['no', 'yes']
region column encoding mapping : ['northeast', 'northwest', 'southeast', 'southwest']


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_encoded,smoker_encoded,region_encoded
0,19,female,27.9,0,yes,southwest,16884.924,0,1,3
1,18,male,33.77,1,no,southeast,1725.5523,1,0,2
2,28,male,33.0,3,no,southeast,4449.462,1,0,2
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1
4,32,male,28.88,0,no,northwest,3866.8552,1,0,1


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One hot encoder initialization
ohe_region = OneHotEncoder()

# One hot encoding (OHE) of the integer region codes, converted to a dense array
# (the encoder expects a 2-D input, hence the reshape to a single column)
arr_ohe_region = ohe_region.fit_transform(data.region_encoded.values.reshape(-1,1)).toarray()

# Wrap the array in a dataframe with one 'region_<i>' column per category.
# It reuses data's index because concat(axis=1) aligns rows on the index.
dfOneHot = pd.DataFrame(
    arr_ohe_region,
    columns=['region_'+str(i) for i in range(arr_ohe_region.shape[1])],
    index=data.index,
)

# Append the one-hot columns. Any 'region_<i>' columns left over from a previous
# run of this cell are dropped first, so re-running does not duplicate them.
data = pd.concat([data.drop(columns=dfOneHot.columns, errors='ignore'), dfOneHot], axis=1)

# See the preprocessing result
data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_encoded,smoker_encoded,region_encoded,region_0,region_1,region_2,region_3
0,19,female,27.9,0,yes,southwest,16884.924,0,1,3,0.0,0.0,0.0,1.0
1,18,male,33.77,1,no,southeast,1725.5523,1,0,2,0.0,0.0,1.0,0.0
2,28,male,33.0,3,no,southeast,4449.462,1,0,2,0.0,0.0,1.0,0.0
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1,0.0,1.0,0.0,0.0
4,32,male,28.88,0,no,northwest,3866.8552,1,0,1,0.0,1.0,0.0,0.0


In [None]:
# Remove the raw string categoricals and the intermediate integer region code,
# leaving the numeric columns in a new frame (data itself is not modified)
preprocessed_data = data.drop(columns=['sex', 'smoker', 'region', 'region_encoded'])

# Preview the final preprocessed frame
preprocessed_data.head(5)

Unnamed: 0,age,bmi,children,charges,sex_encoded,smoker_encoded,region_0,region_1,region_2,region_3
0,19,27.9,0,16884.924,0,1,0.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,1,0,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,1,0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,1,0,0.0,1.0,0.0,0.0
4,32,28.88,0,3866.8552,1,0,0.0,1.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split
# Split the dataset into training (80%) and testing (20%) subsets.
# random_state pins the shuffle so the split -- and every number computed
# from it afterwards -- is identical on each re-run of the notebook.
train, test = train_test_split(preprocessed_data, test_size=0.2, random_state=0)

# Split the feature and the target ('charges') into NumPy arrays
train_y = train.charges.values
train_x = train.drop(columns=['charges']).values
test_y = test.charges.values
test_x = test.drop(columns=['charges']).values


In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only,
# then apply that same fitted scaler to both splits
sc = StandardScaler()
sc.fit(train_x)
train_x = sc.transform(train_x)
test_x = sc.transform(test_x)

In [None]:
print(train_x) # Visual sanity check of the training features after scaling

[[-0.7654396  -0.66050351  0.75265798 ... -0.56223942  1.65173333
  -0.57087511]
 [ 0.45387438  0.06599382  0.75265798 ... -0.56223942 -0.60542461
   1.75169662]
 [-1.05233701  0.88681996 -0.89090128 ... -0.56223942  1.65173333
  -0.57087511]
 ...
 [-1.48268312  1.43293413 -0.89090128 ... -0.56223942  1.65173333
  -0.57087511]
 [ 0.66904744 -0.26829424 -0.06912165 ...  1.77860173 -0.60542461
  -0.57087511]
 [ 1.67318836 -0.4883948  -0.06912165 ...  1.77860173 -0.60542461
  -0.57087511]]


In [None]:
# Dimensions of the training feature matrix at this point in the notebook
print(train_x.shape)

(1070, 9)


In [None]:
from sklearn.decomposition import PCA
# Reduce the features to two principal components. The projection is fitted on
# the training split and then reused unchanged for the test split.
pca = PCA(n_components=2)
train_x, test_x = pca.fit_transform(train_x), pca.transform(test_x)

In [None]:
# Dimensions of the training feature matrix at this point in the notebook
print(train_x.shape)

(1070, 2)


In [None]:
# Wrap the training features in a labelled two-column dataframe for inspection
principalDf = pd.DataFrame(
    train_x,
    columns=['principal component 1', 'principal component 2'],
)
print(principalDf)

      principal component 1  principal component 2
0                  1.041125              -0.653678
1                  0.053285               2.082825
2                  1.878230              -0.791684
3                 -0.456325               1.833223
4                 -1.728930              -0.095085
...                     ...                    ...
1065              -1.173930              -0.176977
1066              -0.204463               0.176474
1067               2.011528              -0.756199
1068              -1.052620               0.015687
1069              -0.983891               0.020364

[1070 rows x 2 columns]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.ensemble import RandomForestRegressor

# The target (train_y / test_y) holds continuous 'charges' amounts, so this is a
# regression task. A classifier's fit() rejects continuous targets with a
# ValueError, hence a RandomForestRegressor is used instead.
# The name `clf` is kept so anything else referring to it keeps working.
clf = RandomForestRegressor(n_estimators=100, random_state=0)

# Train the model on the training set
clf.fit(train_x, train_y)

# Prediction on test set
y_pred = clf.predict(test_x)

# Accuracy is not meaningful for continuous predictions; report regression
# metrics instead: R^2 and the mean absolute error (in the units of the target).
print("R^2:", metrics.r2_score(test_y, y_pred))
print("Mean absolute error:", metrics.mean_absolute_error(test_y, y_pred))

ValueError: ignored