In [129]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk


In [130]:
# Load the raw insurance records from the CSV file (path is relative to the notebook).
dataset = pd.read_csv(filepath_or_buffer="../Dataset/insurance.csv")


In [131]:
# `read_csv` already returns a DataFrame, so take an explicit copy instead of
# re-wrapping it: later edits to `df` then cannot touch the raw `dataset`.
df = dataset.copy()

In [132]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [133]:
# Split the frame positionally: every column except the last is a feature,
# the last column is the prediction target.
# `.copy()` makes X an independent frame, so the column assignments performed
# on X in later cells are not writes into a slice of `df`
# (the situation pandas reports as SettingWithCopyWarning).
X = df.iloc[:, :-1].copy()  # feature columns
y = df.iloc[:, -1]  # target column

In [134]:
# Plain-text dump of the feature frame.
print(repr(X))

      age     sex     bmi  children smoker     region
0      19  female  27.900         0    yes  southwest
1      18    male  33.770         1     no  southeast
2      28    male  33.000         3     no  southeast
3      33    male  22.705         0     no  northwest
4      32    male  28.880         0     no  northwest
...   ...     ...     ...       ...    ...        ...
1333   50    male  30.970         3     no  northwest
1334   18  female  31.920         0     no  northeast
1335   18  female  36.850         0     no  southeast
1336   21  female  25.800         0     no  southwest
1337   61  female  29.070         0    yes  northwest

[1338 rows x 6 columns]


In [135]:
# Plain-text dump of the target series.
print(repr(y))

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


In [136]:
# Count the missing values in each feature column.
X.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [137]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [138]:
import warnings
warnings.filterwarnings('ignore')

In [139]:
# Make sure the numeric feature columns really hold numeric dtypes.
# Entries that cannot be parsed become NaN (errors='coerce'), so they are
# picked up by the missing-value handling that follows.
# `assign` builds a new frame and rebinds X instead of writing columns into
# a frame that was sliced from `df`, which avoids chained-assignment problems.
numeric_columns = ['bmi', 'age', 'children']
X = X.assign(**{
    column: pd.to_numeric(X[column], errors='coerce')
    for column in numeric_columns
})

X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [140]:
# Impute missing values: the column mean for `age` and `bmi`, the most
# frequent value (mode) for every other feature.
# A single DataFrame-level fillna replaces the per-column
# `X[col].fillna(..., inplace=True)` calls: those operate on a temporary
# Series, are deprecated as chained in-place assignment, and no longer update
# `X` at all once pandas copy-on-write is active.
fill_values = {
    'age': X['age'].mean(),
    'sex': X['sex'].mode()[0],
    'region': X['region'].mode()[0],
    'bmi': X['bmi'].mean(),
    'children': X['children'].mode()[0],
    'smoker': X['smoker'].mode()[0],
}
X = X.fillna(value=fill_values)

# Confirm that no missing values remain, then show the frame.
print(X.isnull().sum())
X

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64


Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [141]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the two binary categorical columns.
# - The encoders are fitted on the cleaned columns of X (not on the raw `df`),
#   so the imputation done above is not bypassed.
# - Each column gets its own encoder, kept in `label_encoders`, so both fitted
#   mappings remain available (e.g. for `inverse_transform`).
label_encoders = {column: LabelEncoder() for column in ("sex", "smoker")}
X = X.assign(**{
    column: encoder.fit_transform(X[column])
    for column, encoder in label_encoders.items()
})

# `le` previously ended up fitted on `smoker`; keep that binding for any
# later code that still refers to it.
le = label_encoders["smoker"]

# One-hot encode `region`, keeping every category as its own indicator column.
X = pd.get_dummies(X, columns=['region'], drop_first=False)

In [142]:
# Cast the four one-hot region indicator columns to integers.
region_columns = [
    "region_northeast",
    "region_northwest",
    "region_southeast",
    "region_southwest",
]
for column in region_columns:
    X[column] = X[column].astype(int)

In [143]:
# Display the feature frame after label encoding and one-hot expansion of `region`.
X

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0,0,0,1
1,18,1,33.770,1,0,0,0,1,0
2,28,1,33.000,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.880,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0,1,0,0
1334,18,0,31.920,0,0,1,0,0,0
1335,18,0,36.850,0,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,0,1


In [144]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [145]:
# Plain-text dump of the training features.
print(repr(X_train))

      age  sex     bmi  children  smoker  region_northeast  region_northwest  \
559    19    1  35.530         0       0                 0                 1   
273    50    1  27.455         1       0                 1                 0   
471    18    0  30.115         0       0                 1                 0   
22     18    1  34.100         0       0                 0                 0   
939    53    1  29.480         0       0                 0                 0   
...   ...  ...     ...       ...     ...               ...               ...   
1180   42    0  41.325         1       0                 1                 0   
1147   20    0  31.920         0       0                 0                 1   
527    51    0  25.800         1       0                 0                 0   
1149   42    1  34.100         0       0                 0                 0   
1289   44    1  34.320         1       0                 0                 0   

      region_southeast  region_southwes

In [146]:
# Plain-text dump of the test features.
print(repr(X_test))

      age  sex     bmi  children  smoker  region_northeast  region_northwest  \
7      37    0  27.740         3       0                 0                 1   
999    36    0  26.885         0       0                 0                 1   
1209   59    1  37.100         1       0                 0                 0   
491    61    0  25.080         0       0                 0                 0   
625    29    0  26.030         0       0                 0                 1   
...   ...  ...     ...       ...     ...               ...               ...   
271    50    1  34.200         2       1                 0                 0   
695    26    0  40.185         0       0                 0                 1   
1039   19    1  27.265         2       0                 0                 1   
323    57    1  40.945         0       0                 1                 0   
835    42    1  35.970         2       0                 0                 0   

      region_southeast  region_southwes

In [147]:
# Plain-text dump of the training targets.
print(repr(y_train))

559     1646.42970
273     9617.66245
471     2203.47185
22      1137.01100
939     9487.64420
           ...    
1180    7650.77375
1147    2261.56880
527     9861.02500
1149    5979.73100
1289    7147.47280
Name: charges, Length: 1070, dtype: float64


In [148]:
# Plain-text dump of the test targets.
print(repr(y_test))

7        7281.50560
999      5267.81815
1209    12347.17200
491     24513.09126
625      3736.46470
           ...     
271     42856.83800
695      3201.24515
1039    22493.65964
323     11566.30055
835      7160.33030
Name: charges, Length: 268, dtype: float64


In [149]:
# Display the full encoded feature frame (the complete data set, not one of the splits).
X

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0,0,0,1
1,18,1,33.770,1,0,0,0,1,0
2,28,1,33.000,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.880,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0,1,0,0
1334,18,0,31.920,0,0,1,0,0,0
1335,18,0,36.850,0,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,0,1


In [150]:
# Display the full target series.
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [151]:
# Min-max feature scaling
from sklearn.preprocessing import MinMaxScaler

# Switch both splits from DataFrames to NumPy arrays.
X_train, X_test = np.array(X_train), np.array(X_test)

cols = [0, 2]  # positional indices of the `age` and `bmi` columns
mm = MinMaxScaler()

# Learn the min/max from the training rows only, then apply that same
# scaling to both splits so no test-set statistics leak into the scaler.
mm.fit(X_train[:, cols].astype(float))
X_train[:, cols] = mm.transform(X_train[:, cols].astype(float))
X_test[:, cols] = mm.transform(X_test[:, cols].astype(float))

In [152]:
# Display the training feature array after scaling.
X_train

array([[0.02173913, 1.        , 0.52649987, ..., 1.        , 0.        ,
        0.        ],
       [0.69565217, 1.        , 0.30925478, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.38081786, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.7173913 , 0.        , 0.26472962, ..., 0.        , 0.        ,
        1.        ],
       [0.52173913, 1.        , 0.48802798, ..., 0.        , 0.        ,
        1.        ],
       [0.56521739, 1.        , 0.49394673, ..., 0.        , 1.        ,
        0.        ]], shape=(1070, 9))

In [153]:
# Display the test feature array after scaling.
X_test

array([[0.41304348, 0.        , 0.31692225, ..., 1.        , 0.        ,
        0.        ],
       [0.39130435, 0.        , 0.29391983, ..., 1.        , 0.        ,
        0.        ],
       [0.89130435, 1.        , 0.56873823, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.02173913, 1.        , 0.30414313, ..., 1.        , 0.        ,
        0.        ],
       [0.84782609, 1.        , 0.67218187, ..., 0.        , 0.        ,
        0.        ],
       [0.52173913, 1.        , 0.53833737, ..., 0.        , 1.        ,
        0.        ]], shape=(268, 9))

In [154]:
import joblib

# Persist the four splits as pickle files; the relative names resolve against
# the current working directory. The final call stays last so its return
# value remains the cell's displayed output.
joblib.dump(value=X_train, filename="X_train.pkl")
joblib.dump(value=X_test, filename="X_test.pkl")
joblib.dump(value=y_train, filename="y_train.pkl")
joblib.dump(value=y_test, filename="y_test.pkl")


['y_test.pkl']