# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from sklearn.svm import SVR
from sklearn import metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Importing the dataset and printing the values in dataset

In [4]:
#import file
data= pd.read_csv(r"C:\Users\pbhal\OneDrive\Desktop\Ex_Files_Python_Predictive_Analytics\Ex_Files_Python_Predictive_Analytics\Exercise Files\Datasets\insurance.csv")

In [5]:
print(data.head(10))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
5   31  female  25.740         0     no  southeast   3756.62160
6   46  female  33.440         1     no  southeast   8240.58960
7   37  female  27.740         3     no  northwest   7281.50560
8   37    male  29.830         2     no  northeast   6406.41070
9   60  female  25.840         0     no  northwest  28923.13692


# Checking the number of missing values and the column that has missing values

In [6]:
#Check the number of missing values 
count_nan= data.isnull().sum()
print(count_nan[count_nan>0])

bmi    5
dtype: int64


# Treating the missing values by replacing them with mean value

In [7]:
#filling the missing values with the mean of the dataset
data['bmi'].fillna(data['bmi'].mean(), inplace=True)

In [8]:
#to check the number of missing values after preprocessing
count_nan= data.isnull().sum()
print(count_nan[count_nan>0])

Series([], dtype: int64)


# Converting Categorical data to numerical data using LABEL ENCODING and HOT ENCODING

In [9]:
#converting categorical values to numerical data using label enconding and one hot encoding

sex= data.iloc[:,1:2].values
smoker= data.iloc[:,4:5].values

le=LabelEncoder()
sex[:,0]= le.fit_transform(sex[:,0])
sex=pd.DataFrame(sex)
sex.columns = ['sex']
le_sex_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("sklearn label econding result for variable sex")
print( le_sex_mapping)
print(sex[:10])

smoker[:,0]= le.fit_transform(smoker[:,0])
smoker = pd.DataFrame(smoker)
smoker.columns = ['smoker']
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("sklearn label econding result for variable smoker")
print( le_smoker_mapping)
print(smoker[:10])

#one hot encondeing
region= data.iloc[:,5:6].values
ohe= OneHotEncoder()
region = ohe.fit_transform(region).toarray()
region = pd.DataFrame(region)
region.columns = ['Northeasst', 'Northwest', 'Southeast', 'SouthWest']
print("sklearn one hot econding result for variable region")
print(region[:10])

                         

sklearn label econding result for variable sex
{'female': 0, 'male': 1}
  sex
0   0
1   1
2   1
3   1
4   1
5   0
6   0
7   0
8   1
9   0
sklearn label econding result for variable smoker
{'no': 0, 'yes': 1}
  smoker
0      1
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
sklearn one hot econding result for variable region
   Northeasst  Northwest  Southeast  SouthWest
0         0.0        0.0        0.0        1.0
1         0.0        0.0        1.0        0.0
2         0.0        0.0        1.0        0.0
3         0.0        1.0        0.0        0.0
4         0.0        1.0        0.0        0.0
5         0.0        0.0        1.0        0.0
6         0.0        0.0        1.0        0.0
7         0.0        1.0        0.0        0.0
8         1.0        0.0        0.0        0.0
9         0.0        1.0        0.0        0.0


# Creating the Training and Test Data

In [10]:
#creating training and test data

x_num = data[['age', 'bmi', 'children']]
x_final = pd.concat([x_num, sex, smoker, region], axis= 1)
y_final = data[['charges']].copy()

x_train , x_test, y_train, y_test = train_test_split(x_final, y_final, test_size= 0.33, random_state= 0)

# Feature Scaling using normalization

In [11]:
#feature scalilng 
#using normalization
n_scaler = MinMaxScaler()
x_train = n_scaler.fit_transform(x_train.astype(np.float64))
x_test = n_scaler.transform(x_test.astype(np.float64))

In [12]:
x_train

array([[0.15217391, 0.49838579, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.02173913, 0.50417003, 0.4       , ..., 0.        , 0.        ,
        1.        ],
       [0.2173913 , 0.45843422, 0.6       , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.47826087, 0.24535916, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.02173913, 0.52649987, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.32608696, 0.06833468, 0.2       , ..., 0.        , 0.        ,
        1.        ]])

In [13]:
x_test

array([[0.73913043, 0.38310465, 0.2       , ..., 0.        , 0.        ,
        1.        ],
       [0.63043478, 0.36077482, 0.2       , ..., 0.        , 1.        ,
        0.        ],
       [0.65217391, 0.66195857, 0.4       , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.47826087, 0.45238095, 0.2       , ..., 0.        , 0.        ,
        0.        ],
       [0.15217391, 0.29163304, 0.6       , ..., 0.        , 0.        ,
        1.        ],
       [0.04347826, 0.29270917, 0.2       , ..., 0.        , 1.        ,
        0.        ]])

# Feature scaling using standard scaler

In [14]:
s_scaler = StandardScaler()
x_train = s_scaler.fit_transform(x_train.astype(np.float64))
x_test = s_scaler.transform(x_test.astype(np.float64))

In [15]:
x_train

array([[-1.03606235,  0.61468414, -0.91008919, ...,  1.77430216,
        -0.61002347, -0.58766091],
       [-1.46484381,  0.64957783,  0.7540739 , ..., -0.56360186,
        -0.61002347,  1.70166159],
       [-0.82167162,  0.37367425,  1.58615545, ..., -0.56360186,
         1.63928118, -0.58766091],
       ...,
       [ 0.03589131, -0.91171181, -0.91008919, ..., -0.56360186,
         1.63928118, -0.58766091],
       [-1.46484381,  0.78428369, -0.91008919, ...,  1.77430216,
        -0.61002347, -0.58766091],
       [-0.46435373, -1.97962095, -0.07800765, ..., -0.56360186,
        -0.61002347,  1.70166159]])

In [16]:
x_test

array([[ 0.89345423, -0.08075516, -0.07800765, ..., -0.56360186,
        -0.61002347,  1.70166159],
       [ 0.53613634, -0.21546103, -0.07800765, ..., -0.56360186,
         1.63928118, -0.58766091],
       [ 0.60759992,  1.60144516,  0.7540739 , ...,  1.77430216,
        -0.61002347, -0.58766091],
       ...,
       [ 0.03589131,  0.3371576 , -0.07800765, ..., -0.56360186,
        -0.61002347, -0.58766091],
       [-1.03606235, -0.63256231,  1.58615545, ..., -0.56360186,
        -0.61002347,  1.70166159],
       [-1.39338023, -0.62607047, -0.07800765, ..., -0.56360186,
         1.63928118, -0.58766091]])

# Performing Linear Regression

In [17]:
#Linear regression
lr = LinearRegression().fit(x_train, y_train)
y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)

lr.score(x_train, y_train)
lr.score(x_test, y_test)

0.7804031221891603

In [18]:
poly = PolynomialFeatures(degree= 3)
x_poly = poly.fit_transform(x_final)

x_train , x_test, y_train, y_test = train_test_split(x_final, y_final, test_size= 0.33, random_state= 0)

s_scaler = StandardScaler()
x_train = s_scaler.fit_transform(x_train.astype(np.float64))
x_test = s_scaler.transform(x_test.astype(np.float64))

poly_lr= LinearRegression().fit(x_train, y_train)
y_train_pred = poly_lr. predict(x_train)
y_test_pred = poly_lr.predict(x_test)

poly_lr.score(x_train, y_train)
poly_lr.score(x_test, y_test)


0.7855951871694039

# Performing Support Vector model

In [19]:
#support vector model

svr = SVR(kernel= 'poly', C= 300)

x_train , x_test, y_train, y_test = train_test_split(x_final, y_final, test_size= 0.33, random_state= 0)

s_scaler = StandardScaler()
x_train = s_scaler.fit_transform(x_train.astype(np.float64))
x_test = s_scaler.transform(x_test.astype(np.float64))

svr = svr.fit(x_train, y_train.values.ravel())
y_train_pred = svr.predict(x_train)
y_test_pred = svr.predict(x_test)

print('svr train score %.3f, svr test score: %.3f' % (
svr.score(x_train,y_train),
svr.score(x_test, y_test)))




svr train score 0.568, svr test score: 0.566


# Creating a decision tree model

In [20]:
#Decision tree alogorithm
dt = DecisionTreeRegressor(random_state= 0)

x_train , x_test, y_train, y_test = train_test_split(x_final, y_final, test_size= 0.33, random_state= 0)

s_scaler = StandardScaler()
x_train = s_scaler.fit_transform(x_train.astype(np.float64))
x_test = s_scaler.transform(x_test.astype(np.float64))

#fitting the model
dt = dt.fit(x_train, y_train.values.ravel())
y_train_pred = dt.predict(x_train)
y_train_pred = dt.predict(x_test)

#print score
print('dt train score %.3f, dt test score: %.3f' % (
dt.score(x_train,y_train),
dt.score(x_test, y_test)))


dt train score 0.999, dt test score: 0.717


# Performing Random Forest Regression Model

In [19]:
#Random forest regression 
forest = RandomForestRegressor(n_estimators= 100,
                              criterion= 'mse',
                              random_state= 1,
                              n_jobs= -1)

x_train , x_test, y_train, y_test = train_test_split(x_final, y_final, test_size= 0.33, random_state= 0)

s_scaler = StandardScaler()
x_train = s_scaler.fit_transform(x_train.astype(np.float64))
x_test = s_scaler.transform(x_test.astype(np.float64))

#fitting the model
forest = forest.fit(x_train, y_train.values.ravel())
y_train_pred = dt.predict(x_train)
y_train_pred = dt.predict(x_test)

#print score
print('forest regression train score %.3f,fores regression test score: %.3f' % (
dt.score(x_train,y_train),
dt.score(x_test, y_test)))


forest regression train score 0.999,fores regression test score: 0.717


# Hyper Parameterization

In [21]:
def print_best_params(gd_model):
    param_dict = gd_model.best_estimator_.get_params()
    model_str = str(gd_model.estimator).split('(')[0]
    print("\n*** {} Best Parameters ***".format(model_str))
    for k in param_dict:
        print("{}: {}".format(k, param_dict[k]))
    print()
    
#test train split
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size = 0.33, random_state = 0 )

#standard scaler (fit transform on train, fit only on test)
sc = StandardScaler()
x_train = sc.fit_transform(x_train.astype(np.float64))
x_test= sc.transform(x_test.astype(np.float64))
###Challenge 1: SVR parameter grid###

param_grid_svr = dict(kernel=[ 'linear', 'poly'],
                     degree=[2],
                     C=[600, 700, 800, 900],
                     epsilon=[0.0001, 0.00001, 0.000001])
svr = GridSearchCV(SVR(), param_grid=param_grid_svr, cv=5, verbose=3)



svr= svr.fit(x_train, y_train.values.ravel())
#print score
print('hyper parameter train score %.3f,hyperparameter regression test score: %.3f' % (
svr.score(x_train,y_train),
svr.score(x_test, y_test)))


#fit model
print_best_params(svr)
#print(svr.best_estimator_.get_params())



Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END .C=600, degree=2, epsilon=0.0001, kernel=linear; total time=   0.0s
[CV 2/5] END .C=600, degree=2, epsilon=0.0001, kernel=linear; total time=   0.0s
[CV 3/5] END .C=600, degree=2, epsilon=0.0001, kernel=linear; total time=   0.0s
[CV 4/5] END .C=600, degree=2, epsilon=0.0001, kernel=linear; total time=   0.0s
[CV 5/5] END .C=600, degree=2, epsilon=0.0001, kernel=linear; total time=   0.0s
[CV 1/5] END ...C=600, degree=2, epsilon=0.0001, kernel=poly; total time=   0.0s
[CV 2/5] END ...C=600, degree=2, epsilon=0.0001, kernel=poly; total time=   0.0s
[CV 3/5] END ...C=600, degree=2, epsilon=0.0001, kernel=poly; total time=   0.0s
[CV 4/5] END ...C=600, degree=2, epsilon=0.0001, kernel=poly; total time=   0.0s
[CV 5/5] END ...C=600, degree=2, epsilon=0.0001, kernel=poly; total time=   0.0s
[CV 1/5] END ..C=600, degree=2, epsilon=1e-05, kernel=linear; total time=   0.0s
[CV 2/5] END ..C=600, degree=2, epsilon=1e-05, 

In [24]:
#parameter grid for decision tree

param_grid_dt = dict(min_samples_leaf=np.arange(9, 13, 1, int), 
                  max_depth = np.arange(4,7,1, int),
                  min_impurity_decrease = [0, 1, 2],
                 )

dt = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid=param_grid_dt, cv=5,  verbose=3)

#fit model
dt = dt.fit(x_train,y_train.values.ravel())


#print score
print('\n\ndt train score %.3f, dt test score: %.3f' % (
dt.score(X_train,y_train),
dt.score(X_test, y_test)))
print_best_params(dt)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=9; total time=   0.0s
[CV 2/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=9; total time=   0.0s
[CV 3/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=9; total time=   0.0s
[CV 4/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=9; total time=   0.0s
[CV 5/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=9; total time=   0.0s
[CV 1/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=10; total time=   0.0s
[CV 2/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=10; total time=   0.0s
[CV 3/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=10; total time=   0.0s
[CV 4/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=10; total time=   0.0s
[CV 5/5] END max_depth=4, min_impurity_decrease=0, min_samples_leaf=10; total time=   0.0s
[CV 1/5] END max_depth=4, min_imp

NameError: name 'X_train' is not defined