In [1]:
import pandas as pd  # pandas provides the DataFrame type and the CSV reader used below.
# Load the insurance data; the path is relative to the notebook's working directory.
dataset = pd.read_csv("insurance_pre.csv")

In [2]:
# One-hot encode the categorical columns so the regressor receives numeric inputs.
# drop_first=True keeps a single indicator per column (sex_male, smoker_yes).
# NOTE(review): this rebinds `dataset`, so re-running this cell on its own fails
# because 'sex' and 'smoker' no longer exist; re-run the load cell first.
categorical_columns = ["sex", "smoker"]
dataset = pd.get_dummies(
    data=dataset,
    columns=categorical_columns,
    drop_first=True,
)

In [3]:
# Confirm the column labels produced by the encoding step.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
# Display the encoded frame (the rendered output truncates the middle rows).
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [5]:
# Model inputs: every column except the target `charges`.
# Indexing with a list of labels selects several columns and returns a DataFrame.
feature_columns = ["age", "bmi", "children", "sex_male", "smoker_yes"]
independent = dataset[feature_columns]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,False,True
1,18,33.770,1,True,False
2,28,33.000,3,True,False
3,33,22.705,0,True,False
4,32,28.880,0,True,False
...,...,...,...,...,...
1333,50,30.970,3,True,False
1334,18,31.920,0,False,False
1335,18,36.850,0,False,False
1336,21,25.800,0,False,False


In [6]:
# Model output: the `charges` column, kept as a one-column DataFrame
# (list-of-labels selection) rather than a Series.
dependent = dataset.loc[:, ["charges"]]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [7]:
# Following the usual modelling workflow, the input and output variables are now stored separately.
# The next step is to split them into a training set and a test set.

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state=0 fixes the shuffle so the split is reproducible.
# The return order is fixed: features (train, test) first, then targets (train, test).
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [9]:
# Inspect the training features; the non-sequential index reflects the shuffled split.
X_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
1163,18,28.215,0,False,False
196,39,32.800,0,False,False
438,52,46.750,5,False,False
183,44,26.410,0,False,False
1298,33,27.455,2,True,False
...,...,...,...,...,...
763,27,26.030,0,True,False
835,42,35.970,2,True,False
1216,40,25.080,0,True,False
559,19,35.530,0,True,False


In [10]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()  # Instantiate the estimator class with its default settings.
# fit() learns one weight per input column plus a bias (intercept) from the
# training data, i.e. the parameters of y = w·x + b.
regressor.fit(X_train,y_train)



In [11]:
# Learned weights (slopes), one per input column.
weight=regressor.coef_  # coef_ is an attribute populated by fit(), not a method call.
weight

array([[  257.8006705 ,   321.06004271,   469.58113407,   -41.74825718,
        23418.6671912 ]])

In [12]:
# Learned bias (intercept) of the fitted model.
bias=regressor.intercept_
bias

array([-12057.244846])

In [13]:
# Next step: evaluate the model on the held-out test set.

In [14]:
# Predict charges for the held-out test features; y_test holds the actual values to compare against.
y_pred=regressor.predict(X_test)

In [15]:
# Score the predictions with R² (the coefficient of determination),
# comparing the actual test targets against the predicted ones.
from sklearn.metrics import r2_score

r_score = r2_score(y_true=y_test, y_pred=y_pred)

In [16]:
# R² measured on the held-out test set.
r_score

0.7894790349867009

In [17]:
# An R² close to 1 means the model explains most of the variance in the test-set charges;
# a value near 0 (or negative) indicates a poor fit.

# The next step is to save the trained model.

In [18]:
import pickle
filename="finalized_model.sav"  # Name of the file the fitted model is pickled to.

In [19]:
# Serialize the fitted regressor to disk.
# The context manager closes (and flushes) the file handle even if pickling
# fails; 'wb' opens the file for binary writing.
with open(filename, "wb") as model_file:
    pickle.dump(regressor, model_file)

In [20]:
# Reload the saved model from disk, as a deployment would.
# NOTE: only unpickle files you trust -- pickle.load can execute arbitrary code.
with open("finalized_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Try one hand-written input. The model was fitted on a DataFrame, so pass a
# frame with the same column names in the same order instead of a bare nested
# list; this makes the meaning of each value explicit and avoids scikit-learn's
# "X does not have valid feature names" warning.
# NOTE(review): this input yields a negative predicted charge; presumably age 13
# lies outside the ages seen in training, so the linear model is extrapolating
# -- confirm against the data.
user_input = pd.DataFrame(
    [{"age": 13, "bmi": 25.0, "children": 0, "sex_male": 1, "smoker_yes": 0}]
)
result = loaded_model.predict(user_input)



In [21]:
# Predicted charge returned by the reloaded model for the sample input.
result

array([[-721.08331907]])

In [22]:
# Next step is to deploy the model