In [28]:
# 1- Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [29]:
# 2- Load the insurance dataset (the path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='insurance.csv')
# Preview the first five rows to confirm the columns were parsed as expected
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [30]:
# 3- Convert the categorical 'sex' column into integer category codes
# (0 for females and 1 for males)
df['sex'] = df['sex'].astype('category').cat.codes
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [31]:
# Encode the remaining text columns ('smoker', 'region') as integer category codes
df['smoker'] = df['smoker'].astype('category').cat.codes
df['region'] = df['region'].astype('category').cat.codes
# NOTE(review): 'region' looks nominal, yet integer codes hand the linear model an
# ordering between regions -- confirm whether one-hot encoding was intended.
# Show the first rows followed by the column dtypes, both as plain text
print(df.head(), df.dtypes, sep='\n')

   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520
age           int64
sex            int8
bmi         float64
children      int64
smoker         int8
region         int8
charges     float64
dtype: object


In [32]:
# 4- Drop rows with NaNs and duplicates
# First collect every row that has at least one missing value so it can be inspected
nan = df[df.isna().any(axis=1)]
print(nan.shape)
print(nan.head())
# Then actually remove those rows, as the step title promises; when no row has a
# missing value (the case for the current data) this leaves the contents of df unchanged.
df = df.dropna()
# NOTE(review): the text columns were already replaced by .cat.codes in earlier cells,
# and pandas encodes a missing category as -1 rather than NaN, so gaps in those columns
# would not be caught here -- consider running this check before the encoding cells.

(0, 7)
Empty DataFrame
Columns: [age, sex, bmi, children, smoker, region, charges]
Index: []


In [33]:
# Alternative check: count the missing cells in each column
# (a total of 0 for a column means it has no missing values)
df.isna().sum(axis=0)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [34]:
# Report the frame's shape on either side of the de-duplication so the number of
# removed rows is visible
print('Size before removing duplicates:', df.shape)
# Fully identical rows are collapsed; the first occurrence of each is the one kept
df = df.drop_duplicates(keep='first')
print('Size after removing duplicates:', df.shape)

Size before removing duplicates: (1338, 7)
Size after removing duplicates: (1337, 7)


In [35]:
# 5- Order the rows from the highest to the lowest charge; the two ends of the
# displayed frame then expose the extremes (negative values or any other issues)
df = df.sort_values('charges', ascending=False)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
543,54,0,47.410,0,1,2,63770.42801
1300,45,1,30.360,0,1,2,62592.87309
1230,52,1,34.485,3,1,1,60021.39897
577,31,0,38.095,1,1,0,58571.07448
819,33,0,35.530,0,1,1,55135.40209
...,...,...,...,...,...,...,...
22,18,1,34.100,0,0,2,1137.01100
663,18,1,33.660,0,0,2,1136.39940
1244,18,1,33.330,0,0,2,1135.94070
808,18,1,30.140,0,0,2,1131.50660


In [36]:
# 6- Separate the target (y: the 'charges' column) from the features
# (x: every remaining column)
y = df['charges']
x = df.drop('charges', axis='columns')

In [37]:
# 7- Hold out 30% of the rows for testing and keep the rest for training
# Note: passing a fixed integer as random_state makes the split reproducible, so
# every run yields exactly the same training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)
# 8- use the training portion of the data to train the linear regression model

In [38]:
# Create a linear regression model with default settings and fit it on the training split
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

In [39]:
# Pull the learned parameters out of the fitted model:
# b is the intercept, w holds one weight per feature column
w = lr.coef_
b = lr.intercept_
print(b, w, sep='\n')
# Manual prediction y = X.w + b; w is turned into a column vector first, so y_pred
# is a 2-D array with a single column (one row per test sample)
y_pred = np.dot(x_test, w.reshape(-1, 1)) + b

-11137.530573923525
[  256.04500064  -169.3846879    325.8963272    362.17641192
 23680.67429101  -518.73722406]
