In [None]:
medical_charges_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'

In [None]:
medical_charges_url

In [None]:
# Download the data with urllib.request, but only when it is not already on disk,
# so that "Restart & Run All" does not need the network on every run.
import os
from urllib.request import urlretrieve

if not os.path.exists('medical.csv'):
  urlretrieve(medical_charges_url, 'medical.csv')

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Data Analysis

In [None]:
df = pd.read_csv('medical.csv')
df.head()

In [None]:
df.info()

In [None]:
df.age.min(), df.age.max()

In [None]:
df.sex.value_counts()

In [None]:
df.bmi.min(), df.bmi.max()

In [None]:
df.children.value_counts()

In [None]:
df.smoker.value_counts()

In [None]:
df.region.value_counts()
# we have 4 regions

In [None]:
df.charges.min(), df.charges.max()
# smallest and largest medical charges in the dataset

In [None]:
df.describe()

In [None]:
# setting the style of seaborn and matplot

# sns.set_style('darkgrid')
# matplotlib.rcParams['font.size'] = 14
# matplotlib.rcParams['figure.figsize'] = (10,6)
# matplotlib.rcParams['figure.facecolor'] = '#00000000'


In [None]:
sns.set(font_scale=0.6)

In [None]:
sns.countplot(df, x='age')
# distribution of age: number of rows for each age value

- The distribution of age is roughly uniform, except for ages 18 and 19
- One possible reason is that people become eligible to apply for insurance at the age of 18

In [None]:
sns.histplot(df.bmi)

- BMI is normally distributed
- Most of the people are overweight

In [None]:
sns.histplot(df, x='charges', hue='smoker')

- Most people spend less than 15k
- Smokers have higher medical expenses

In [None]:
sns.barplot(df, y='charges', x='region')

In [None]:
sns.barplot(df, x='sex', y='charges')

In [None]:
sns.barplot(df, x='children', y='charges')

- People with 5 children are spending less
- This may be because of their higher regular expenses

In [None]:
smoker_counts_by_sex = df.groupby(['sex', 'smoker']).size()
smoker_counts_by_sex

In [None]:
sns.catplot(df, x='smoker', hue='sex', kind='count')

- About 20% of the people are smokers
- There are relatively more male smokers than female smokers

In [None]:
sns.scatterplot(df, x='age', y='charges', hue='smoker')

- There are 3 clusters --> nearly linear
- Non smoker having data in lower two sections and smoker having data in upper two sections

- Smokers consistently split into two groups; this may be due to their smoking habits, as some smoke occasionally and some regularly

In [None]:
sns.scatterplot(df, x='bmi', y='charges', hue='smoker')

- Here we can also see two clusters of smokers one is bmi < 30 and other is bmi > 30
- Smoking combined with obesity leads more medical expenses

In [None]:
sns.violinplot(df, x='children', y='charges')

In [None]:
df.charges.corr(df.age)

In [None]:
df.charges.corr(df.bmi)

In [None]:
df.charges.corr(df.children)

In [None]:
df.smoker.head()

In [None]:
smoker_dict = {'yes': 1, 'no': 0}
smoker_map = df.smoker.map(smoker_dict)
smoker_map.head()

- The correlation computed in the next cell is the highest so far --> a strong positive correlation

In [None]:
df.charges.corr(smoker_map)
# Strongly correlated

In [None]:
df_corr = df.select_dtypes(include=['number'])
df_corr.corr()

In [None]:
sns.heatmap(df_corr.corr())

### Linear Regression using a Single Feature

In [None]:
ns_df = df[df.smoker == 'no']
ns_df.head()

> Charges are strongly correlated with age

In [None]:
sns.scatterplot(ns_df, x='age', y='charges')

In [None]:
sns.regplot(ns_df, x='age', y='charges')

In [None]:
sns.lmplot(df, x='age', y='charges', hue='smoker')

- For linear regression between 'age' and 'charges' we have the relation:
1.  y = m * x + c
2.  charges = w * age + b

- In machine learning we call ( slope --> weight ) and ( intercept --> bias )

In [None]:
def es_charges(age, w, b):
  """Estimate charges with the linear model `charges = w * age + b`.

  age: a single age or an array/Series of ages.
  w: weight (slope) applied to age.
  b: bias (intercept) added to every estimate.
  Returns the estimate(s), with the same shape as `age`.
  """
  weighted_age = w * age
  return weighted_age + b

In [None]:
w=50
b=100
es_charges(40, w, b)
# wrong estimation

In [None]:
ages = ns_df.age
est_charges = es_charges(ages, w, b)
est_charges
# way far from the original values

In [None]:
sns.lineplot(ns_df, x='age', y='charges')

In [None]:
sns.lineplot(x=ages, y=est_charges)

In [None]:
sns.lineplot(ns_df, x='age', y='charges')
sns.lineplot(x=ages, y=est_charges)

In [None]:
def ck_data(w, b):
  """Plot non-smoker charges against age together with the line `w * age + b`.

  w: weight (slope) of the candidate line.
  b: bias (intercept) of the candidate line.
  Draws both curves on the current axes; returns nothing.
  """
  fitted = es_charges(ns_df.age, w, b)

  # actual data first, then the candidate line on the same axes
  sns.lineplot(ns_df, x='age', y='charges')
  sns.lineplot(x=ns_df.age, y=fitted)

In [None]:
ck_data(250, -1600)

#### Error

In [None]:
def rmse(target, prediction):
  """Root mean squared error between `target` and `prediction`.

  Both arguments are arrays/Series of the same length. The result is in
  the same units as the target, so it reads as a typical error size.
  """
  residuals = target - prediction
  mean_squared = np.mean(residuals ** 2)
  return np.sqrt(mean_squared)

def mse(target, prediction):
  """Mean squared error between `target` and `prediction`.

  Both arguments are arrays/Series of the same length.
  """
  residuals = target - prediction
  return np.mean(residuals ** 2)

In [None]:
w=50
b=100
ck_data(w, b)

In [None]:
target = ns_df.charges
prediction = es_charges(ns_df.age, w, b)
rmse(target, prediction)    # RMSE is the loss: the larger it is, the worse the line fits
# it is in the same units as charges, so it reads as the typical size of a prediction error

In [None]:
def calc_loss(w, b, data=None):
  """Plot the line `w * age + b` over the data and print its RMSE.

  w: weight (slope) of the candidate line.
  b: bias (intercept) of the candidate line.
  data: DataFrame with `age` and `charges` columns; defaults to the
    non-smoker frame `ns_df`, so existing `calc_loss(w, b)` calls behave
    as before. Pass another frame (e.g. smokers) to reuse the function.
  Prints the RMSE; returns nothing.
  """
  if data is None:
    data = ns_df

  target = data.charges
  prediction = es_charges(data.age, w, b)

  # actual data first, then the candidate line on the same axes
  sns.lineplot(data, x='age', y='charges')
  sns.lineplot(x=data.age, y=prediction)

  print(rmse(target, prediction))

In [None]:
calc_loss(250, -1400)

### Optimizer

Now we have to modify w and b to reduce the loss and improve the fit of the line to the data

- Ordinary Least Squares (better for smaller datasets)
- Stochastic Gradient Descent (better for larger datasets)

Ordinary Least Squares computes w and b directly, while  
Gradient Descent uses an iterative approach: it starts with a random w and b and then gradually improves them using derivatives

### Linear Regression using Sklearn

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Creating model object

model = LinearRegression()

Every model has a `fit` method, which finds the best-fit line for the given inputs and targets

In [None]:
help(model.fit)

In [None]:
# input should be a 2D array --> DataFrame not Series
inputs = ns_df[['age']]
# target can be 1D
targets = ns_df.charges
inputs.shape, targets.shape

In [None]:
model.fit(inputs, targets)

In [None]:
# The model was fitted on a DataFrame with an 'age' column, so predict on a DataFrame
# with the same column name; a bare array triggers sklearn's feature-name warning.
model.predict(pd.DataFrame({'age': [18, 28, 33]}))

In [None]:
ns_df.head(3)

In [None]:
predictions = model.predict(inputs)
predictions

In [None]:
# Compare against `targets` (the series the model was fitted on), not the `target`
# variable left over from the manual-estimation section above.
rmse(targets, predictions)
# mse(targets, predictions)
# the model is off by about 4662 on average

In [None]:
# coef_ contains the weights (one per input column) and intercept_ contains the bias
model.coef_, model.intercept_

> SGDRegressor

In [None]:
from sklearn.linear_model import SGDRegressor
# SGD shuffles the training data randomly, so without a fixed seed every run gives
# different coefficients; fixing random_state makes the notebook reproducible.
model = SGDRegressor(random_state=42)

In [None]:
model.fit(inputs, targets)

In [None]:
model.coef_, model.intercept_

In [None]:
predictions = model.predict(inputs)
predictions

In [None]:
rmse(targets, predictions)

- SGDRegressor shuffles the data randomly, so its results differ from run to run unless `random_state` is fixed

## Repeat all over with smoker data

In [None]:
s_df = df[df.smoker == 'yes']
s_df.head()

In [None]:
inputs = s_df[['age']]
targets = s_df.charges

In [None]:
model = LinearRegression()

In [None]:
model.fit(inputs, targets)

In [None]:
predictions = model.predict(inputs)
predictions

In [None]:
rmse(targets, predictions)

In [None]:
def calc_loss_s(w, b):
  """Plot the line `w * age + b` over the smoker data and print its RMSE.

  w: weight (slope) of the candidate line.
  b: bias (intercept) of the candidate line.
  Prints the RMSE against smoker charges; returns nothing.
  """
  smoker_ages = s_df.age
  fitted = es_charges(smoker_ages, w, b)

  # actual smoker data first, then the candidate line on the same axes
  sns.lineplot(s_df, x='age', y='charges')
  sns.lineplot(x=smoker_ages, y=fitted)

  print(rmse(s_df.charges, fitted))

In [None]:
# coef_ is a 1-element array (one weight per input column); pass the scalar slope
# explicitly instead of relying on broadcasting inside es_charges.
calc_loss_s(model.coef_[0], model.intercept_)

## Machine Learning

Every machine learning problem has three components:
1. Model
2. Cost Function
3. Optimizer

In [None]:
# Create inputs and targets
inputs, targets = ns_df[['age']], ns_df.charges

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
rmse(targets, predictions)

### Linear Regression using Multiple Features

charges = w1 * age + w2 * bmi + b

In [None]:
inputs = ns_df[['age', 'bmi']]
targets = ns_df.charges

model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)

rmse(targets, predictions)

- Adding BMI has almost no impact on the error
- There is no strong relationship between BMI and charges for non-smokers

charges = w * bmi + b

In [None]:
inputs, targets = ns_df[['bmi']], ns_df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- When we estimate charges using BMI instead of age, the error increases from about 4600 to about 6000, a jump of almost 30%

charges = w1 * age + w2 * bmi + w3 * children + b

In [None]:
ns_df.charges.corr(ns_df.children)

In [None]:
sns.swarmplot(ns_df, x='children', y='charges', size=1.2)

In [None]:
sns.stripplot(ns_df, x='children', y='charges', size=2)

In [None]:
inputs, targets = ns_df[['age', 'bmi', 'children']], ns_df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- Slight reduction, but not that useful

In [None]:
# For smokers

inputs, targets = s_df[['age', 'bmi']], s_df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- There is an almost 50% reduction in error when using age along with BMI for smokers

In [None]:
inputs, targets = s_df[['age', 'bmi', 'children']], s_df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- No significant further reduction from using children along with age and BMI

> Performing Linear Regression on whole dataset

In [None]:
df.head()

In [None]:
inputs, targets = df[['age']], df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

In [None]:
inputs, targets = df[['age', 'bmi']], df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

In [None]:
inputs, targets = df[['age', 'bmi', 'children']], df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- When estimating charges for the whole dataset using age, BMI and children, the error is very large
- Even combinations of these features do not make the fit better
- This is due to the difference in charges between smokers and non-smokers

In [None]:
sns.scatterplot(df, x='age', y='charges', hue='smoker')

### Now we perform Linear Regression on categorical Data

1. If a categorical column has just two categories --> label them with 0's and 1's
2. If a categorical column has more than two categories --> use one-hot encoding
3. If a categorical column has a natural order ( like cold, neutral, warm, hot ) --> map it to numbers that preserve the order ( like 1, 2, 3, 4 for cold, neutral, warm, hot respectively )

#### Binary Categories

In [None]:
sns.countplot(df, x='smoker')

In [None]:
smoker_code = {'yes': 1, 'no': 0}
df['smoker_code'] = df.smoker.map(smoker_code)

In [None]:
df.head()

In [None]:
df.charges.corr(df.smoker_code)

charges = w1 * age + w2 * bmi + w3 * children + w4 * smoker + b

In [None]:
inputs, targets = df[['age', 'bmi', 'children', 'smoker_code']], df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- The error has been reduced significantly after introducing smoker as a numerical value --> almost 45% reduction

> Never ignore categorical data it may be a deciding factor

In [None]:
sex_code = {'male': 1, 'female': 0}
df['sex_code'] = df.sex.map(sex_code)
df.head()

In [None]:
df.charges.corr(df.sex_code)
# Almost no correlation

In [None]:
inputs, targets = df[['age', 'bmi', 'children', 'smoker_code','sex_code']], df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- Sex has negligible effect on charges

In [None]:
df.region.value_counts()

In [None]:
from sklearn import preprocessing
encoder = preprocessing.OneHotEncoder()
encoder.fit(df[['region']])
encoder.categories_

In [None]:
# The encoder was fitted on a DataFrame with a 'region' column, so transform a DataFrame
# with the same column name; a bare list triggers sklearn's feature-name warning.
encoder.transform(pd.DataFrame({'region': ['northeast', 'southeast']})).toarray()
# each row is one-hot encoded following the order of encoder.categories_

In [None]:
one_hot = encoder.transform(df[['region']]).toarray()
one_hot

In [None]:
# The column names are hard-coded, so they must be listed in the same order as encoder.categories_.
# NOTE(review): 'ssouthwest' looks like a typo for 'southwest'; renaming it means updating
# every later cell that selects this column.
df[['northeast', 'northwest', 'southeast', 'ssouthwest']] = one_hot

In [None]:
df.head()

charges = w1 * age + w2 * bmi + w3 * children + w4 * smoker + w5 * sex + w6 * northeast + w7 * northwest + w8 * southeast + w9 * southwest + b

In [None]:
inputs, targets = df[['age', 'bmi', 'children', 'smoker_code','sex_code', 'northeast', 'northwest', 'southeast', 'ssouthwest']], df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- Slight reduction in error after introducing region into figure
- Age and Smoker are doing much of the predictions

In [None]:
# Only for smoker using region after one-hot encoding

sm_df = df[df.smoker == 'yes']
inputs, targets = sm_df[['age', 'bmi', 'children', 'smoker_code','sex_code', 'northeast', 'northwest', 'southeast', 'ssouthwest']], sm_df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- Slight reduction after introducing region into smoker dataframe

In [None]:
nsm_df = df[df.smoker == 'no']
inputs, targets = nsm_df[['age', 'bmi', 'children', 'smoker_code','sex_code', 'northeast', 'northwest', 'southeast', 'ssouthwest']], nsm_df.charges
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

- Slight reduction in non smoker error after introducing region into figure

> Sometimes several regression models (one per group) work better than a single one

In [None]:
# Reuse the training column names (`inputs` is the DataFrame the model was fitted on);
# predicting on a bare list triggers sklearn's feature-name warning.
model.predict(pd.DataFrame([[28, 30, 2, 0, 0, 0, 1, 0, 0]], columns=inputs.columns))

### Model Improvement

#### Feature Scaling

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
input_cols = ['age', 'bmi', 'children', 'smoker_code','sex_code', 'northeast', 'northwest', 'southeast', 'ssouthwest']
inputs, targets = df[input_cols], df.charges

model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

In [None]:
# Label the values with the training column names so each one lines up with its feature;
# predicting on a bare list triggers sklearn's feature-name warning.
model.predict(pd.DataFrame([[28, 30, 2, 1, 0, 0, 1, 0, 0]], columns=input_cols))

In [None]:
# One row per input column plus a final row for the intercept (bias).
weights_df = pd.DataFrame({
  'features': input_cols + ['intercept'],
  'weight': np.append(model.coef_, model.intercept_)
})
weights_df

- The weights are on very different scales across age, bmi, children, sex_code, and the regions
- BMI has a larger weight than age, and similar mismatches appear for other features
- The features have different ranges, so the weights are not directly comparable
- So we can't say which feature is more important and which one is less

> So we will standardize the features

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
numeric_cols = ['age', 'bmi', 'children']
scaler = StandardScaler()
scaler.fit(df[numeric_cols])

In [None]:
scaler.mean_, scaler.var_

In [None]:
scaled_inputs = scaler.transform(df[numeric_cols])
scaled_inputs
# After standardization each column has mean 0 and unit variance;
# the values are centred on 0 but are not limited to the range -1 to 1

In [None]:
cat_cols = ['smoker_code','sex_code', 'northeast', 'northwest', 'southeast', 'ssouthwest']
cat_data = df[cat_cols].values
cat_data

In [None]:
inputs = np.concatenate((scaled_inputs, cat_data), axis=1)
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
rmse(targets, predictions)

In [None]:
inputs[0]

In [None]:
# The scaled inputs keep the column order of input_cols (numeric columns first, then
# the categorical ones), so the coefficients line up with these names.
# The final row is the intercept (bias).
weights_df = pd.DataFrame({
  'features': input_cols + ['intercept'],
  'weight': np.append(model.coef_, model.intercept_)
})
weights_df

In [None]:
new_customer = [[28, 30, 2, 1, 0, 0, 1, 0, 0]]
# Only the numeric features (age, bmi, children) were standardized, so scale just those.
# Taking them from new_customer avoids retyping the values, and the DataFrame with the
# training column names avoids sklearn's feature-name warning.
new_customer_numeric = pd.DataFrame([new_customer[0][:len(numeric_cols)]], columns=numeric_cols)
scaler.transform(new_customer_numeric)

In [None]:
model.predict([[-0.79795355, -0.10882659,  0.75107928, 1, 0, 0, 1, 0, 0]])

- So most important features are:
1. Smoker
2. Age
3. BMI