## scikit-learn (sklearn) is a large library of machine learning models, also widely used in AI work

In [1]:
import pandas as pd
import seaborn as sns
from sklearn import linear_model

In [5]:
# Load seaborn's bundled "tips" example dataset into a DataFrame.
tips = sns.load_dataset("tips")

# Preview the first five rows.
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### Create and fit linear regression model

In [7]:
# Create a LinearRegression estimator from sklearn.linear_model; it is fitted in a later cell.
linear_reg = linear_model.LinearRegression()

In [8]:
# X: the predictors (independent variables) -- here the total_bill and size columns.
# y: the response (dependent variable) -- a single vector, here the tip column.
linear_reg.fit(tips.loc[:, ['total_bill', 'size']], tips['tip'])

LinearRegression()

### Look at the coefficients

In [11]:
linear_reg.coef_

# Coefficients are listed in the order of the columns passed to fit: [total_bill, size].
# Holding size fixed, every extra 1 dollar of total_bill increases the predicted tip by about 9 cents.
# Holding total_bill fixed, every extra person in size increases the predicted tip by about 19 cents.

array([0.09271334, 0.19259779])

### Look at the intercept (not often needed)

In [12]:
linear_reg.intercept_

# The intercept is the predicted tip when total_bill = 0 and size = 0 (no one shows up):
# about 67 cents. This is an extrapolation, so it is rarely meaningful on its own.

0.6689447408125022

### Machine learning models can only work with numeric values, so a categorical column such as 'sex' may raise an error if it is passed to the model as-is


In [14]:
# So we want to convert the categorical columns into numerical ones.

# We can use:
# - dummy encoding
# - one-hot encoding

In [15]:
# One-hot encode every categorical column (sex, smoker, day, time) into indicator
# columns; the numeric columns (total_bill, tip, size) pass through unchanged.
# dtype=int keeps the indicators as 0/1 integers -- pandas >= 2.0 would otherwise
# return True/False booleans by default, so "everything is a number" would no
# longer be what the output shows.
pd.get_dummies(tips, dtype=int)

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,3,1,0,0,1,0,0,0,1,0,1
2,21.01,3.50,3,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,2,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,0,1,0,0,1,0,0,1
240,27.18,2.00,2,0,1,1,0,0,0,1,0,0,1
241,22.67,2.00,2,1,0,1,0,0,0,1,0,0,1
242,17.82,1.75,2,1,0,0,1,0,0,1,0,0,1


## We would like to drop columns that carry redundant information (e.g. sex_Male and sex_Female: knowing one already tells us the other)

In [18]:
# Encode the categorical columns as indicators, dropping the first level of each
# one (drop_first=True) so that no indicator column is redundant with the others.
# dtype=int keeps the indicators as 0/1 integers -- pandas >= 2.0 would otherwise
# return True/False booleans by default.
tips_dummy = pd.get_dummies(tips, drop_first=True, dtype=int)

# Preview the first rows of the encoded frame.
tips_dummy.head()

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.5,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1


### Now we want to fit a regression of tip on the columns of tips_dummy from the 3rd column (size) through the last one (time_Dinner)

In [19]:
# Start from a fresh, unfitted estimator (this rebinds the name linear_reg).
linear_reg = linear_model.LinearRegression()

# Predictors: every column of tips_dummy from position 2 onward, i.e. skipping the
# first two columns. Response: the tip column.
linear_reg.fit(tips_dummy.iloc[:, 2:], tips_dummy['tip'])

LinearRegression()

In [20]:
# Coefficients follow the order of the predictor columns the model was fitted on
# (tips_dummy.iloc[:, 2:]): size, sex_Female, smoker_No, day_Fri, day_Sat,
# day_Sun, time_Dinner.
linear_reg.coef_

array([ 0.71001644, -0.10057881, -0.20916402, -0.20180568, -0.36603136,
       -0.29452609,  0.48575489])

### Interpret the time_Dinner column

In [None]:
# Assuming everything else is the same, a dinner-time meal increases the predicted tip by about 49 cents compared with lunch