In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import seaborn as sns

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Load the startup dataset; the path is resolved relative to the working directory.
csv_path = '50_Startups.csv'
df = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


## Create Dummy Variables for the Categorical Variable

In [6]:
# One-hot encode every categorical (object-dtype) column; in this dataset that
# is `State`.
# dtype=int pins the indicator columns to 0/1 integers. pandas >= 2.0 defaults
# to bool, which would turn the numeric design matrix built later with
# np.append into an object array that statsmodels cannot fit.
df = pd.get_dummies(df, dtype=int)

In [7]:
# Preview the first five rows to confirm the State_* indicator columns exist.
df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


## Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Separate the regression target from the explanatory variables.
target_column = 'Profit'
y = df[target_column]
X = df.drop(columns=[target_column])

In [10]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_options = dict(test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Create Model

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
# Ordinary least-squares regressor; the intercept is fitted (the library default,
# spelled out here for the reader).
model = LinearRegression(fit_intercept=True)

In [13]:
# Estimate the coefficients on the training split only.
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
# Predict profit for the held-out rows.
prediction = model.predict(X=X_test)

In [15]:
# Put predictions next to the true profits; the frame takes its row labels from
# y_test's index.
d = dict(Predic_Value=prediction, True_Value=y_test)
pred_df = pd.DataFrame(data=d)

In [16]:
# Compare the first five predictions with the actual values.
pred_df.head(n=5)

Unnamed: 0,Predic_Value,True_Value
13,126187.358759,134307.35
39,85719.45625,81005.76
30,99648.034932,99937.59
45,45664.955053,64926.08
17,127106.84249,125370.37


In [17]:
# Predict profit for one hypothetical startup.
# The row is labelled with the training columns so the model receives the values
# under the same feature names it was fitted on. Passing an anonymous list makes
# current scikit-learn warn about missing feature names and relies on the caller
# silently matching the column order.
new_startup = pd.DataFrame(
    [[162597.70, 91391.77, 366168.42, 0, 0, 1]],
    columns=X_train.columns,
)
model.predict(new_startup)

array([188960.24720989])

## Backward elimination (นำตัวแปรที่ไม่จำเป็นออก)
- ถ้า p-value น้อยกว่าค่า SL (Significance Level) ตัวแปรนั้นจะยังอยู่ใน Model
- ถ้า p-value มากกว่าค่า SL (Significance Level) ตัวแปรนั้นจะถูกนำออก

In [18]:
# The array-based OLS class used below lives in statsmodels.api.
# statsmodels.formula.api only exposes the formula interface (lowercase `ols`);
# its uppercase `OLS` alias was deprecated and later removed, so `sm.OLS` raises
# AttributeError on current releases when imported from there.
import statsmodels.api as sm

In [19]:
# Build the OLS design matrix: a leading column of ones (statsmodels' OLS does
# not add an intercept on its own) followed by every feature column.
# The features are re-derived from `df` so that re-running this cell cannot
# stack a second column of ones onto an already-extended X. The row count is
# read from the data instead of being hard-coded to 50.
# dtype=float keeps the matrix numeric even if the dummy columns are bool.
features = np.asarray(df.drop('Profit', axis=1), dtype=float)
X = np.append(arr=np.ones((features.shape[0], 1)), values=features, axis=1)

In [20]:
# Keep the intercept (column 0) and the first five feature columns.
# Column 6, the last dummy (State_New York), is left out, which keeps the three
# state dummies from summing to the intercept column.
X_OLS = X[:, list(range(6))]

In [21]:
# Fit ordinary least squares of Profit on the selected design-matrix columns.
ols_spec = sm.OLS(endog=y, exog=X_OLS)
model_OLS = ols_spec.fit()

In [22]:
# Show the regression table; the P>|t| column is what the elimination step reads.
model_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 31 Mar 2019",Prob (F-statistic):,1.34e-27
Time:,21:11:50,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.008e+04,6952.587,7.204,0.000,3.61e+04,6.41e+04
x1,0.8060,0.046,17.369,0.000,0.712,0.900
x2,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x3,0.0270,0.017,1.574,0.123,-0.008,0.062
x4,41.8870,3256.039,0.013,0.990,-6520.229,6604.003
x5,240.6758,3338.857,0.072,0.943,-6488.349,6969.701

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1470000.0


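### จากผลสรุปด้านบน ตัวแปรที่ P>|t| มีค่ามากกว่า 0.05 คือ x2, x3, x4, x5 ฉะนั้นเราต้องนำออก (ตามหลัก Backward elimination ควรนำออกทีละตัว เริ่มจากตัวที่ p-value สูงที่สุด แล้ว fit โมเดลใหม่ทุกครั้ง)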

In [23]:
# Backward elimination, one variable at a time: refit, find the non-intercept
# column with the largest p-value, drop it if it exceeds the significance level,
# and repeat. P-values change after every removal, so dropping several columns
# on the strength of a single fit can discard a variable that would have turned
# out significant in the smaller model.
# NOTE(review): for this dataset the loop is expected to stop with the intercept
# and R&D Spend only, i.e. the same columns as the previous hard-coded
# X[:, [0, 1]] -- confirm on re-run.
SIGNIFICANCE_LEVEL = 0.05
kept_columns = [0, 1, 2, 3, 4, 5]  # column 0 is the intercept and is never dropped
while True:
    X_OLS = X[:, kept_columns]
    model_OLS = sm.OLS(endog=y, exog=X_OLS).fit()
    if len(kept_columns) == 1:
        break  # only the intercept is left
    p_values = np.asarray(model_OLS.pvalues)
    worst = 1 + int(np.argmax(p_values[1:]))  # offset by 1 to skip the intercept
    if p_values[worst] <= SIGNIFICANCE_LEVEL:
        break  # every remaining variable is significant
    del kept_columns[worst]
model_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 31 Mar 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,21:11:50,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
