# Linear Regression

In [23]:
import pandas as pd
import numpy as np

In [8]:
# Read the housing CSV and keep only the single predictor column and the
# target column, in that order.
housing = pd.read_csv('../Data/kc_house.csv').loc[:, ['sqft_living', 'price']]
# Bare last expression so the notebook renders the frame.
housing

Unnamed: 0,sqft_living,price
0,3.25,95.100
1,2.15,65.000
2,0.76,36.990
3,1.97,55.000
4,3.65,238.400
...,...,...
336,2.04,61.500
337,6.49,200.000
338,2.02,122.500
339,4.27,135.692


In [19]:
# Target: the price column as a 1-D Series.
y = housing['price']
# Predictor: living area as a 1-D Series, plus an (n_samples, 1) NumPy array
# view of it for estimators that require a 2-D feature matrix.
X = housing['sqft_living']
X_reshaped = X.to_numpy().reshape(-1, 1)

In [31]:
from sklearn.linear_model import LinearRegression

# scikit-learn expects X as a 2-D array of shape (n_samples, n_features).
# X_reshaped is the 1-D feature Series converted with X.values.reshape(-1, 1),
# i.e. an (n_samples, 1) NumPy array -- not a DataFrame.
# Fit on the same `y` target that the scoring and error-metric cells use.
linear_regression = LinearRegression().fit(X=X_reshaped, y=y)
# In-sample predictions: the model is evaluated on the data it was fitted to.
y_pred = linear_regression.predict(X_reshaped)

In [32]:
# Pull the fitted parameters off the estimator before reporting them:
# coef_ is an array with one slope per feature, intercept_ is a scalar.
fitted_coefficients = linear_regression.coef_
fitted_intercept = linear_regression.intercept_
print('Coefficient: ', fitted_coefficients)
print('Intercept: ', fitted_intercept)

Coefficient:  [50.95244905]
Intercept:  -33.982222424899746


In [33]:
# Coefficient of determination (R^2) of the fitted line on the training data.
linear_regression.score(X=X_reshaped, y=y)

0.6210689765309945

# Mean Squared Error

<img src="../Images/mean_square_error.jpeg" alt="Mean squared error formula" height="300"/>

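Mean Squared Error (MSE) is the average of the squared differences between the actual and the predicted values. In linear regression it serves as the <b>cost function</b> that the fitted line minimizes.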

In [36]:
from sklearn import metrics

# Mean of the squared differences between the observed prices and the
# model's in-sample predictions.
mse = metrics.mean_squared_error(y_true=y, y_pred=y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 2419.0341113521495


# Residual Sum of Squares (RSS)

In [30]:
# Recompute the in-sample predictions so this cell does not rely on an
# earlier cell having populated y_pred.
y_pred = linear_regression.predict(X_reshaped)
# Side-by-side frame of observed and predicted prices.
df = pd.DataFrame({'Actual': y, 'Predicted': y_pred})
# RSS = sum over all rows of (predicted - actual)^2.
residuals = df['Predicted'] - df['Actual']
rss = np.sum(np.square(residuals))
print('Residual Sum of Squares(RSS): ', rss, sep='')

Residual Sum of Squares(RSS): 824890.631971083


# Statsmodels Library

In [28]:
import statsmodels.api as sm

# statsmodels' OLS does not include an intercept on its own, so add a
# constant column to the predictor before fitting.
X_sm = sm.add_constant(X)
linear_regression_sm = sm.OLS(endog=y, exog=X_sm).fit()
# Bare last expression so the notebook renders the regression summary tables.
linear_regression_sm.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.621
Model:,OLS,Adj. R-squared:,0.62
Method:,Least Squares,F-statistic:,555.6
Date:,"Fri, 03 Feb 2023",Prob (F-statistic):,2.02e-73
Time:,16:26:02,Log-Likelihood:,-1812.2
No. Observations:,341,AIC:,3628.0
Df Residuals:,339,BIC:,3636.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-33.9822,6.956,-4.885,0.000,-47.664,-20.300
sqft_living,50.9524,2.162,23.572,0.000,46.701,55.204

0,1,2,3
Omnibus:,37.557,Durbin-Watson:,2.062
Prob(Omnibus):,0.0,Jarque-Bera (JB):,59.106
Skew:,0.695,Prob(JB):,1.46e-13
Kurtosis:,4.492,Cond. No.,9.08
