In [1]:
# Code for Simple Linear Regression

In [2]:
import statsmodels.api as sm
from sklearn import datasets

data = datasets.load_boston()

In [3]:
print (data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# Pandas and NumPy import
import numpy as np
import pandas as pd

# Set the features  
df = pd.DataFrame(data.data, columns=data.feature_names)

# Set the target
target = pd.DataFrame(data.target, columns=["MEDV"])

In [5]:
X = df["RM"]
y = target["MEDV"]

# Fit and make the predictions by the model
model = sm.OLS(y, X).fit()
predictions = model.predict(X)

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared (uncentered):,0.901
Model:,OLS,Adj. R-squared (uncentered):,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Tue, 08 Oct 2019",Prob (F-statistic):,3.7399999999999996e-256
Time:,20:49:37,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [6]:
from sklearn import linear_model
X = df["RM"]
y = target["MEDV"]

m=pd.concat([X,y],axis=1)

m.head()


Unnamed: 0,RM,MEDV
0,6.575,24.0
1,6.421,21.6
2,7.185,34.7
3,6.998,33.4
4,7.147,36.2


In [7]:
lm = linear_model.LinearRegression()
feature_cols = ['RM']
p = m[feature_cols]
q = m.MEDV
model = lm.fit(p,q)

In [8]:
lm.score(p,q)

0.4835254559913343

In [9]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Split data
p_train, p_test, q_train, q_test = train_test_split(p, q, random_state=1)

# Instantiate model
lm2 = linear_model.LinearRegression()

# Fit Model
lm2.fit(p_train, q_train)

# Predict
q_pred = lm2.predict(p_test)

# MSE
print(metrics.mean_squared_error(q_test, q_pred))

# RMSE
print(np.sqrt(metrics.mean_squared_error(q_test, q_pred)))

39.183640132173
6.2596837086368025
