In [1]:
# Step 1: Importing the important libraries
"""
1. NumPy is a general-purpose array-processing and scientific computing package.
2. Pandas is an open-source package widely used for data science, data analysis and machine learning tasks.
3. Matplotlib is a cross-platform data visualization and graphical plotting library for Python, built on its numerical extension NumPy.
"""
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Step 2: Create the raw dataset as a dict of equal-length column lists
# (one key per variable); the missing 'badrooms' entry is imputed in a later step.
# NOTE(review): 'badrooms' looks like a typo for 'bedrooms'; the key is kept
# unchanged because later cells reference the column by this exact name.
raw_data = {'area': [2600, 3000, 3200, 3600, 4000],
            # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
            'badrooms': [3, 4, np.nan, 3, 5],
            'age': [20, 15, 18, 30, 8],
            'price': [550000, 565000, 610000, 595000, 760000]}

In [3]:
# Step 2.1: Build a pandas DataFrame from the raw dict, stating the column order explicitly
df = pd.DataFrame(
    data=raw_data,
    columns=['area', 'badrooms', 'age', 'price'],
)

In [4]:
# Step 2.1: Exploration of the dataset
# Display the whole DataFrame (it has only 5 rows, so no .head() is needed).
df

Unnamed: 0,area,badrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [5]:
# Step 2.1: Exploration of the dataset
# List the column labels of the DataFrame
df.columns

Index(['area', 'badrooms', 'age', 'price'], dtype='object')

In [6]:
# Step 2.1: Exploration of the dataset
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# describe() counts only non-null values, which is why 'badrooms' reports 4 rather than 5.
df.describe()

Unnamed: 0,area,badrooms,age,price
count,5.0,4.0,5.0,5.0
mean,3280.0,3.75,18.2,616000.0
std,540.370243,0.957427,8.01249,83919.604384
min,2600.0,3.0,8.0,550000.0
25%,3000.0,3.0,15.0,565000.0
50%,3200.0,3.5,18.0,595000.0
75%,3600.0,4.25,20.0,610000.0
max,4000.0,5.0,30.0,760000.0


In [7]:
# Step 3: The 'badrooms' column contains a NaN value. Compute the column median
# (Series.median() skips NaN by default) and round it down with math.floor so
# the replacement value is a whole number.
# `math` is imported in the notebook's top import cell.
med_bedroom = math.floor(df.badrooms.median())
med_bedroom

3

In [8]:
# Step 4: Fill the NaN entries of 'badrooms' with the floored median computed in Step 3
df['badrooms'] = df['badrooms'].fillna(med_bedroom)

In [9]:
# Step 4.1: Re-explore the dataset to confirm the NaN in 'badrooms' has been filled
df

Unnamed: 0,area,badrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,3.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [10]:
# Step 5: Separate the dataset into the independent variables (x) and the dependent variable (y)
x = df.loc[:, ['area', 'badrooms', 'age']]
y = df['price']

In [11]:
# Inspect the split. The blank-line separator keeps the first row of y from
# being printed on the same line as the last row of x (print's default
# separator is a single space).
print(x, y, sep='\n\n')

   area  badrooms  age
0  2600       3.0   20
1  3000       4.0   15
2  3200       3.0   18
3  3600       3.0   30
4  4000       5.0    8 0    550000
1    565000
2    610000
3    595000
4    760000
Name: price, dtype: int64


In [12]:
# Step 5: Initializing the Linear Regression
# Create an unfitted LinearRegression estimator with its default settings.
# NOTE(review): the "Step 5" label is also used by the x/y split cell above — consider renumbering.
# NOTE(review): notebook convention is to keep imports in the first cell; consider moving this one.
from sklearn import linear_model
reg = linear_model.LinearRegression()

In [13]:
# Step 6: Fit the linear regression on the features x (area, badrooms, age) and the target y (price).
# fit() returns the estimator itself, which is why the cell echoes LinearRegression().
reg.fit(x,y)

LinearRegression()

In [14]:
# Step 7: Inspect the fitted parameters of the regression:
# coef_ holds one slope per feature, in the column order of x (area, badrooms, age),
# and intercept_ is the constant term.
reg.coef_, reg.intercept_

(array([   137.25, -26025.  ,  -6825.  ]), 383724.99999999994)

In [15]:
# Step 8: Predict the price for one new observation (area=3000, badrooms=3, age=40).
# The input is wrapped in a DataFrame that reuses the training column labels, so
# the features carry the same names the model was fitted with; a bare nested list
# makes scikit-learn >= 1.0 warn that X does not have valid feature names.
reg.predict(pd.DataFrame([[3000, 3, 40]], columns=x.columns))

array([444400.])

In [16]:
# Step 8 (continued): Predict the price for a second observation
# (area=2400, badrooms=2, age=56). All three values lie outside the ranges seen
# in training (area 2600-4000, badrooms 3-5, age 8-30), so this is an extrapolation.
# The input is a DataFrame with the training column labels so that scikit-learn
# >= 1.0 does not warn about missing feature names.
reg.predict(pd.DataFrame([[2400, 2, 56]], columns=x.columns))

array([278875.])

In [17]:
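# This notebook is an example of multiple linear regression. Linear regression is a linear
# approach to modelling the relationship between a scalar response (the dependent variable)
# and one or more explanatory variables (the independent variables); with more than one
# explanatory variable, as here (area, badrooms, age), it is called multiple linear regression.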