#### Import pandas library

In [1]:
import pandas as pd 

#### load dataset

In [2]:
# Read the house-price CSV into a DataFrame and display it.
# NOTE(review): the path is absolute, so the file must exist at /Dataset on the
# machine that runs this notebook.
df = pd.read_csv(filepath_or_buffer='/Dataset/house_price.csv')
df

Unnamed: 0,Zone,Area(sq-ft),Price(Taka)
0,Dhaka,2104,399900
1,Dhaka,1600,329900
2,Dhaka,2400,369000
3,Dhaka,1416,232000
4,Dhaka,3000,539900
5,Chittagong,1985,299900
6,Chittagong,1534,314900
7,Chittagong,1427,198999
8,Chittagong,1380,212000
9,Chittagong,1494,242500


In [3]:
# Show the dtype of every column to spot non-numeric features.
df.dtypes

Zone           object
Area(sq-ft)     int64
Price(Taka)     int64
dtype: object

In [4]:
# List the distinct zone names present in the Zone column.
df['Zone'].unique()

array(['Dhaka', 'Chittagong', 'Khulna'], dtype=object)

The Zone column holds object-type (string) data, so it must be converted to numbers before it can be used in a regression.

#### From categorical variables to dummy variables via pandas

In [5]:
# One-hot encode Zone: one indicator column per distinct zone name.
# dtype=int keeps the indicators as 0/1 integers on every pandas version;
# pandas >= 2.0 would otherwise return True/False booleans.
dummies = pd.get_dummies(df['Zone'], dtype=int)
dummies

Unnamed: 0,Chittagong,Dhaka,Khulna
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,1,0,0


#### Join df and dummies side by side using the pandas concat function

In [6]:
# Place the indicator columns to the right of the original columns;
# rows are aligned on the shared index.
parts = [df, dummies]
df_dummies = pd.concat(parts, axis=1)
df_dummies

Unnamed: 0,Zone,Area(sq-ft),Price(Taka),Chittagong,Dhaka,Khulna
0,Dhaka,2104,399900,0,1,0
1,Dhaka,1600,329900,0,1,0
2,Dhaka,2400,369000,0,1,0
3,Dhaka,1416,232000,0,1,0
4,Dhaka,3000,539900,0,1,0
5,Chittagong,1985,299900,1,0,0
6,Chittagong,1534,314900,1,0,0
7,Chittagong,1427,198999,1,0,0
8,Chittagong,1380,212000,1,0,0
9,Chittagong,1494,242500,1,0,0


#### The Zone column is no longer needed, so drop it

In [7]:
# Remove the original text column now that it is encoded as indicator columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op, not a KeyError.
df_dummies = df_dummies.drop(columns='Zone', errors='ignore')
df_dummies

Unnamed: 0,Area(sq-ft),Price(Taka),Chittagong,Dhaka,Khulna
0,2104,399900,0,1,0
1,1600,329900,0,1,0
2,2400,369000,0,1,0
3,1416,232000,0,1,0
4,3000,539900,0,1,0
5,1985,299900,1,0,0
6,1534,314900,1,0,0
7,1427,198999,1,0,0
8,1380,212000,1,0,0
9,1494,242500,1,0,0


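#### Now we drop the rightmost column, Khulna. It is redundant: if Chittagong = 0 and Dhaka = 0, the zone must be Khulna. Keeping all three indicator columns would make them perfectly collinear with the intercept (the "dummy variable trap"), so dropping one allows the linear regression to work properly.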

In [8]:
# Drop one indicator column so Khulna becomes the baseline zone
# (Chittagong = 0 and Dhaka = 0 means Khulna).
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op, not a KeyError.
df_dummies = df_dummies.drop(columns='Khulna', errors='ignore')
df_dummies

Unnamed: 0,Area(sq-ft),Price(Taka),Chittagong,Dhaka
0,2104,399900,0,1
1,1600,329900,0,1
2,2400,369000,0,1
3,1416,232000,0,1
4,3000,539900,0,1
5,1985,299900,1,0
6,1534,314900,1,0
7,1427,198999,1,0
8,1380,212000,1,0
9,1494,242500,1,0


#### Model setup

In [9]:
# Split the table into the target (price) and the features (everything else).
y = df_dummies.loc[:, 'Price(Taka)']
X = df_dummies.drop(columns=['Price(Taka)'])

In [10]:
# Display the feature matrix: area plus the two remaining zone indicators.
X

Unnamed: 0,Area(sq-ft),Chittagong,Dhaka
0,2104,0,1
1,1600,0,1
2,2400,0,1
3,1416,0,1
4,3000,0,1
5,1985,1,0
6,1534,1,0
7,1427,1,0
8,1380,1,0
9,1494,1,0


In [11]:
# Display the target vector (the Price(Taka) column).
y

0     399900
1     329900
2     369000
3     232000
4     539900
5     299900
6     314900
7     198999
8     212000
9     242500
10    239999
11    347000
12    329999
13    699900
Name: Price(Taka), dtype: int64

##### Import sklearn Linear Regression Library

In [12]:
# Ordinary least-squares linear regression from scikit-learn.
from sklearn.linear_model import LinearRegression
# Create the estimator with its default settings; it is fitted in a later cell.
lr = LinearRegression()

In [13]:
# Fit the regression on the full feature matrix and price target.
lr.fit(X=X, y=y)

LinearRegression()

In [14]:
# Predict the price of a 2700 sq-ft home in Khulna
# (Chittagong = 0 and Dhaka = 0 means Khulna).
# The query is a DataFrame that reuses the training columns, so features are
# matched by name; a bare nested list makes scikit-learn warn that X has no
# valid feature names and relies on column order alone.
lr.predict(pd.DataFrame([[2700, 0, 0]], columns=X.columns))

array([423589.31409093])

In [15]:
# Coefficient of determination (R^2) of the fitted model.
# NOTE: this is computed on the same rows the model was trained on, so it
# measures goodness of fit, not performance on unseen data.
lr.score(X=X, y=y)

0.9232329089668183