# Categorical variables and one-hot encoding

In [1]:
import pandas as pd

In [2]:
# Load the home data (Town, Area, Price) and display it.
df = pd.read_csv(filepath_or_buffer='hhome.csv')
df

Unnamed: 0,Town,Area,Price
0,monoroe towership,2600,550000
1,monoroe towership,3000,565000
2,monoroe towership,3200,610000
3,monoroe towership,3600,680000
4,monoroe towership,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,roobinsville,2600,575000


# using pandas to create dummy variables

In [3]:
# One indicator (0/1) column per distinct value of the Town column.
dummies = pd.get_dummies(df['Town'])
dummies

Unnamed: 0,monoroe towership,roobinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [4]:
# Place the indicator columns side by side with the original columns.
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,Town,Area,Price,monoroe towership,roobinsville,west windsor
0,monoroe towership,2600,550000,1,0,0
1,monoroe towership,3000,565000,1,0,0
2,monoroe towership,3200,610000,1,0,0
3,monoroe towership,3600,680000,1,0,0
4,monoroe towership,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,roobinsville,2600,575000,0,1,0


In [5]:
# The text Town column is now redundant with the indicator columns; remove it.
final = merged.drop(columns=['Town'])
final

Unnamed: 0,Area,Price,monoroe towership,roobinsville,west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0
5,2600,585000,0,0,1
6,2800,615000,0,0,1
7,3300,650000,0,0,1
8,3600,710000,0,0,1
9,2600,575000,0,1,0


# Dummy Variable Trap
When one variable can be derived from the other variables, the variables are said to be multicollinear. Here, if you know the values of the `monoroe towership` and `roobinsville` columns, you can easily infer the value of `west windsor`: when `monoroe towership`=0 and `roobinsville`=0, the home must be in west windsor. Therefore these town dummy variables are multicollinear. In this situation linear regression may not work as expected, so you need to drop one of the dummy columns.

NOTE: the sklearn library takes care of the dummy variable trap, so even if you don't drop one of the town columns it is going to work. However, we should make a habit of taking care of the dummy variable trap ourselves, just in case the library we are using does not handle it for us.

In [6]:
# Avoid the dummy variable trap: leave out one indicator column
# ('west windsor'), which then acts as the baseline town (both remaining
# indicator columns equal to 0).
# The frame is derived from `merged` instead of from `final` itself so that
# re-running this cell does not raise a KeyError for an already-dropped column.
final = merged.drop(columns=['Town', 'west windsor'])
final

Unnamed: 0,Area,Price,monoroe towership,roobinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [7]:
# Feature matrix: every column except the target, Price.
x = final.drop(columns='Price')
x

Unnamed: 0,Area,monoroe towership,roobinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [8]:
# Target vector: the Price column.
y = final['Price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: Price, dtype: int64

In [9]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator with default settings.
model = LinearRegression()

In [10]:
# Fit the regression on the features x and the target y.
model.fit(X=x, y=y)

In [11]:
# Predictions for the same rows the model was fitted on.
model.predict(X=x)

array([559212.75445866, 598499.36948298, 618142.67699514, 657429.29201946,
       696715.90704377, 593347.14465862, 612990.45217078, 662098.72095118,
       691563.68221942, 553748.42370744, 681429.92253648, 602856.69248784,
       651964.96126824])

In [12]:
# Score of the fitted model, evaluated on the same x and y it was fitted on.
model.score(X=x, y=y)

0.6894966035485326

In [13]:
# 3400 sq ft home in west windsor. 'west windsor' is the dropped baseline
# column, so both remaining town indicators are 0.
# The query is a DataFrame with named columns, aligned to the training columns
# via `columns=x.columns`: the feature order is explicit instead of positional,
# and scikit-learn does not warn about missing feature names.
model.predict(
    pd.DataFrame(
        {'Area': [3400], 'monoroe towership': [0], 'roobinsville': [0]},
        columns=x.columns,
    )
)



array([671920.37470726])

In [14]:
# 2800 sq ft home in roobinsville (its indicator column is 1).
# The query is a DataFrame with named columns, aligned to the training columns
# via `columns=x.columns`: the feature order is explicit instead of positional,
# and scikit-learn does not warn about missing feature names.
model.predict(
    pd.DataFrame(
        {'Area': [2800], 'monoroe towership': [0], 'roobinsville': [1]},
        columns=x.columns,
    )
)



array([573391.7312196])

## exercise

In [15]:
import pandas as pd

In [16]:
# Load the car-price data for the exercise (this rebinds `df`).
df = pd.read_csv(filepath_or_buffer='carprices.csv')

In [17]:
# Display the car-price data loaded from carprices.csv.
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [18]:
# One indicator (0/1) column per distinct value of the 'Car Model' column.
dummies = pd.get_dummies(data=df['Car Model'])
dummies

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


In [19]:
# Place the indicator columns side by side with the original columns.
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [20]:
# The text 'Car Model' column is redundant with the indicator columns; remove it.
final = merged.drop(columns=['Car Model'])

In [21]:
# Display the frame with the 'Car Model' text column removed.
final

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,69000,18000,6,0,1,0
1,35000,34000,3,0,1,0
2,57000,26100,5,0,1,0
3,22500,40000,2,0,1,0
4,46000,31500,4,0,1,0
5,59000,29400,5,1,0,0
6,52000,32000,5,1,0,0
7,72000,19300,6,1,0,0
8,91000,12000,8,1,0,0
9,67000,22000,6,0,0,1


In [22]:
# Avoid the dummy variable trap: leave out one indicator column
# ('Mercedez Benz C class'), which then acts as the baseline car model (both
# remaining indicator columns equal to 0).
# The frame is derived from `merged` instead of from `final` itself so that
# re-running this cell does not raise a KeyError for an already-dropped column.
final = merged.drop(columns=['Car Model', 'Mercedez Benz C class'])

In [23]:
# Display the frame with the 'Mercedez Benz C class' indicator column removed.
final

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [24]:
# Feature matrix: every column except the target, 'Sell Price($)'.
x = final.drop(columns=['Sell Price($)'])
x

Unnamed: 0,Mileage,Age(yrs),Audi A5,BMW X5
0,69000,6,0,1
1,35000,3,0,1
2,57000,5,0,1
3,22500,2,0,1
4,46000,4,0,1
5,59000,5,1,0
6,52000,5,1,0
7,72000,6,1,0
8,91000,8,1,0
9,67000,6,0,0


In [25]:
# Target vector: the 'Sell Price($)' column.
y = final.loc[:, 'Sell Price($)']
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [26]:
from sklearn.linear_model import LinearRegression
# Fresh, unfitted linear regression estimator for the exercise (rebinds `model`).
model = LinearRegression()

In [27]:
# Fit the regression on the features x and the target y.
model.fit(X=x, y=y)

In [28]:
# Predictions for the same rows the model was fitted on.
model.predict(X=x)

array([18705.2723644 , 35286.78445645, 24479.19112468, 41245.76426391,
       29882.98779056, 28023.6135243 , 30614.46818502, 21879.57266964,
       12182.34562104, 26183.72387884, 18929.31674102, 20409.80511857,
       30477.15426156])

In [29]:
# Score of the fitted model, evaluated on the same x and y it was fitted on.
model.score(X=x, y=y)

0.9417050937281083

### Price of a Mercedez Benz C class that is 4 years old with mileage 45000

In [35]:
# Mercedez Benz C class, 4 years old, mileage 45000. 'Mercedez Benz C class' is
# the dropped baseline column, so both remaining model indicators are 0.
# The query is a DataFrame with named columns, aligned to the training columns
# via `columns=x.columns`: the feature order is explicit instead of positional,
# and scikit-learn does not warn about missing feature names.
model.predict(
    pd.DataFrame(
        {'Mileage': [45000], 'Age(yrs)': [4], 'Audi A5': [0], 'BMW X5': [0]},
        columns=x.columns,
    )
)



array([36991.31721061])

### Price of a BMW X5 that is 7 years old with mileage 86000

In [36]:
# BMW X5, 7 years old, mileage 86000 (its indicator column is 1).
# The query is a DataFrame with named columns, aligned to the training columns
# via `columns=x.columns`: the feature order is explicit instead of positional,
# and scikit-learn does not warn about missing feature names.
model.predict(
    pd.DataFrame(
        {'Mileage': [86000], 'Age(yrs)': [7], 'Audi A5': [0], 'BMW X5': [1]},
        columns=x.columns,
    )
)



array([11080.74313219])