### Categorical Variables and One Hot Encoding

In [1]:
import pandas as pd
import sklearn.linear_model

In [2]:
# Load the home-price records (town, area, price) from the CSV next to the notebook.
df = pd.read_csv(
    "homeprices.csv",
)
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [5]:
# Relabel the three towns with short codes. The mapping is applied to the
# `town` column only and the result is reassigned (no inplace=True), so no
# other column can be touched and the cell can be re-run safely.
town_labels = {
    'monroe township': 'nyc',
    'west windsor': 'dc',
    'robinsville': 'nc',
}
df = df.assign(town=df['town'].replace(town_labels))
df

Unnamed: 0,town,area,price
0,nyc,2600,550000
1,nyc,3000,565000
2,nyc,3200,610000
3,nyc,3600,680000
4,nyc,4000,725000
5,dc,2600,585000
6,dc,2800,615000
7,dc,3300,650000
8,dc,3600,710000
9,nc,2600,575000


- Cheapest location for a given area: nyc (monroe township)
- Most expensive location for a given area: dc (west windsor)

### Using Pandas to Create Dummy Variables

In [7]:
# One-hot encode `town`: one indicator column per label (dc, nc, nyc).
# Each row has a single 1 marking its town, read as (dc, nc, nyc):
#   (0, 0, 1) -> nyc
#   (1, 0, 0) -> dc
#   (0, 1, 0) -> nc
#
# dtype=int pins the indicators to 0/1 integers. Newer pandas versions
# default to booleans, which display as True/False and are left out of the
# default numeric describe() summary.
dummies = pd.get_dummies(df.town, dtype=int)
dummies

Unnamed: 0,dc,nc,nyc
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,1,0


In [8]:
# Place the indicator columns to the right of the original columns;
# rows are aligned on the shared index.
frames_side_by_side = [df, dummies]
merged_df = pd.concat(frames_side_by_side, axis=1)
merged_df

Unnamed: 0,town,area,price,dc,nc,nyc
0,nyc,2600,550000,0,0,1
1,nyc,3000,565000,0,0,1
2,nyc,3200,610000,0,0,1
3,nyc,3600,680000,0,0,1
4,nyc,4000,725000,0,0,1
5,dc,2600,585000,1,0,0
6,dc,2800,615000,1,0,0
7,dc,3300,650000,1,0,0
8,dc,3600,710000,1,0,0
9,nc,2600,575000,0,1,0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the merged frame; the text column `town` is not summarised.
merged_df.describe()

Unnamed: 0,area,price,dc,nc,nyc
count,13.0,13.0,13.0,13.0,13.0
mean,3146.153846,629230.769231,0.307692,0.307692,0.384615
std,453.900475,57621.109914,0.480384,0.480384,0.50637
min,2600.0,550000.0,0.0,0.0,0.0
25%,2800.0,585000.0,0.0,0.0,0.0
50%,3100.0,615000.0,0.0,0.0,0.0
75%,3600.0,680000.0,1.0,1.0,1.0
max,4000.0,725000.0,1.0,1.0,1.0


In [13]:
# The indicator columns now carry the town information, so the original
# text column is no longer needed.
final_df = merged_df.drop(columns=['town'])
final_df

Unnamed: 0,area,price,dc,nc,nyc
0,2600,550000,0,0,1
1,3000,565000,0,0,1
2,3200,610000,0,0,1
3,3600,680000,0,0,1
4,4000,725000,0,0,1
5,2600,585000,1,0,0
6,2800,615000,1,0,0
7,3300,650000,1,0,0
8,3600,710000,1,0,0
9,2600,575000,0,1,0


In [16]:
# Drop one indicator column (`nc`) to avoid the dummy-variable trap: the three
# indicators always sum to 1, so one of them is redundant. `nc` becomes the
# baseline, encoded as dc = 0 and nyc = 0.
#
# The frame is rebuilt from merged_df instead of dropping from final_df itself,
# so re-running this cell does not raise KeyError on an already-removed column.
final_df = merged_df.drop(columns=['town', 'nc'])
final_df

Unnamed: 0,area,price,dc,nyc
0,2600,550000,0,1
1,3000,565000,0,1
2,3200,610000,0,1
3,3600,680000,0,1
4,4000,725000,0,1
5,2600,585000,1,0
6,2800,615000,1,0
7,3300,650000,1,0
8,3600,710000,1,0
9,2600,575000,0,0


In [17]:
# Feature matrix: every column except the target (`price`).
x = final_df.drop(columns='price')
x

Unnamed: 0,area,dc,nyc
0,2600,0,1
1,3000,0,1
2,3200,0,1
3,3600,0,1
4,4000,0,1
5,2600,1,0
6,2800,1,0
7,3300,1,0
8,3600,1,0
9,2600,0,0


In [18]:
# Target vector: the sale price of each home.
y = final_df['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [20]:
# Fit an ordinary least-squares regression of price on area plus the two town
# indicators (dc, nyc). `sklearn.linear_model` is imported in the import cell
# at the top of the notebook, alongside pandas.
model = sklearn.linear_model.LinearRegression().fit(x, y)

In [21]:
# In-sample predictions: the model is applied to the same rows it was fitted
# on, so these values can be compared directly against y.
model.predict(x)

array([539709.73984091, 590468.71640508, 615848.20468716, 666607.18125133,
       717366.1578155 , 579723.71533005, 605103.20361213, 668551.92431735,
       706621.15674048, 565396.15136531, 603465.38378844, 628844.87207052,
       692293.59277574])

In [22]:
# Goodness of fit, scored on the same x and y used for fitting. This is a
# training-set figure, not an estimate of performance on unseen homes.
model.score(x,y)

0.9573929037221873

In [24]:
# 2000 sq ft home in nc (the baseline town: dc = 0 and nyc = 0).
# The input is a one-row frame with the training column names, so values are
# matched to features by name and scikit-learn does not warn about missing
# feature names.
model.predict(pd.DataFrame([[2000, 0, 0]], columns=x.columns))

array([489257.68651906])

In [25]:
# 2000 sq ft home in nyc (dc = 0, nyc = 1).
# The input is a one-row frame with the training column names, so values are
# matched to features by name and scikit-learn does not warn about missing
# feature names.
model.predict(pd.DataFrame([[2000, 0, 1]], columns=x.columns))

array([463571.27499465])

In [26]:
# 2000 sq ft home in dc (dc = 1, nyc = 0).
# The input is a one-row frame with the training column names, so values are
# matched to features by name and scikit-learn does not warn about missing
# feature names.
model.predict(pd.DataFrame([[2000, 1, 0]], columns=x.columns))

array([503585.25048379])