In [29]:
import pandas as pd

# Load the home-price data; the rendered frame has town, area and price columns.
df = pd.read_csv("homeprices6.csv")
# Bare last expression so the notebook displays the frame.
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [30]:
# One-hot encode the categorical 'town' column with pandas.get_dummies():
# every distinct town gets its own indicator column.
dummies = pd.get_dummies(df['town'])
# Append the indicator columns to the right of the original columns.
tmp = pd.concat([df, dummies], axis=1)
tmp

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [32]:
# 'town' is now redundant (the indicator columns carry the same information).
# One indicator is dropped as well to avoid the "dummy variable trap";
# 'west windsor' therefore becomes the baseline (all remaining indicators 0).
tmp = tmp.drop(columns=['town', 'west windsor'])
tmp

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [33]:
# Features: every remaining column except the target 'price'.
X = tmp.drop(columns=['price'])
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [34]:
# Target: the sale price column of the original frame.
y = df['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [35]:
from sklearn.linear_model import LinearRegression

# Linear regression of price on the columns of X (area, monroe township, robinsville).
model = LinearRegression()
# fit() is the last expression, so the fitted estimator's repr is the cell output.
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [36]:
# predict the price of a house with 2800 sq ft in Robinsville
# feature order matches the columns of X: [area, monroe township, robinsville]
model.predict([[2800, 0, 1]])

array([590775.63964739])

In [37]:
# predict the price of a house with 3400 sq ft in West Windsor
# 'west windsor' is the dropped baseline category, so both town indicators are 0
model.predict([[3400, 0, 0]])

array([681241.66845839])

In [38]:
# how accurate is the model?
# score() reports R^2; it is computed here on the same X, y used for fitting,
# so it is a training-set figure rather than a held-out estimate
model.score(X, y)

0.9573929037221873

In [39]:
# Encode categories using sklearn.preprocessing LabelEncoder, OneHotEncoder and ColumnTransformer
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
# Work on a copy: `tmp = df` would only alias the frame, so assigning the
# encoded column would silently overwrite the original town names in `df`
# (and break a re-run of the get_dummies cells above).
tmp = df.copy()
# LabelEncoder maps each distinct town name to an integer code
# (here: monroe township -> 0, robinsville -> 1, west windsor -> 2).
tmp['town'] = label.fit_transform(tmp['town'])
tmp

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [40]:
# Pull the features out as a plain 2-D array rather than a DataFrame:
# column 0 holds the encoded town, column 1 the area.
X = tmp.loc[:, ['town', 'area']].values
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [41]:
# Target: the price column of the label-encoded frame.
y = tmp['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [45]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (the integer town code) and pass the remaining
# column(s) through unchanged. This ColumnTransformer form replaces the former
#   OneHotEncoder(categorical_features=[0]).fit_transform(X).toarray()
# approach.
ct = ColumnTransformer(
    transformers=[('any_name', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = ct.fit_transform(X)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [46]:
# drop the first one-hot column (town code 0, i.e. monroe township) so the
# remaining indicator columns avoid the dummy variable trap
X = X[:, 1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [49]:
# now train the model: this re-fits the existing `model` instance on the
# one-hot encoded array, replacing its earlier fit
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [50]:
# predict a 2800 sq ft house in Robinsville (column 0)
# feature order is now [robinsville, west windsor, area]
model.predict([[1, 0, 2800]])

array([590775.63964739])

In [51]:
# predict a 3400 sq ft house in West Windsor (column 1)
# feature order is [robinsville, west windsor, area]
model.predict([[0, 1, 3400]])

array([681241.6684584])

## Exercise
In the same directory as this notebook on GitHub there is an Exercise folder that contains carprices.csv. This file has car sell prices for 3 different models. First, plot the data points on a scatter plot to see whether a linear regression model can be applied. If so, build a model that can answer the following questions:

**1) Predict the price of a Mercedes Benz that is 4 years old with a mileage of 45,000**  
**2) Predict the price of a BMW X5 that is 7 years old with a mileage of 86,000**  
**3) Tell me the score (accuracy) of your model. (Hint: use LinearRegression().score())**

In [1]:
import pandas as pd

# Load the exercise data; the rendered frame has car model, mileage, sell price and age.
df2 = pd.read_csv("carprices6.csv")
# Bare last expression so the notebook displays the frame.
df2

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [2]:
# Split the frame into the target y (sell price) and the features X
# (every other column: car model, mileage, age).
y = df2.loc[:, 'Sell Price($)']
X = df2.drop(columns=['Sell Price($)'])
print("y:", y)
print("X:", X)

y: 0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64
X:                 Car Model  Mileage  Age(yrs)
0                  BMW X5    69000         6
1                  BMW X5    35000         3
2                  BMW X5    57000         5
3                  BMW X5    22500         2
4                  BMW X5    46000         4
5                 Audi A5    59000         5
6                 Audi A5    52000         5
7                 Audi A5    72000         6
8                 Audi A5    91000         8
9   Mercedez Benz C class    67000         6
10  Mercedez Benz C class    83000         7
11  Mercedez Benz C class    79000         7
12  Mercedez Benz C class    59000         5


In [3]:
# Encode categories using sklearn.preprocessing LabelEncoder, OneHotEncoder and ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 ('Car Model') and pass mileage and age through
# unchanged. OneHotEncoder is applied to the string column directly, so no
# LabelEncoder pass is needed first (the unused encoder instance was removed).
# NOTE(review): unlike the home-price example, no indicator column is dropped
# here, so all three car-model indicators are kept -- confirm this is intended.
ct = ColumnTransformer([('any_name', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
X

array([[0.00e+00, 1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

In [4]:
from sklearn.linear_model import LinearRegression

# Separate regression for the car data; X holds the three one-hot car-model
# columns followed by mileage and age.
model2 = LinearRegression()
# fit() is the last expression, so the fitted estimator's repr is the cell output.
model2.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [5]:
# predict price of a Mercedes Benz, 4 years old, 45,000 miles
# feature order: [Audi A5, BMW X5, Mercedez Benz C class, mileage, age]
model2.predict([[0, 0, 1, 45000, 4]])

array([36991.31721061])

In [6]:
# predict price of a BMW X5, 7 years old, 86,000 miles
# feature order: [Audi A5, BMW X5, Mercedez Benz C class, mileage, age]
model2.predict([[0, 1, 0, 86000, 7]])

array([11080.7431322])

In [7]:
# R^2 of model2, computed on the same X, y it was fitted on (exercise question 3)
model2.score(X, y)

0.9417050937281083