In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Cách 1: Using Dummies from Pandas

### Prepare Data

In [2]:
# Read the home-price table (columns: town, area, price) from the CSV file
# and print the whole frame as plain text.
df = pd.read_csv(filepath_or_buffer='homeprices.csv')
print(df)

               town  area   price
0   monroe township  2600  550000
1   monroe township  3000  565000
2   monroe township  3200  610000
3   monroe township  3600  680000
4   monroe township  4000  725000
5      west windsor  2600  585000
6      west windsor  2800  615000
7      west windsor  3300  650000
8      west windsor  3600  710000
9       robinsville  2600  575000
10      robinsville  2900  600000
11      robinsville  3100  620000
12      robinsville  3600  695000


In [3]:
# Build one 0/1 indicator column per distinct town, stored as 16-bit integers.
dummies = pd.get_dummies(df['town'], dtype='int16')
print(dummies)

    monroe township  robinsville  west windsor
0                 1            0             0
1                 1            0             0
2                 1            0             0
3                 1            0             0
4                 1            0             0
5                 0            0             1
6                 0            0             1
7                 0            0             1
8                 0            0             1
9                 0            1             0
10                0            1             0
11                0            1             0
12                0            1             0


In [4]:
# Put the indicator columns side by side with the original columns
# (indicators on the left, original town/area/price on the right).
df_dummies = pd.concat(objs=[dummies, df], axis=1)
print(df_dummies)

    monroe township  robinsville  west windsor             town  area   price
0                 1            0             0  monroe township  2600  550000
1                 1            0             0  monroe township  3000  565000
2                 1            0             0  monroe township  3200  610000
3                 1            0             0  monroe township  3600  680000
4                 1            0             0  monroe township  4000  725000
5                 0            0             1     west windsor  2600  585000
6                 0            0             1     west windsor  2800  615000
7                 0            0             1     west windsor  3300  650000
8                 0            0             1     west windsor  3600  710000
9                 0            1             0      robinsville  2600  575000
10                0            1             0      robinsville  2900  600000
11                0            1             0      robinsville 

In [5]:
# Remove the raw text column and one indicator ('monroe township').
# With an intercept in the model the three indicators always sum to 1, so one
# of them is redundant; a row with both remaining indicators at 0 now stands
# for 'monroe township'.
final_df = df_dummies.drop(columns=['town', 'monroe township'])
print(final_df)

    robinsville  west windsor  area   price
0             0             0  2600  550000
1             0             0  3000  565000
2             0             0  3200  610000
3             0             0  3600  680000
4             0             0  4000  725000
5             0             1  2600  585000
6             0             1  2800  615000
7             0             1  3300  650000
8             0             1  3600  710000
9             1             0  2600  575000
10            1             0  2900  600000
11            1             0  3100  620000
12            1             0  3600  695000


In [6]:
# Features are every column except 'price'; the target is the 'price' column.
X = final_df.drop(columns='price').values
Y = final_df['price'].values

# Hold out 40% of the rows; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified by town. The Y_train values printed
# further down contain no 'robinsville' prices and the fitted coefficient for
# that indicator is exactly 0, so the model never sees that town during
# training. Consider passing stratify=df['town'] -- confirm before changing,
# since every recorded output below would change.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.6,
    random_state=29,
)

### Select & Train Model

In [7]:
# Fit an ordinary least-squares model on the training rows (fit() returns the
# estimator itself), then show the learned coefficients and the intercept.
LR = LinearRegression().fit(X_train, Y_train)
a, b = LR.coef_, LR.intercept_
print(a, b, sep='\n')

[    0.         38654.45859873   126.43312102]
209307.32484075235


### Test

In [8]:
# In-sample check: predict the rows the model was fitted on and compare them
# with the true prices.
Yp_train = LR.predict(X_train)
mse_train = mean_squared_error(Y_train, Yp_train)

# Predictions are truncated to integers for display only; the metrics below
# use the unrounded values.
print('Yp_train:', Yp_train.astype(np.int64))
print('Y_train: ', Y_train)
print()
print('MSE:', mse_train)
print('RMSE:', np.sqrt(mse_train))
print('R2:', r2_score(Y_train, Yp_train))

Yp_train: [665191 588606 538033 703121 613893 664466 576687]
Y_train:  [650000 565000 550000 710000 610000 680000 585000]

MSE: 186300045.4959052
RMSE: 13649.177465909996
R2: 0.9402767273189444


In [9]:
# Out-of-sample check: predict the held-out rows and compare them with the
# true prices.
Yp_test = LR.predict(X_test)
mse_test = mean_squared_error(Y_test, Yp_test)

# Predictions are truncated to integers for display only; the metrics below
# use the unrounded values.
print('Yp_test:', Yp_test.astype(np.int64))
print('Y_test: ', Y_test)
print()
print('MSE:', mse_test)
print('RMSE:', np.sqrt(mse_test))
print('R2:', r2_score(Y_test, Yp_test))

Yp_test: [715039 664466 575963 601974 601249 538033]
Y_test:  [725000 695000 600000 615000 620000 575000]

MSE: 582834633.6734504
RMSE: 24141.96830570056
R2: 0.7952971042707881


# Cách 2: Using OneHotEncoder from scikit-learn

### Prepare Data

In [10]:
# Start the second approach from an independent copy of the loaded data.
# A plain `df2 = df` would only bind a second name to the same DataFrame, so
# any in-place change made to df2 would also alter df, which the first
# approach relies on.
df2 = df.copy()
df2

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [11]:
# Features: every column but the last (the town text and the area).
# Target: the last column (the price).
X, Y = df2.iloc[:, :-1].values, df2.iloc[:, -1].values

In [12]:
# One-hot encode column 0 (the town text) and pass the remaining column (area)
# through unchanged; the encoded columns come first in the result.
# drop='first' makes the encoder itself leave out the indicator of the first
# category, so the reference category is removed inside the fitted transformer
# instead of by a positional slice afterwards. ct.transform(...) on new rows
# therefore yields exactly the feature layout the model is trained on.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough',
)
X = ct.fit_transform(X)
X

array([[0.0, 0.0, 2600],
       [0.0, 0.0, 3000],
       [0.0, 0.0, 3200],
       [0.0, 0.0, 3600],
       [0.0, 0.0, 4000],
       [0.0, 1.0, 2600],
       [0.0, 1.0, 2800],
       [0.0, 1.0, 3300],
       [0.0, 1.0, 3600],
       [1.0, 0.0, 2600],
       [1.0, 0.0, 2900],
       [1.0, 0.0, 3100],
       [1.0, 0.0, 3600]], dtype=object)

In [13]:
# Hold out 40% of the rows; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified by town. The coefficients printed
# after fitting start with an exact 0, which suggests that one town indicator
# is all zeros in the training rows. Consider stratifying on the town column --
# confirm before changing, since every recorded output below would change.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.6,
    random_state=29,
)

### Select & Train Model

In [14]:
# Fit an ordinary least-squares model on the encoder-based training rows
# (fit() returns the estimator itself), then show the learned coefficients
# and the intercept.
LR2 = LinearRegression().fit(X_train, Y_train)
a, b = LR2.coef_, LR2.intercept_
print(a, b, sep='\n')

[    0.         38654.45859873   126.43312102]
209307.32484075235


### Test

In [15]:
# In-sample check for the second model: predict the rows it was fitted on and
# compare them with the true prices.
Yp_train = LR2.predict(X_train)
mse_train = mean_squared_error(Y_train, Yp_train)

# Predictions are truncated to integers for display only; the metrics below
# use the unrounded values.
print('Yp_train:', Yp_train.astype(np.int64))
print('Y_train :', Y_train)
print()
print('MSE:', mse_train)
print('RMSE:', np.sqrt(mse_train))
print('R2:', r2_score(Y_train, Yp_train))

Yp_train: [665191 588606 538033 703121 613893 664466 576687]
Y_train : [650000 565000 550000 710000 610000 680000 585000]

MSE: 186300045.4959052
RMSE: 13649.177465909996
R2: 0.9402767273189444


In [16]:
# Out-of-sample check for the second model: predict the held-out rows and
# compare them with the true prices.
Yp_test = LR2.predict(X_test)
mse_test = mean_squared_error(Y_test, Yp_test)

# Predictions are truncated to integers for display only; the metrics below
# use the unrounded values.
print('Yp_test:', Yp_test.astype(np.int64))
print('Y_test :', Y_test)
print()
print('MSE:', mse_test)
print('RMSE:', np.sqrt(mse_test))
print('R2:', r2_score(Y_test, Yp_test))

Yp_test: [715039 664466 575963 601974 601249 538033]
Y_test : [725000 695000 600000 615000 620000 575000]

MSE: 582834633.6734504
RMSE: 24141.96830570056
R2: 0.7952971042707881
