In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Load the home-price table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='homeprices.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [3]:
# Optional (currently disabled): min-max scale 'area' and 'price' in place before modelling.
#df[['area', 'price']] = MinMaxScaler().fit_transform(df[['area', 'price']])

# Method 1: Using pandas

### Data Preprocessing

In [4]:
# One indicator column per distinct town, stored as 16-bit integers.
dummies = pd.get_dummies(df['town'], dtype='int16')
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [5]:
# Place the indicator columns to the left of the original columns
# (column-wise concatenation aligned on the row index).
df_dummies = pd.concat([dummies, df], axis=1)
df_dummies

Unnamed: 0,monroe township,robinsville,west windsor,town,area,price
0,1,0,0,monroe township,2600,550000
1,1,0,0,monroe township,3000,565000
2,1,0,0,monroe township,3200,610000
3,1,0,0,monroe township,3600,680000
4,1,0,0,monroe township,4000,725000
5,0,0,1,west windsor,2600,585000
6,0,0,1,west windsor,2800,615000
7,0,0,1,west windsor,3300,650000
8,0,0,1,west windsor,3600,710000
9,0,1,0,robinsville,2600,575000


In [6]:
# Remove the raw text column and one indicator ('monroe township'), which
# then acts as the baseline category for the remaining indicators.
final_df = df_dummies.drop(columns=['town', 'monroe township'])
final_df

Unnamed: 0,robinsville,west windsor,area,price
0,0,0,2600,550000
1,0,0,3000,565000
2,0,0,3200,610000
3,0,0,3600,680000
4,0,0,4000,725000
5,0,1,2600,585000
6,0,1,2800,615000
7,0,1,3300,650000
8,0,1,3600,710000
9,1,0,2600,575000


In [7]:
# Features: every column except the target. Target: the 'price' column.
feature_frame = final_df.drop(columns='price')
X = feature_frame.to_numpy()
Y = final_df['price'].to_numpy()

In [8]:
# 70% of the rows go to training, the remainder to testing; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=29,
)

### Training

In [9]:
# Ordinary least-squares fit on the training split from Method 1.
LR = LinearRegression()
LR.fit(X=X_train, y=Y_train)

### Evaluating

In [10]:
# Score the Method 1 model on the rows it was trained on.
Yp_train = LR.predict(X_train)
train_mse = mean_squared_error(Y_train, Yp_train)  # computed once, reused for RMSE

# Predictions are truncated to integers only for side-by-side display.
print('Yp_train:', Yp_train.astype(np.int64))
print('Y_train: ', Y_train)
print()
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))
print('R2:', r2_score(Y_train, Yp_train))

Yp_train: [628136 566863 664672 588995 539976 701436 613504 662523 578890]
Y_train:  [620000 575000 650000 565000 550000 710000 610000 680000 585000]

MSE: 161368934.7241979
RMSE: 12703.10728617994
R2: 0.9388210451080738


In [11]:
# Score the Method 1 model on the held-out rows.
Yp_test = LR.predict(X_test)
test_mse = mean_squared_error(Y_test, Yp_test)  # computed once, reused for RMSE

# Predictions are truncated to integers only for side-by-side display.
print('Yp_test:', Yp_test.astype(np.int64))
print('Y_test: ', Y_test)
print()
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))
print('R2:', r2_score(Y_test, Yp_test))

Yp_test: [711541 689409 603627 603399]
Y_test:  [725000 695000 600000 615000]

MSE: 90026019.6407693
RMSE: 9488.204236881145
R2: 0.9674665993393041


# Method 2: Using sklearn

### Data Preprocessing

In [12]:
# Take an independent copy rather than a second name for the same object:
# with plain `df2 = df`, any later in-place change to df2 would also change
# df and, through it, the inputs of Method 1.
df2 = df.copy()
df2

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [13]:
# Positional split: every column but the last is a feature, the last
# column is the target.
X = df2.iloc[:, :-1].to_numpy()
Y = df2.iloc[:, -1].to_numpy()

In [14]:
# One-hot encode the column at position 0 (the town) and pass every other
# feature column through unchanged.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')

# Encode straight from df2 into a separate name instead of overwriting X with
# a transformation of itself. The original `X = ct.fit_transform(X)` followed
# by `X = X[:, 1:]` made this cell non-idempotent: running it a second time
# would one-hot encode an already-encoded matrix and drop a further column.
X_encoded = ct.fit_transform(df2.iloc[:, :-1].values)
print(X_encoded)
print()

# Drop the first indicator column so one category serves as the baseline,
# mirroring Method 1, which drops the 'monroe township' indicator.
X = X_encoded[:, 1:]
print(X)

[[1.0 0.0 0.0 2600]
 [1.0 0.0 0.0 3000]
 [1.0 0.0 0.0 3200]
 [1.0 0.0 0.0 3600]
 [1.0 0.0 0.0 4000]
 [0.0 0.0 1.0 2600]
 [0.0 0.0 1.0 2800]
 [0.0 0.0 1.0 3300]
 [0.0 0.0 1.0 3600]
 [0.0 1.0 0.0 2600]
 [0.0 1.0 0.0 2900]
 [0.0 1.0 0.0 3100]
 [0.0 1.0 0.0 3600]]

[[0.0 0.0 2600]
 [0.0 0.0 3000]
 [0.0 0.0 3200]
 [0.0 0.0 3600]
 [0.0 0.0 4000]
 [0.0 1.0 2600]
 [0.0 1.0 2800]
 [0.0 1.0 3300]
 [0.0 1.0 3600]
 [1.0 0.0 2600]
 [1.0 0.0 2900]
 [1.0 0.0 3100]
 [1.0 0.0 3600]]


In [15]:
# Same split settings as Method 1 (70% train, fixed seed), so both methods
# are fitted and scored on the same partition of rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=29,
)

### Training

In [16]:
# Ordinary least-squares fit on the training split from Method 2.
LR2 = LinearRegression()
LR2.fit(X=X_train, y=Y_train)

### Evaluating

In [17]:
# Score the Method 2 model on the rows it was trained on.
Yp_train = LR2.predict(X_train)
train_mse = mean_squared_error(Y_train, Yp_train)  # computed once, reused for RMSE

# Predictions are truncated to integers only for side-by-side display.
print('Yp_train:', Yp_train.astype(np.int64))
print('Y_train :', Y_train)
print()
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))
print('R2:', r2_score(Y_train, Yp_train))

Yp_train: [628136 566863 664672 588995 539976 701436 613504 662523 578890]
Y_train : [620000 575000 650000 565000 550000 710000 610000 680000 585000]

MSE: 161368934.7241979
RMSE: 12703.10728617994
R2: 0.9388210451080738


In [18]:
# Score the Method 2 model on the held-out rows.
Yp_test = LR2.predict(X_test)
test_mse = mean_squared_error(Y_test, Yp_test)  # computed once, reused for RMSE

# Predictions are truncated to integers only for side-by-side display.
print('Yp_test:', Yp_test.astype(np.int64))
print('Y_test :', Y_test)
print()
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))
print('R2:', r2_score(Y_test, Yp_test))

Yp_test: [711541 689409 603627 603399]
Y_test : [725000 695000 600000 615000]

MSE: 90026019.6407693
RMSE: 9488.204236881145
R2: 0.9674665993393041
