In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [6]:
df = pd.read_csv("homeprices.csv")
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


# #1 Using Pandas Library

In [4]:
dummy_variables = pd.get_dummies(df['town'])  # Returns a Dataframe

merged_dummy = pd.concat([df, dummy_variables], axis = 'columns')   # Parameter: Pass array of DataFrames to concatinate & Axis to how to concatinate
merged_dummy.to_csv("new_hometownprices.csv", index = False)

In [4]:
# To train model, We don't need town col (Words Col) & We need to Remove / drop any one of the Dummy variable
final_df = merged_dummy.drop(['town', 'west windsor'], axis = 'columns')
final_df

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,True,False
1,3000,565000,True,False
2,3200,610000,True,False
3,3600,680000,True,False
4,4000,725000,True,False
5,2600,585000,False,False
6,2800,615000,False,False
7,3300,650000,False,False
8,3600,710000,False,False
9,2600,575000,False,True


In [5]:
# Training
model = LinearRegression()

# X = final_df[['area', 'monroe township', 'robinsville']]  # OR
X = final_df.drop('price', axis = 'columns')
y = final_df['price']

model.fit(X, y)
print("Model is", "{:.3f}".format(model.score(X, y)*100), '% Accurate')

Model is 95.739 % Accurate


In [6]:
# Prediction
print("{:.3f}".format(model.predict([[3000, 1, 0]])[0]))   # For monroe township
print("{:.3f}".format(model.predict([[3000, 0, 1]])[0]))   # For robinsville
print("{:.3f}".format(model.predict([[3000, 0, 0]])[0]))   # For west windsor


590468.716
616155.128
630482.692




# #2 Using SKLearn - OneHotEncoder Library

In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [8]:
dfle = pd.read_csv("homeprices.csv")

# Using Label Encoder - We are Numbering / Encoding all the Words / Labels
le = LabelEncoder()

encoded_town = le.fit_transform(dfle['town'])
dfle['town'] = encoded_town
dfle


Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [9]:
# Training with X, y parameters

features = dfle[['town', 'area']].values   # .values Dataframe -> Numpy nD Array, As we want X as 2D Array
dependent = dfle['price'].values
features, dependent

(array([[   0, 2600],
        [   0, 3000],
        [   0, 3200],
        [   0, 3600],
        [   0, 4000],
        [   2, 2600],
        [   2, 2800],
        [   2, 3300],
        [   2, 3600],
        [   1, 2600],
        [   1, 2900],
        [   1, 3100],
        [   1, 3600]]),
 array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
        710000, 575000, 600000, 620000, 695000]))

In [10]:
# Using OneHotEncoder -> We create dummy Variables for each town

ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')
ct

In [11]:
# monroe township, robinsville, west windsor, area

features = ct.fit_transform(features)
features

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [12]:
# To make it more simple & efficient for the model, we remove one of the dummy variables
features = features[:, 1:]   # Droping monroe township town
features

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [13]:
# Training Part

my_model = LinearRegression()
my_model.fit(features, dependent)
print("My Model is", "{:.2f}".format(my_model.score(features, dependent)*100), "% Accurate")

My Model is 95.74 % Accurate


In [19]:
print(my_model.coef_, my_model.intercept_)

[25686.4115244  40013.97548914   126.89744141] 209776.39217373997


In [17]:
# Prediction

print("West Winsdor, Area: 3400, Price:", model.predict([[0,1,3400]])[0])
print("Robbin Sville, Area: 2800, Price:", "{:.3f}".format(model.predict([[1,0,2800]])[0]))
print("Monroe Township, Area: 3000, Price:", "{:.3f}".format(model.predict([[0,0,3000]])[0]))

West Winsdor, Area: 3400, Price: -48503941.087938026
Robbin Sville, Area: 2800, Price: -39867261.836
Monroe Township, Area: 3000, Price: -42732901.527


