With one-hot encoding, you create a column for each category and assign it a binary value of 1 or 0. This prevents the model from treating the categories as ordinal categorical variables when they are actually nominal ones.


One-hot encoding makes sure that the model sees them as nominal categorical variables.

In [2]:
import pandas as pd

In [3]:
# Load the home-price data set into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='homeprices.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   town    13 non-null     object
 1   area    13 non-null     int64 
 2   price   13 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 444.0+ bytes


In [5]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,area,price
count,13.0,13.0
mean,3146.153846,629230.769231
std,453.900475,57621.109914
min,2600.0,550000.0
25%,2800.0,585000.0
50%,3100.0,615000.0
75%,3600.0,680000.0
max,4000.0,725000.0


# We will be using the get_dummies function
to get all the dummy columns that will be used for the one hot encoding

In [6]:
# Build one 0/1 indicator column per distinct town; dtype=int requests integer-valued columns.
dummy_towns = pd.get_dummies(data=df['town'], dtype=int)
dummy_towns

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


# Merging the original dataframe with the dummies dataframe using the concat function

In [7]:
# Place the dummy columns next to the original columns (column-wise concatenation).
merged = pd.concat(objs=[df, dummy_towns], axis='columns')
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


# Dropping the town column because it won't be used by our model and has been replaced by the dummy columns

In [8]:
# The text column is now redundant: the dummy columns carry the same information.
final = merged.drop(labels='town', axis='columns')
final

Unnamed: 0,area,price,monroe township,robinsville,west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0
5,2600,585000,0,0,1
6,2800,615000,0,0,1
7,3300,650000,0,0,1
8,3600,710000,0,0,1
9,2600,575000,0,1,0


# Dummy Variable Trap

When you can derive one variable from other variables, they are said to be multicollinear. Here,
if you know the values of monroe township and robinsville, then you can easily infer the value of west windsor: it is 1 exactly when
monroe township=0 and robinsville=0. Therefore these town variables are said to be multicollinear. In this
situation linear regression won't work as expected. Hence you need to drop one column.

**NOTE: the sklearn library takes care of the dummy variable trap, so even if you don't drop one of the
    town columns it is going to work. However, we should make a habit of taking care of the dummy variable
    trap ourselves, just in case the library you are using does not handle this for you.**

In [9]:
# Drop one dummy column ('west windsor') to avoid the dummy variable trap.
# The result is derived from `merged` rather than from `final` itself so that
# this cell is idempotent: re-running it does not fail with a KeyError because
# 'west windsor' was already removed on the first run.
final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
# Linear regression model to be trained on the get_dummies-encoded features.
model = LinearRegression()

In [12]:
# Split the final dataframe: the target (dependent variable) is the price column,
# the features (independent variables) are every remaining column.
y = final['price']
x = final.drop(labels='price', axis='columns')

In [13]:
# Train the regressor on the features x against the prices y.
model.fit(X=x, y=y)

In [14]:
# Predict prices for the same rows the model was trained on.
model.predict(X=x)

array([539709.7398409 , 590468.71640508, 615848.20468716, 666607.18125134,
       717366.15781551, 579723.71533005, 605103.20361213, 668551.92431735,
       706621.15674048, 565396.15136531, 603465.38378844, 628844.87207052,
       692293.59277574])

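# Using model.score() to measure how well the model fits

For a regressor, `model.score(x, y)` predicts y from the given inputs (here the training data x) and returns the coefficient of determination $R^2$ of those predictions against the actual values, i.e. the fraction of the variance in y that the model explains (1.0 is a perfect fit). It is not a percentage accuracy or a percentage error.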

In [15]:
# Score the model on the same data it was trained on.
model.score(X=x, y=y)

0.9573929037221872

<h1 >Using sklearn OneHotEncoder</h1>

In [16]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance; a later cell uses it to fit and transform the 'town' column.
le = LabelEncoder()

In [17]:
# Work on an independent copy: a plain assignment would only alias `df`, so the
# label encoding later written into df_new['town'] would also overwrite the
# town names in the original frame.
df_new = df.copy()

In [18]:
# Fit the label encoder `le` on the text town column and overwrite that column
# with the transformed values.
df_new['town'] = le.fit_transform(y=df_new['town'])
df_new

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [19]:
# Separate the label-encoded frame into features (x) and target (y).
x = df_new.drop(labels='price', axis='columns')
y = df_new['price']
x

Unnamed: 0,town,area
0,0,2600
1,0,3000
2,0,3200
3,0,3600
4,0,4000
5,2,2600
6,2,2800
7,2,3300
8,2,3600
9,1,2600


In [20]:
from sklearn.preprocessing import OneHotEncoder
# Encoder instance used below to one-hot encode the 'town' column.
ohe = OneHotEncoder()

In [21]:
# One-hot encode the town codes and convert the encoder's result to an array with toarray().
new = ohe.fit_transform(X=x[['town']]).toarray()
new

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [22]:
# Wrap the encoded array in a DataFrame, labelling its columns with the
# feature names generated by the encoder for the 'town' input.
town_feature_names = ohe.get_feature_names_out(input_features=['town'])
new_df = pd.DataFrame(data=new, columns=town_feature_names)
new_df

Unnamed: 0,town_0,town_1,town_2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0
6,0.0,0.0,1.0
7,0.0,0.0,1.0
8,0.0,0.0,1.0
9,0.0,1.0,0.0


In [23]:
# Assemble the feature matrix without mutating `x` or `new_df`, so that
# re-running this cell does not raise a KeyError for columns that an earlier
# run already dropped in place.
X = pd.concat(
    [
        x.drop(columns='town'),         # the towns column is replaced by the one-hot columns
        new_df.drop(columns='town_0'),  # drop one column to prevent the dummy variable trap
    ],
    axis=1,
)

In [24]:
# Train a second regressor, this time on the OneHotEncoder-derived features.
model1 = LinearRegression()
model1.fit(X=X, y=y)

In [25]:
# Predict prices for the rows model1 was trained on.
model1.predict(X=X)

array([539709.7398409 , 590468.71640507, 615848.20468716, 666607.18125134,
       717366.15781552, 579723.71533004, 605103.20361213, 668551.92431735,
       706621.15674048, 565396.1513653 , 603465.38378843, 628844.87207052,
       692293.59277575])

In [26]:
# Score model1 on the same data it was trained on.
model1.score(X=X, y=y)

0.9573929037221871

# model with sklearn transformation : 0.9573929037221871 R² score
# model with handwritten (pandas get_dummies) one-hot encoding : 0.9573929037221872 R² score