## 3. Feature Engineering and Data Preprocessing

### 3.1 Remove unimportant columns

In [79]:
# Remove rentfaster_id, address, link and availability_date, as they are non-numeric or non-categorical
df.drop(columns=['rentfaster_id', 'address', 'link', 'availability_date'], axis=1, inplace=True)
df.sample(3)

Unnamed: 0,city,province,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,furnishing,smoking,cats,dogs
5686,Edmonton,Alberta,53.396781,-113.576332,Long Term,House,2300.0,3 Beds,2.5,1700.0,Unfurnished,Non-Smoking,True,True
16725,Montréal,Quebec,45.507216,-73.569636,Long Term,Apartment,2465.0,1 Bed,1.0,608.0,Unfurnished,Non-Smoking,True,True
15424,Toronto,Ontario,43.66918,-79.486228,Long Term,Apartment,2150.0,1 Bed,1.0,550.0,Unfurnished,Non-Smoking,True,True


### 3.2 Encode non-ordinal categorical columns

We will reduce the cardinality of the columns to 5 or less. If the relative frequency of a value is less than 5%, we will add it in a category named 'Other'.

#### Lease Term

In [83]:
df['lease_term'].value_counts(normalize=True)

lease_term
Long Term     0.935101
Negotiable    0.047414
Short Term    0.011657
12 months     0.003255
Unknown       0.002415
6 months      0.000105
months        0.000053
Name: proportion, dtype: float64

In [84]:
df['lease_term'] = np.where(df['lease_term'] == 'Long Term', 'Long Term', 'Other')
df['lease_term'] = df['lease_term'].astype('category')
df['lease_term'].value_counts(normalize=True)

lease_term
Long Term    0.935101
Other        0.064899
Name: proportion, dtype: float64

#### Type

In [86]:
df['type'].value_counts(normalize=True)

type
Apartment        0.699659
Condo Unit       0.068574
Townhouse        0.055185
House            0.054030
Basement         0.053400
Main Floor       0.024573
Room For Rent    0.021528
Duplex           0.016277
Office Space     0.003150
Storage          0.001103
Parking Spot     0.001103
Loft             0.000788
Acreage          0.000420
Mobile           0.000105
Vacation Home    0.000105
Name: proportion, dtype: float64

In [87]:
df['type'] = np.where(df['type'].isin(['Apartment', 'Condo Unit', 'Townhouse', 'House', 'Basement']), df['type'], 'Other')
df['type'] = df['type'].astype('category')
df['type'].value_counts(normalize=True)

type
Apartment     0.699659
Other         0.069152
Condo Unit    0.068574
Townhouse     0.055185
House         0.054030
Basement      0.053400
Name: proportion, dtype: float64

#### Furnishing

In [261]:
df['furnishing'].value_counts(normalize=True)

furnishing
Unfurnished                0.914466
Furnished                  0.072617
Negotiable                 0.012759
Unfurnished, Negotiable    0.000158
Name: proportion, dtype: float64

In [267]:
df['furnishing'] = np.where(~df['furnishing'].isin(['Unfurnished', 'Furnished']), 'Negotiable', df['furnishing'])
df['furnishing'] = df['furnishing'].astype('category')
df['furnishing'].value_counts(normalize=True)

furnishing
Unfurnished    0.914466
Furnished      0.072617
Negotiable     0.012917
Name: proportion, dtype: float64

#### Smoking

In [271]:
df['smoking'].value_counts(normalize=True)

smoking
Non-Smoking            0.852350
Unknown                0.122447
Smoke Free Building    0.014702
Smoking Allowed        0.007509
Negotiable             0.002993
Name: proportion, dtype: float64

In [173]:
# Encode features using OneHotEncoder
# encoder = OneHotEncoder(drop='first', sparse_output=False)
# encoded_cols = encoder.fit_transform(df[['lease_term', 'type', 'furnishing', 'smoking']])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19045 entries, 0 to 19044
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   city        19045 non-null  float64 
 1   province    19045 non-null  category
 2   latitude    19045 non-null  float64 
 3   longitude   19045 non-null  float64 
 4   lease_term  19045 non-null  category
 5   type        19045 non-null  category
 6   price       19045 non-null  float64 
 7   beds        19045 non-null  category
 8   baths       19045 non-null  category
 9   sq_feet     18865 non-null  float64 
 10  furnishing  19045 non-null  category
 11  smoking     19045 non-null  category
 12  cats        19045 non-null  bool    
 13  dogs        19045 non-null  bool    
dtypes: bool(2), category(7), float64(5)
memory usage: 913.0 KB


#### City and Province

I won't use the same method as the previous categories, because I think it's important to get the pricing for each city and province. We will use Frequency Encoding instead.

In [91]:
# Save frequency mapping of cities and provinces
city_frequency = df['city'].value_counts(normalize=True)
city_frequency

city
Calgary             0.247887
Edmonton            0.134944
Toronto             0.125755
Montréal            0.079601
Ottawa              0.056918
                      ...   
Christopher Lake    0.000053
Priddis             0.000053
Innisfail           0.000053
Langdon             0.000053
Crowsnest Pass      0.000053
Name: proportion, Length: 269, dtype: float64

In [92]:
province_frequency = df['province'].value_counts(normalize=True)
province_frequency

province
Alberta                      0.437175
Ontario                      0.312260
Quebec                       0.109898
British Columbia             0.058598
Saskatchewan                 0.035967
Manitoba                     0.030349
Nova Scotia                  0.013389
Northwest Territories        0.001050
New Brunswick                0.000683
Newfoundland and Labrador    0.000630
Name: proportion, dtype: float64

In [93]:
# Encode data using Frequency Encoding
df['city'] = df['city'].map(city_frequency)
df['province'] = df['province'].map(province_frequency)
df.sample(3)

Unnamed: 0,city,province,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,furnishing,smoking,cats,dogs
12303,0.056918,0.31226,45.409765,-75.67771,Long Term,Apartment,2395.0,1 Bed,1,700.0,Unfurnished,Non-Smoking,True,True
3240,0.247887,0.437175,51.05578,-114.178203,Other,House,7000.0,7 Beds,5,4200.0,Unfurnished,Non-Smoking,True,True
4237,0.247887,0.437175,51.05035,-114.036322,Long Term,Apartment,1825.0,1 Bed,1,597.0,Unfurnished,Non-Smoking,True,False


### 3.3 Encode ordinal categorical columns

#### Beds

In [96]:
df['beds'].value_counts(normalize=True)

beds
2 Beds       0.379995
1 Bed        0.372276
3 Beds       0.135206
Studio       0.073615
4 Beds       0.025466
5 Beds       0.007351
Unknown      0.003465
6 Beds       0.001838
7 Beds       0.000420
9 Beds       0.000158
none Beds    0.000105
8 Beds       0.000105
Name: proportion, dtype: float64

In [97]:
df['beds'] = np.where(
    df['beds'].isin(
        ['4 Beds', '5 Beds', '6 Beds', '7 Beds', '8 Beds', '9 Beds']),
    '4 Beds and More',
    df['beds'])

df['beds'] = np.where(
    df['beds'].isin(
        ['Studio', 'none Beds', 'Unknown']), 
    'Studio and Other', 
    df['beds'])

df['beds'].value_counts(normalize=True)

beds
2 Beds              0.379995
1 Bed               0.372276
3 Beds              0.135206
Studio and Other    0.077186
4 Beds and More     0.035337
Name: proportion, dtype: float64

In [98]:
beds = ['Studio and Other', '1 Bed', '2 Beds', '3 Beds', '4 Beds and More']
df['beds'] = pd.Categorical(df['beds'], categories=beds, ordered=True)
df['beds'].dtype

CategoricalDtype(categories=['Studio and Other', '1 Bed', '2 Beds', '3 Beds',
                  '4 Beds and More'],
, ordered=True, categories_dtype=object)

#### Baths

In [100]:
df['baths'].value_counts(normalize=True)

baths
1          0.675138
2          0.190076
2.5        0.063166
1.5        0.036388
3.5        0.013284
3          0.013074
Unknown    0.003518
4          0.003045
4.5        0.000735
0          0.000683
5          0.000368
5.5        0.000158
6          0.000105
7          0.000105
6.5        0.000053
7.5        0.000053
8          0.000053
Name: proportion, dtype: float64

In [101]:
df['baths'] = np.where(df['baths'].isin(['0', 'Unknown']), 'Other', df['baths'])

df['baths'] = np.where(
    df['baths'].isin(['3', '3.5', '4', '4.5', '5', '5.5', '6', '6.5', '7', '7.5', '8']),
    '3 and More',
    df['baths'])

df['baths'].value_counts(normalize=True)

baths
1             0.675138
2             0.190076
2.5           0.063166
1.5           0.036388
3 and More    0.031032
Other         0.004201
Name: proportion, dtype: float64

In [102]:
# I kept the value 1.5 to keep the order, even if the relative frequency is less than 5%, 
baths = ['Other', '1', '1.5', '2', '2.5', '3 and More']
df['baths'] = pd.Categorical(df['baths'], categories=baths, ordered=True)
df['baths'].dtype

CategoricalDtype(categories=['Other', '1', '1.5', '2', '2.5', '3 and More'], ordered=True, categories_dtype=object)

### 3.4 Encode boolean columns

In [171]:
df.sample()

Unnamed: 0,city,province,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,furnishing,smoking,cats,dogs
16074,0.004411,0.31226,43.012103,-79.250456,Long Term,Apartment,1798.0,1 Bed,1,480.0,Unfurnished,Non-Smoking,True,True


## 4. Regression

### 3.1 Find the model

### 3.2 Make predictions

## End