# Canadian Rental Prices Regression Project

## Overview

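This notebook loads the Canadian rental listings dataset (`../data/canada_rent.csv`) and cleans it — converting `beds`, `baths` and `sq_feet` to numeric columns and handling missing values — in preparation for a regression model of rental `price`.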

## Imports

In [55]:
import numpy as np
import pandas as pd

In [56]:
# Load the rental listings CSV from the project's data directory
csv_path = '../data/canada_rent.csv'
df = pd.read_csv(csv_path)

In [57]:
# Preview the first five listings
df.head(n=5)

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,link,furnishing,availability_date,smoking,cats,dogs
0,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2495.0,2 Beds,2.5,1403,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True
1,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2695.0,3 Beds,2.5,1496,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True
2,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2295.0,2 Beds,2.5,1180,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True
3,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2095.0,2 Beds,2.5,1403,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,November 18,Non-Smoking,True,True
4,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2495.0,2 Beds,2.5,1403,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True


In [58]:
# Summarize column dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25771 entries, 0 to 25770
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rentfaster_id      25771 non-null  int64  
 1   city               25771 non-null  object 
 2   province           25771 non-null  object 
 3   address            25646 non-null  object 
 4   latitude           25771 non-null  float64
 5   longitude          25771 non-null  float64
 6   lease_term         25725 non-null  object 
 7   type               25771 non-null  object 
 8   price              25771 non-null  float64
 9   beds               25639 non-null  object 
 10  baths              25637 non-null  object 
 11  sq_feet            21659 non-null  object 
 12  link               25771 non-null  object 
 13  furnishing         25771 non-null  object 
 14  availability_date  25759 non-null  object 
 15  smoking            23069 non-null  object 
 16  cats               255

## Data Cleaning

### Convert `beds`, `baths` and `sq_feet` to numerical format

#### `beds`

In [59]:
# List the distinct raw values of `beds` before any cleaning
pd.unique(df['beds'])

array(['2 Beds', '3 Beds', 'Studio', '1 Bed', '5 Beds', '4 Beds', nan,
       '6 Beds', 'none Beds', '8 Beds', '7 Beds', '9 Beds'], dtype=object)

In [60]:
# Share of listings whose `beds` value is missing.
# Double quotes wrap the f-string so the single-quoted column key inside the
# replacement field also parses on Python versions before 3.12.
print(f"{df['beds'].isna().mean():.2%}")

0.51%


In [61]:
# Strip the " Bed"/" Beds" label that follows a digit, leaving only the count
bed_label = r'(?<=\d)\s+Bed[s]*'
df['beds'] = df['beds'].str.replace(bed_label, '', regex=True)

In [62]:
# Listings labelled 'Studio' or 'none Beds' get a bedroom count of 0
zero_bed = df['beds'].isin(['Studio', 'none Beds'])
df['beds'] = np.where(zero_bed, 0, df['beds'])

In [63]:
# Keep only the listings whose bedroom count is present
df = df.loc[df['beds'].notna()]

In [64]:
# Re-check dtypes and non-null counts after dropping rows with missing beds
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25639 entries, 0 to 25770
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rentfaster_id      25639 non-null  int64  
 1   city               25639 non-null  object 
 2   province           25639 non-null  object 
 3   address            25516 non-null  object 
 4   latitude           25639 non-null  float64
 5   longitude          25639 non-null  float64
 6   lease_term         25593 non-null  object 
 7   type               25639 non-null  object 
 8   price              25639 non-null  float64
 9   beds               25639 non-null  object 
 10  baths              25637 non-null  object 
 11  sq_feet            21658 non-null  object 
 12  link               25639 non-null  object 
 13  furnishing         25639 non-null  object 
 14  availability_date  25631 non-null  object 
 15  smoking            22937 non-null  object 
 16  cats               25596 no

In [65]:
# Cast the cleaned bedroom counts to 64-bit integers, then summarize them
beds_int = df['beds'].astype(np.int64)
df['beds'] = beds_int
beds_int.describe()

count    25639.000000
mean         1.743087
std          0.974852
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          9.000000
Name: beds, dtype: float64

#### `baths`

In [66]:
# List the distinct raw values of `baths` before any cleaning
pd.unique(df['baths'])

array(['2.5', '1', '2', '1.5', '3.5', '4', '3', '5', 'none', '4.5', '7.5',
       nan, '5.5', '6', '6.5', '7', '8', '0'], dtype=object)

In [67]:
# Share of listings whose `baths` value is missing.
# Double quotes wrap the f-string so the single-quoted column key inside the
# replacement field also parses on Python versions before 3.12.
print(f"{df['baths'].isna().mean():.2%}")

0.01%


In [68]:
# Treat the literal string 'none' as zero bathrooms
no_bath = df['baths'].eq('none')
df['baths'] = np.where(no_bath, 0, df['baths'])

In [69]:
# Keep only the listings whose bathroom count is present
df = df.loc[df['baths'].notna()]

In [70]:
# Cast bathroom counts to 64-bit floats (half baths such as '2.5' occur)
df['baths'] = df['baths'].astype(np.float64)

In [71]:
# Re-check dtypes and non-null counts after cleaning beds and baths
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25637 entries, 0 to 25770
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rentfaster_id      25637 non-null  int64  
 1   city               25637 non-null  object 
 2   province           25637 non-null  object 
 3   address            25514 non-null  object 
 4   latitude           25637 non-null  float64
 5   longitude          25637 non-null  float64
 6   lease_term         25591 non-null  object 
 7   type               25637 non-null  object 
 8   price              25637 non-null  float64
 9   beds               25637 non-null  int64  
 10  baths              25637 non-null  float64
 11  sq_feet            21656 non-null  object 
 12  link               25637 non-null  object 
 13  furnishing         25637 non-null  object 
 14  availability_date  25629 non-null  object 
 15  smoking            22935 non-null  object 
 16  cats               25596 no

#### `sq_feet`

In [72]:
# List the distinct raw values of `sq_feet` before any cleaning
pd.unique(df['sq_feet'])

array(['1403', '1496', '1180', ..., '260', '286', '334'],
      shape=(1896,), dtype=object)

In [73]:
# Share of listings whose `sq_feet` value is missing.
# Double quotes wrap the f-string so the single-quoted column key inside the
# replacement field also parses on Python versions before 3.12.
print(f"{df['sq_feet'].isna().mean():.2%}")

15.53%


In [74]:
# Pull the first number out of each free-text sq_feet value.
# Thousands separators are removed first so that e.g. '1,200' is read as
# 1200 instead of 1, and an optional decimal part is kept so that '850.5'
# is not truncated to 850. Values with no digits at all become NaN.
df['sq_feet'] = (
    df['sq_feet']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
)

In [75]:
# Cast the extracted digit strings to 64-bit floats
df['sq_feet'] = df['sq_feet'].astype(np.float64)

In [76]:
# Fill missing sq_feet with the median of listings sharing the same city and type
group_median = df.groupby(['city', 'type'])['sq_feet'].transform('median')
df['sq_feet'] = df['sq_feet'].fillna(group_median)

In [77]:
# Share of listings still missing `sq_feet` after the group-wise imputation.
# Double quotes wrap the f-string so the single-quoted column key inside the
# replacement field also parses on Python versions before 3.12.
print(f"{df['sq_feet'].isna().mean():.2%}")

0.62%


In [78]:
# Any sq_feet value still missing falls back to the dataset-wide median
overall_median = df['sq_feet'].median()
df['sq_feet'] = df['sq_feet'].fillna(overall_median)

In [79]:
# Confirm that beds, baths and sq_feet are now numeric with no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25637 entries, 0 to 25770
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rentfaster_id      25637 non-null  int64  
 1   city               25637 non-null  object 
 2   province           25637 non-null  object 
 3   address            25514 non-null  object 
 4   latitude           25637 non-null  float64
 5   longitude          25637 non-null  float64
 6   lease_term         25591 non-null  object 
 7   type               25637 non-null  object 
 8   price              25637 non-null  float64
 9   beds               25637 non-null  int64  
 10  baths              25637 non-null  float64
 11  sq_feet            25637 non-null  float64
 12  link               25637 non-null  object 
 13  furnishing         25637 non-null  object 
 14  availability_date  25629 non-null  object 
 15  smoking            22935 non-null  object 
 16  cats               25596 no

### Handle Missing Values

### Drop irrelevant columns