In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objects as go
import re

# explore data, estimate statistical models, and perform statistical tests
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [98]:
# Load the listings CSV into `df` and preview the first rows.
df = pd.read_csv("datafiles/cleaned_air_bnb_Jul_26.csv")
df.head()

Unnamed: 0,url,name,numberOfGuests,address,roomType,bathroomLabel,maxNights,minNights,localizedCheckInTimeWindow,localizedCheckOutTime,...,allowsSmoking,allowsInfants,personCapacity,Accuracy,Check-in,Cleanliness,Communication,Location,Review Count,Value
0,https://www.airbnb.com/rooms/31573201,Avocado B&B and Cottages,4,"Ngong, Kenya",Private room in bed and breakfast,1 shared bath,1125,2,After 3:00 PM,,...,True,True,4,4.5,5.0,5.0,5.0,4.5,2.0,5.0
1,https://www.airbnb.com/rooms/38121290,Traditional Africa Mud House in Nairobi,2,"Nairobi, Nairobi County, Kenya",Private room in earthen home,1 shared bath,1125,1,9:00 AM - 11:00 PM,,...,True,True,2,,,,,,0.0,
2,https://www.airbnb.com/rooms/35701139,A private house,2,"Ngong, Wilaya ya Kajiado, Kenya",Shared room in rental unit,1 shared bath,1125,1,After 3:00 PM,,...,False,True,2,,,,,,0.0,
3,https://www.airbnb.com/rooms/48292462,BAS NGONG HOMESTAY,6,"Ngong, Kajiado County, Kenya",Private room in home,3 baths,1125,2,Flexible,,...,True,True,6,,,,,,0.0,
4,https://www.airbnb.com/rooms/42577961,"Art and life lovers community,We love life les...",3,"Ngong, Kajiado County, Kenya",Private room in home,3.5 shared baths,1125,3,,,...,True,False,3,,,,,,0.0,


In [100]:
# Remove these columns from df in place.
unused_columns = ['hostId', 'memberSince', 'url', 'name', 'address', 'numberOfLanguages']
df.drop(columns=unused_columns, inplace=True)

# Drop columns with more than 25% missing values
def drop_columns(df, threshold):
    """Drop, in place, every column whose null count exceeds ``threshold``.

    Args:
        df: DataFrame to prune; it is modified in place.
        threshold: Largest number of null values a column may contain and
            still be kept (an absolute count, not a fraction).

    Returns:
        The same ``df`` object, with the sparse columns removed.
    """
    null_counts = df.isnull().sum()
    sparse_columns = null_counts[null_counts > threshold].index
    df.drop(columns=sparse_columns, inplace=True)
    return df

# Prune sparse columns; the threshold passed is 25% of the row count.
df= drop_columns(df, 0.25*len(df))


# Fold the individual amenity columns into one 'amenities' column by OR-ing
# them together, starting from 0.
amenities_list = ['Washer', 'Shampoo', 'Hair dryer', 'Air conditioning', 'Private entrance']
df['amenities'] = 0
for amenity in amenities_list:
    # assumes each amenity column holds boolean/0-1 flags without NaN — TODO confirm
    df['amenities'] = df['amenities'] | df[amenity]

# Drop the per-amenity columns now that they are folded into 'amenities'.
df.drop(amenities_list, axis=1,inplace = True)

# Every per-bed-type column in the frame.
beds = [
    'double_bed', 'floor_mattress', 'single_bed', 'queen_bed', 'couch',
    'king_bed', 'air_mattress', 'sofa_bed', 'small_double_bed',
    'bunk_bed', 'toddler_bed', 'crib', 'hammock', 'water_bed',
]

# The three coarse groups the bed-type columns are folded into.
regular_bed = ['double_bed', 'floor_mattress', 'single_bed', 'queen_bed', 'king_bed']
relaxing_bed = ['couch', 'air_mattress', 'sofa_bed', 'small_double_bed', 'bunk_bed', 'hammock', 'water_bed']
kids_bed = ['toddler_bed', 'crib']

bed_groups = {
    'regular_bed': regular_bed,
    'relaxing_bed': relaxing_bed,
    'kids_bed': kids_bed,
}

# Start every group column at 0 ...
for group_column in bed_groups:
    df[group_column] = 0

# ... then OR each member column (cast to int) into its group column.
# NOTE(review): `|` is a bitwise OR; this only yields a 0/1 flag if the
# bed-type columns are themselves 0/1 — confirm they are not counts.
for group_column, member_columns in bed_groups.items():
    for bed_type in member_columns:
        df[group_column] = df[group_column] | df[bed_type].astype(int)

# The per-bed-type columns are no longer needed.
df.drop(columns=beds, inplace=True)

# Bedrooms column.
# "Studio" listings carry no numeric count, so flag them before parsing.
studio_mask = df['numberOfBedrooms'].eq("Studio")

# Non-studio rows; the name `dff` is kept because it is a notebook-level variable.
dff = df[~studio_mask]

# Missing counts are filled with 1, then the column is parsed to numbers
# explicitly. Taking the median of an object column that still contains
# strings relies on implicit conversion and fails on recent pandas versions.
# With errors='coerce', "Studio" (and any other non-numeric label) becomes NaN.
bedrooms = pd.to_numeric(df['numberOfBedrooms'].fillna(1), errors='coerce')

# Median of the non-studio listings (the filled 1s are included).
bedroom_median = bedrooms[~studio_mask].median()

# Studios receive the median; the column ends up as float.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[col]: under pandas Copy-on-Write that call updates a temporary and
# leaves df unchanged.
df['numberOfBedrooms'] = bedrooms.mask(studio_mask, bedroom_median).astype(float)

# Fill missing bed counts with the column median.
df['numberOfBedsAvailable'] = df['numberOfBedsAvailable'].fillna(
    df['numberOfBedsAvailable'].median()
)

# Replace the Chinese-script spellings with their Latin equivalents, then fill
# missing values with the most frequent value of each column.
# Results are assigned back to df: `df[col].fillna(..., inplace=True)` acts on
# a temporary under pandas Copy-on-Write and would leave the NaNs in place.
location_spellings = {
    "city": ("内罗毕", "Nairobi"),
    "state": ("内罗毕特区", "Nairobi County"),
    "localizedCity": ("内罗毕", "Nairobi"),
}
for location_column, (foreign_spelling, latin_spelling) in location_spellings.items():
    df[location_column] = df[location_column].replace(foreign_spelling, latin_spelling)
    df[location_column] = df[location_column].fillna(df[location_column].mode()[0])

# Missing review counts take the column median.
df['Review Count'] = df['Review Count'].fillna(df['Review Count'].median())
df.drop(columns=["localizedCheckInTimeWindow"], inplace=True)

# Price analysis: unparseable prices become NaN, then NaN takes the median.
# NOTE(review): 'price' is the target column in the modelling cells; confirm
# that median-imputing it (rather than dropping those rows) is intended.
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['price'] = df['price'].fillna(df['price'].median())

# First number in the label, decimals included ("3.5 shared baths" -> 3.5).
# The previous pattern '(\d+)' kept only the integer part and, being a
# non-raw literal, contained an invalid escape sequence.
bathroom_counts = pd.to_numeric(
    df['bathroomLabel'].str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)

# Labels without a number (or missing labels) take the most frequent count.
value = bathroom_counts.mode()[0]
df['numberofbathroom'] = bathroom_counts.fillna(value)

# Fill missing labels with the most frequent label before classifying.
# Assigned back rather than filled "in place" on df[col], which is a no-op
# under pandas Copy-on-Write.
df['bathroomLabel'] = df['bathroomLabel'].fillna(df['bathroomLabel'].mode()[0])
bathroom_text = df['bathroomLabel'].str.lower()

# Classify as 'private', 'shared' or 'unknown'; 'private' wins when a label
# mentions both, matching the order of the original assignments.
df['bathroomType'] = np.select(
    [bathroom_text.str.contains('private'), bathroom_text.str.contains('shared')],
    ['private', 'shared'],
    default='unknown',
)

df.drop(columns=['bathroomLabel', 'localizedCity'], inplace=True)

# Convert every boolean (True/False) column to integer 1/0.
for column in df.select_dtypes(include='bool').columns:
    df[column] = df[column].astype(int)

# Curated mapping from the raw room-type labels to four coarse categories.
mapping = {
    'Private room in bed and breakfast': 'Private room',
    'Private room in earthen home': 'Private room',
    'Shared room in rental unit': 'Shared room',
    'Private room in home': 'Private room',
    'Entire condo': 'Entire unit',
    'Private room in rental unit': 'Private room',
    'Entire rental unit': 'Entire unit',
    'Private room in guest suite': 'Private room',
    'Entire guesthouse': 'Entire unit',
    'Entire loft': 'Entire unit',
    'Shared room in home': 'Shared room',
    'Entire home': 'Entire unit',
    'Entire guest suite': 'Entire unit',
    'Entire vacation home': 'Entire unit',
    'Private room in townhouse': 'Private room',
    'Barn': 'Other',
    'Entire serviced apartment': 'Entire unit',
    'Entire bungalow': 'Entire unit',
    'Tiny home': 'Other',
    'Private room in serviced apartment': 'Private room',
    'Entire villa': 'Entire unit',
    'Private room in condo': 'Private room',
    'Room in hotel': 'Other',
    'Private room in bungalow': 'Private room',
    'Private room in casa particular': 'Private room',
    'Room in boutique hotel': 'Other',
    'Private room in guesthouse': 'Private room',
    'Private room in cottage': 'Private room',
    'Room in bed and breakfast': 'Other',
    'Private room in farm stay': 'Private room',
    'Entire cottage': 'Entire unit',
    'Private room in loft': 'Private room',
    'Private room in tiny home': 'Private room',
    'Private room in nature lodge': 'Private room',
    'Tent': 'Other',
    'Farm stay': 'Other',
    'Shared room in farm stay': 'Shared room',
    'Island': 'Other',
    'Private room in tent': 'Private room',
    'Entire cabin': 'Entire unit',
    'Room in nature lodge': 'Other',
    'Campsite': 'Other',
    'Entire townhouse': 'Entire unit',
    'Hut': 'Other',
    'Private room in resort': 'Private room',
    'Entire chalet': 'Entire unit',
    'Shipping container': 'Other',
    'Treehouse': 'Other',
    'Private room in camper/rv': 'Private room',
    'Room in resort': 'Other',
    'Entire place': 'Entire unit',
    'Shared room in hotel': 'Shared room',
    'Private room in villa': 'Private room',
    'Private room': 'Private room',
    'Room in serviced apartment': 'Other',
    'Earthen home': 'Other',
    'Shared room in townhouse': 'Shared room',
    'Private room in chalet': 'Private room',
    'Private room in vacation home': 'Private room',
    'Shared room': 'Shared room',
    'Entire bed and breakfast': 'Entire unit',
    'Shared room in hostel': 'Shared room',
    'Private room in treehouse': 'Private room',
    'Private room in hut': 'Private room',
    'Shared room in guesthouse': 'Shared room',
    'Shared room in vacation home': 'Shared room',
    'Private room in holiday park': 'Private room',
    'Tipi': 'Other',
    'Shared room in bed and breakfast': 'Shared room',
    'Shared room in boutique hotel': 'Shared room',
    'Room in aparthotel': 'Other',
    'Casa particular': 'Other',
    'Cave': 'Other',
    'Tower': 'Other',
    'Train': 'Other',
    'Private room in dome': 'Private room',
    'Dome': 'Other',
    'Bus': 'Other',
    'Shared room in tiny home': 'Shared room',
    'Private room in cabin': 'Private room',
    'Private room in island': 'Private room',
    'Shared room in hut': 'Shared room',
    'Shared room in loft': 'Shared room',
    'Shared room in bungalow': 'Shared room',
    'Shared room in condo': 'Shared room',
    'Shared room in serviced apartment': 'Shared room',
    'Castle': 'Other',
    'Boat': 'Other',
    'Lighthouse': 'Other',
    'Entire home/apt': 'Entire unit',
    'Private room in hostel': 'Private room',
    'Shared room in ryokan': 'Shared room'
}


def _fallback_room_type(label):
    """Classify a room-type label that is absent from ``mapping`` by its prefix.

    The prefix rules reproduce the category of every key listed in ``mapping``.
    """
    if label.startswith('Private room'):
        return 'Private room'
    if label.startswith('Shared room'):
        return 'Shared room'
    if label.startswith('Entire'):
        return 'Entire unit'
    return 'Other'


# Apply the curated mapping to the 'roomType' column. A plain .map() turns any
# label missing from the dict into NaN without warning, so labels that are
# present but unlisted are classified by prefix instead. Genuinely missing
# values stay NaN.
raw_room_type = df['roomType']
room_type = raw_room_type.map(mapping)
needs_fallback = room_type.isna() & raw_room_type.notna()
df['roomType'] = room_type.where(
    ~needs_fallback,
    raw_room_type.astype(str).map(_fallback_room_type),
)

df.drop(columns=['personCapacity'], inplace=True)

df.rename(columns={'Review Count': 'review_count'}, inplace=True)

# Reference spellings for the 'state' column. The normalisation step further
# down compares each raw state value against these entries (case-insensitive
# substring match). The list contains spelling, case and whitespace variants
# of the same place as separate entries.
state_list = ['Nairobi County', 'Wilaya ya Kajiado', 'Kajiado County',
    'Kiambu County', 'Nairobi', 'Machakos County', 'Eastern',
    'Central', 'Rift Valley', 'Kenya, Nairobi', 'Narok County',
    'Bomet County', 'Kisii County', 'Nakuru County', 'Homa Bay County',
    'Wilaya ya Narok', 'Nyamira County', 'Kericho County',
    'Wilaya ya Kisii Kati', 'Wilaya ya Nakuru', 'Narok', 'Nakuru',
    'Nyanza', 'Laikipia County', 'Meru County', 'Nyeri County',
    'Wilaya ya Isiolo', 'Samburu County', 'Wilaya ya Laikipia',
    'Laikipia', 'Kirinyaga', 'Nyeri', 'Meru', 'Uasin Gishu County',
    'Elgeyo-Marakwet County', 'Wilaya ya Uasin Gishu', 'Naivasha',
    'Nyahururu', 'Kenya', 'Nakuru ', 'Nyandarua County',
    'Taita-Taveta County', 'Kilimanjaro Region', 'Coast',
    'Mombasa County', 'Kwale', 'Kwale County', 'Wilaya ya Kwale',
    'Ukunda, Kwale County', 'Kilifi County', 'Kwale District',
    'Mombasa', 'kenya', 'Wilaya ya Mombasa', 'coast', 'Kwale ',
    'Diani', 'Kisumu County', 'Kakamega County', 'Annex', 'Kisumu',
    'Kakamega', 'Nandi County', ' Rift Valley', 'Wilaya ya Kisumu',
    'Vihiga County', 'Elgeyo Marakwet', 'Trans-Nzoia County',
    'Uasin-Gishu', 'Tharaka-Nithi County', 'Embu County',
    'Makueni County', 'Kitui County', 'Wilaya ya Makueni',
    'Wilaya ya Kakamega', 'Wilaya ya Vihiga', 'Kilifi',
    'Wilaya ya Kilifi', 'Root Node', 'Galu Beach', 'South coast',
    'Nyali', 'Mtwapa', 'Nyali estate', 'Westlands, Nairobi',
    'Kaijado County', 'Kajiado', 'Langata', 'Kiambu', 'Westlands',
    'Nairobi Area', 'Kikuyu', 'Nyamira', 'Wilaya ya Kiambu',
    'Kajiado North County', 'Kileleshwa', 'West Pokot County',
    'Turkana County', 'Isiolo County', 'Kirinyaga County',
    'Kaunti ya Meru', 'Eastern Region', 'Machakos', 'Muranga County',
    'Kenia', 'Kisaju', 'Nyali estate,', 'Homa Bay', 'Migori County',
    'Bungoma County', 'Busia County', 'Siaya', 'Bungoma',
    'western kenya', 'Mara Region', 'Kendu bay', 'Texas',
    'Nairobi City', 'Nairobi, Kenya', 'Wanyee Cl, Nairobi, Kenya',
    'Ruaka', 'South B', 'Nairobi-Upper hill Area',
    'Wilaya ya Machakos', 'Embakasi East', 'Nairobi ', 'NAIROBI',
    'Coast Province', 'Malindi', 'Provincia costiera', 'kilifi',
    'Distretto di Kilifi', 'KF', 'Watamu', 'Kilifi Province',
    'Kilifi, Watamu']
# Reference spellings for the 'city' column, used the same way as state_list.
city_list = ['Ngong', 'Nairobi', 'Ruaka', 'Pridelands', 'Nairobi City, Kenya',
    'Ongata Rongai', 'Kiambu', 'Athi River', 'Ruiru', 'Nairobi City',
    'Muthaiga North,', 'Kiambu District', 'Kitengela', 'Kiserian',
    'South', 'New Njiru Town', 'Nairobi - Lavington', 'North',
    'Limuru road', 'Mlolongo', 'Westlands', 'Kajiado',
    'Seganani Masai Mara national reserve', 'Bomet', 'Narok',
    'Highway', 'Entasekera', 'Nakuru', 'Kadongo', 'Mau Narok',
    'Lake Elmenteita', 'Ikonge', 'Litein', 'Naivasha ', 'Keroka',
    'Ololaimutiek Village', 'Naivasha', 'Kongoni', 'Gilgil', 'Kisii',
    'Talek', 'Nyanchwa Hill', 'Narok County', 'Aitong', 'Lolgorien',
    'Maasai Mara', 'Oyugis', 'Sekenani', 'Masai Mara', 'Ewaso Ngiro',
    'SEKENANI', 'Silibwet', 'Nanyuki', 'Nchiru', 'Isiolo', 'Dol Dol',
    'Wamba', 'Meru', 'Rukanga, Sagana', 'East', 'Meru District',
    'Timau', 'Maua', 'Eldoret', 'Iten', 'Elmenteita', 'Lake Naivasha',
    'Njoro', 'Eburru', 'Kasuku', 'Nakuru town', 'Voi', 'Mwatate',
    'Taveta', 'Maungu', 'Wundanyi', 'Same', 'Mombasa', 'Diani Beach',
    'Mtwapa', 'Tiwi', 'DIANI BEACH', 'Kwale', 'Ukunda', 'Nyali Beach',
    'Tiwi Beach', 'Galu Beach', 'Nyali Mombasa', 'Nyali',
    'Diani Beach ', 'Diani', 'Diani Beach Road', 'Kisumu',
    'Isukha ICHINA', 'Kakamega', 'Milimani', 'Kapsabet', 'Kisumu City',
    'Gisambai', 'Kapseret', 'Kitale', 'Vihiga', 'Soy', 'Malava',
    'Naro Moru, Nanyuki', 'Naro Moru', 'Ol Kalou', 'Rumuruti',
    'Nanyuki ', 'Nyahururu', 'Laikipia', 'Chuka', 'Gatunga', 'Siakago',
    'Chogoria', 'Igoji', 'Machakos', 'ndagani', 'Matuu', 'Wote',
    'Karurumo', 'Matinyani', 'Mutomo', 'Kitui', 'Mtito Andei',
    'Syongila', 'Kibwezi', 'Luanda', 'Chavakali', 'Maragoli', 'Kilifi',
    'Malindi', 'Kikambala', 'Mtwapa, Mombasa', 'Kaloleni',
    'Mida Creek', 'Gede', 'Watamu', 'Malindi - Mambrui', 'Gongoni',
    'Mtwapa Creek', 'Mariakani', 'Msambweni', 'Mambrui',
    'Utange-Mombasa ', 'Waa', 'Mombasa Bamburi Beach', 'Bamburi',
    'Off Diani Beach Road', 'Diane', 'Galu Kinondo Beach', 'Shimoni',
    'Wasini Island', 'Mombasa ', 'Mombasa Kenya, Box 42961-80100',
    'Kikuyu', 'Limuru', 'Kaijado', 'Tigoni', 'Juja', 'Olooloitikosh',
    'Tigoni Dam', 'Kiserian, Rift Valley, KE', 'Karen Nairobi',
    'Karen', ' Mombasa Road', 'Githurai', 'Ngong Hills', 'Karen/Hardy',
    'Embakasi', 'Ngenda', 'Magadi', 'Limuru Town', 'Thika', 'Ndenderu',
    'Ruaka Town', 'Kahawa Sukari', 'Underpass', 'Rironi',
    'Banana Hill', 'Nyamira', 'Tatu City', 'Limuru Town.', 'Mnagei',
    'Makutano', 'Lokichar', 'Mount Kenya', 'Nanyuki - Timau',
    'Archers Post', 'Ruiri Town', 'Malili', 'Kathonzweni', 'Kimana',
    'Merrueshi', 'Sultan Hamud', 'Nkubu', "Murang'a", 'Kutus',
    'Gaichanjiru', 'Gitugi', 'Kabati', 'Tuthu', 'Kagio', 'Sagana',
    'Ndakaini', 'Syokimau', 'Jacaranda Kenia ', 'Langata',
    'Rusinga Islands', 'MIrogi', 'Mbita', 'Kendu Bay', 'Homa Bay',
    'Kagan', 'Homa Bay Town', 'Muhuru', 'Nyangweso', 'Rongo', 'Suneka',
    'Sare', 'Migori', 'Sindo', 'Mfangano Island', 'Bungoma', 'Malaba',
    'Siaya', 'Chwele', 'Shianda', 'Mumias', 'Miendo', 'Webuye',
    'Kisian', 'Gucha', 'Kilimani', 'Batians Lane', 'Nyeri', 'Kerugoya',
    'Kiriani', 'Kiganjo', 'Iria-Ini', 'Karatina', 'Embu', 'Kimunye',
    'Kibugu', 'Kirinyaga District', 'Runyenjes Town', 'Runyenjes',
    'Matayos', 'Ugunja', 'Busia', 'Funyula', 'Tarime', 'Kericho',
    'Longisa', 'Usenge', 'Bondo', 'Kendu bay', 'Rusinga East',
    'Asembo', 'Cedar Hill', 'Kisaju', 'South B',
    'Hurlingham kilimani Nairobi', 'Starehe', 'Muchatha', 'Elgon Road',
    'Kawaida', 'Wangige', 'Athi River ', ' Athi River',
    'Kenyatta Road', 'Isinya', 'Kangundo', 'Nairobi ', 'Tuala',
    'Mwala', 'NAIROBI ', ' Vipingo', 'Mtwapa ', 'Casuarina', 'Vipingo',
    'Kilifi County', 'Watamu ', 'Uyombo', 'Ватаму', 'Takaungu',
    'kilifi creek', 'Kenya', 'Takaungu Creek', 'Kilifi ',
    'Kilifi, Watamu ', 'Mayungu', 'NYANDARUA ', 'Aberdare Range']

def _normalise_labels(column, labels):
    """Map each value of ``column`` to the first entry of ``labels`` it contains.

    Matching is a case-insensitive substring test, the same test the previous
    row-by-row loop used. Each distinct value is resolved exactly once against
    its original text, so the outcome no longer depends on row order and
    case-only variants such as 'Nairobi' / 'NAIROBI' cannot swap back and
    forth. The cost is (distinct values x labels) rather than
    (rows x labels x a full-column replace).

    Args:
        column: Series of raw labels.
        labels: Reference spellings, in priority order (first match wins).

    Returns:
        A new Series with every value replaced by its first matching label.
        Values that match nothing, and missing values, are left unchanged.
    """
    lowered_labels = [(label, str(label).lower()) for label in labels]

    def first_match(raw_value):
        haystack = str(raw_value).lower()
        for label, needle in lowered_labels:
            if needle in haystack:
                return label
        return raw_value

    lookup = {raw_value: first_match(raw_value) for raw_value in column.dropna().unique()}
    return column.map(lookup)


df['state'] = _normalise_labels(df['state'], state_list)
df['city'] = _normalise_labels(df['city'], city_list)


In [124]:
# Cast the bathroom count column to float.
df['numberofbathroom'] = df['numberofbathroom'].astype(float)

In [125]:
# Split the frame by dtype: object columns form the categorical view and
# numeric columns the numerical view; keep the column names of each as lists.
categorical_df = df.select_dtypes(include=['object'])
categorical_columns = categorical_df.columns.tolist()

numeric_df = df.select_dtypes(include=np.number)
numeric_columns = numeric_df.columns.tolist()

In [126]:
# Report how many columns fall into each dtype group, and which ones.
print("Count of categorical columns: ", len(categorical_columns))
print(categorical_columns)
print("Count of numerical columns: ", len(numeric_columns))
print(numeric_columns)

Count of categorical columns:  4
['roomType', 'city', 'state', 'bathroomType']
Count of numerical columns:  19
['numberOfGuests', 'maxNights', 'minNights', 'latitude', 'longitude', 'price', 'numberOfBedsAvailable', 'numberOfBedrooms', 'allowsChildren', 'allowsEvents', 'allowsPets', 'allowsSmoking', 'allowsInfants', 'review_count', 'amenities', 'regular_bed', 'relaxing_bed', 'kids_bed', 'numberofbathroom']


In [127]:
# Show the current column labels of df.
df.columns

Index(['numberOfGuests', 'roomType', 'maxNights', 'minNights', 'city', 'state',
       'latitude', 'longitude', 'price', 'numberOfBedsAvailable',
       'numberOfBedrooms', 'allowsChildren', 'allowsEvents', 'allowsPets',
       'allowsSmoking', 'allowsInfants', 'review_count', 'amenities',
       'regular_bed', 'relaxing_bed', 'kids_bed', 'numberofbathroom',
       'bathroomType'],
      dtype='object')

In [152]:
import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler


# Columns routed to the categorical pipeline. The order must match the order
# of the category lists given to OrdinalEncoder.
categorical_cols = ['roomType', 'city', 'state', 'bathroomType']
# Columns routed to the numerical pipeline ('price' is not listed here).
numerical_cols = ['numberOfGuests', 'maxNights', 'minNights', 'latitude', 'longitude', 
'numberOfBedsAvailable', 'numberOfBedrooms', 'allowsChildren', 'allowsEvents', 
'allowsPets', 'allowsSmoking', 'allowsInfants', 'review_count', 'amenities', 'regular_bed', 
'relaxing_bed', 'kids_bed', 'numberofbathroom']

# Category lists passed to OrdinalEncoder; each value is encoded as its position in its list (for city and state this order is arbitrary, not a ranking)
#state_categories = ['Baringo','Bomet','Bungoma','Busia','Elgeyo','Mara','Embu','Garissa','Homa Bay','Isiolo','Kajiado','Kakamega','Kericho','Kiambu','Kilifi','Kirinyaga','Kisii','Kisumu','Kitui','Kwale','Laikipia','Lamu','Machakos','Makueni','Mandera','Marsabit','Meru','Migori','Mombasa','Muranga','Nairobi','Nakuru','Nandi','Narok','Nyamira','Nyandarua','Nyeri','Samburu','Siaya','Taveta','Tana','Tharaka-Nithi','Trans-Nzoia','Turkana','Uasin Gishu','Vihiga','Wajir','West Pokot']
#city_categories = ['Baragoi','Bondo','Bungoma','Busia','Butere','Dadaab','Diani Beach','Eldoret','Emali','Embu','Garissa','Gede','Gem','Hola','Homa Bay','Isiolo','Kitui','Kibwezi','Kajiado','Kakamega','Kakuma','Kapenguria','Kericho','Keroka','Kiambu','Kilifi','Kisii','Kisumu','Kitale','Lamu','Langata','Litein','Lodwar','Lokichoggio','Londiani','Loyangalani','Machakos','Makindu','Malindi','Mandera','Maralal','Marsabit','Meru','Mombasa','Moyale','Mtwapa','Mumias','Muranga','Mutomo','Nairobi','Naivasha','Nakuru','Namanga','Nanyuki','Naro Moru','Narok','Nyahururu','Nyeri','Ruiru','Siaya','Shimoni','Takaungu','Thika','Ugunja','Vihiga','Voi','Wajir','Watamu','Webuye','Wote','Wundanyi']
#'roomType_categories', 'city_categories', 'state_categories', '', 'bathroomType_categories'

# Category lists handed to OrdinalEncoder in cat_pipeline; a value is encoded
# as its index in the corresponding list.
roomType_categories = ['Private room', 'Shared room', 'Entire unit', 'Other']
bathroomType_categories = ['shared', 'private', 'unknown']
# NOTE(review): the state and city lists below appear to repeat state_list and
# city_list from the cleaning cell — confirm they are meant to stay in sync.
state_categories = ['Nairobi County', 'Wilaya ya Kajiado', 'Kajiado County',
    'Kiambu County', 'Nairobi', 'Machakos County', 'Eastern',
    'Central', 'Rift Valley', 'Kenya, Nairobi', 'Narok County',
    'Bomet County', 'Kisii County', 'Nakuru County', 'Homa Bay County',
    'Wilaya ya Narok', 'Nyamira County', 'Kericho County',
    'Wilaya ya Kisii Kati', 'Wilaya ya Nakuru', 'Narok', 'Nakuru',
    'Nyanza', 'Laikipia County', 'Meru County', 'Nyeri County',
    'Wilaya ya Isiolo', 'Samburu County', 'Wilaya ya Laikipia',
    'Laikipia', 'Kirinyaga', 'Nyeri', 'Meru', 'Uasin Gishu County',
    'Elgeyo-Marakwet County', 'Wilaya ya Uasin Gishu', 'Naivasha',
    'Nyahururu', 'Kenya', 'Nakuru ', 'Nyandarua County',
    'Taita-Taveta County', 'Kilimanjaro Region', 'Coast',
    'Mombasa County', 'Kwale', 'Kwale County', 'Wilaya ya Kwale',
    'Ukunda, Kwale County', 'Kilifi County', 'Kwale District',
    'Mombasa', 'kenya', 'Wilaya ya Mombasa', 'coast', 'Kwale ',
    'Diani', 'Kisumu County', 'Kakamega County', 'Annex', 'Kisumu',
    'Kakamega', 'Nandi County', ' Rift Valley', 'Wilaya ya Kisumu',
    'Vihiga County', 'Elgeyo Marakwet', 'Trans-Nzoia County',
    'Uasin-Gishu', 'Tharaka-Nithi County', 'Embu County',
    'Makueni County', 'Kitui County', 'Wilaya ya Makueni',
    'Wilaya ya Kakamega', 'Wilaya ya Vihiga', 'Kilifi',
    'Wilaya ya Kilifi', 'Root Node', 'Galu Beach', 'South coast',
    'Nyali', 'Mtwapa', 'Nyali estate', 'Westlands, Nairobi',
    'Kaijado County', 'Kajiado', 'Langata', 'Kiambu', 'Westlands',
    'Nairobi Area', 'Kikuyu', 'Nyamira', 'Wilaya ya Kiambu',
    'Kajiado North County', 'Kileleshwa', 'West Pokot County',
    'Turkana County', 'Isiolo County', 'Kirinyaga County',
    'Kaunti ya Meru', 'Eastern Region', 'Machakos', 'Muranga County',
    'Kenia', 'Kisaju', 'Nyali estate,', 'Homa Bay', 'Migori County',
    'Bungoma County', 'Busia County', 'Siaya', 'Bungoma',
    'western kenya', 'Mara Region', 'Kendu bay', 'Texas',
    'Nairobi City', 'Nairobi, Kenya', 'Wanyee Cl, Nairobi, Kenya',
    'Ruaka', 'South B', 'Nairobi-Upper hill Area',
    'Wilaya ya Machakos', 'Embakasi East', 'Nairobi ', 'NAIROBI',
    'Coast Province', 'Malindi', 'Provincia costiera', 'kilifi',
    'Distretto di Kilifi', 'KF', 'Watamu', 'Kilifi Province',
    'Kilifi, Watamu']
city_categories = ['Ngong', 'Nairobi', 'Ruaka', 'Pridelands', 'Nairobi City, Kenya',
    'Ongata Rongai', 'Kiambu', 'Athi River', 'Ruiru', 'Nairobi City',
    'Muthaiga North,', 'Kiambu District', 'Kitengela', 'Kiserian',
    'South', 'New Njiru Town', 'Nairobi - Lavington', 'North',
    'Limuru road', 'Mlolongo', 'Westlands', 'Kajiado',
    'Seganani Masai Mara national reserve', 'Bomet', 'Narok',
    'Highway', 'Entasekera', 'Nakuru', 'Kadongo', 'Mau Narok',
    'Lake Elmenteita', 'Ikonge', 'Litein', 'Naivasha ', 'Keroka',
    'Ololaimutiek Village', 'Naivasha', 'Kongoni', 'Gilgil', 'Kisii',
    'Talek', 'Nyanchwa Hill', 'Narok County', 'Aitong', 'Lolgorien',
    'Maasai Mara', 'Oyugis', 'Sekenani', 'Masai Mara', 'Ewaso Ngiro',
    'SEKENANI', 'Silibwet', 'Nanyuki', 'Nchiru', 'Isiolo', 'Dol Dol',
    'Wamba', 'Meru', 'Rukanga, Sagana', 'East', 'Meru District',
    'Timau', 'Maua', 'Eldoret', 'Iten', 'Elmenteita', 'Lake Naivasha',
    'Njoro', 'Eburru', 'Kasuku', 'Nakuru town', 'Voi', 'Mwatate',
    'Taveta', 'Maungu', 'Wundanyi', 'Same', 'Mombasa', 'Diani Beach',
    'Mtwapa', 'Tiwi', 'DIANI BEACH', 'Kwale', 'Ukunda', 'Nyali Beach',
    'Tiwi Beach', 'Galu Beach', 'Nyali Mombasa', 'Nyali',
    'Diani Beach ', 'Diani', 'Diani Beach Road', 'Kisumu',
    'Isukha ICHINA', 'Kakamega', 'Milimani', 'Kapsabet', 'Kisumu City',
    'Gisambai', 'Kapseret', 'Kitale', 'Vihiga', 'Soy', 'Malava',
    'Naro Moru, Nanyuki', 'Naro Moru', 'Ol Kalou', 'Rumuruti',
    'Nanyuki ', 'Nyahururu', 'Laikipia', 'Chuka', 'Gatunga', 'Siakago',
    'Chogoria', 'Igoji', 'Machakos', 'ndagani', 'Matuu', 'Wote',
    'Karurumo', 'Matinyani', 'Mutomo', 'Kitui', 'Mtito Andei',
    'Syongila', 'Kibwezi', 'Luanda', 'Chavakali', 'Maragoli', 'Kilifi',
    'Malindi', 'Kikambala', 'Mtwapa, Mombasa', 'Kaloleni',
    'Mida Creek', 'Gede', 'Watamu', 'Malindi - Mambrui', 'Gongoni',
    'Mtwapa Creek', 'Mariakani', 'Msambweni', 'Mambrui',
    'Utange-Mombasa ', 'Waa', 'Mombasa Bamburi Beach', 'Bamburi',
    'Off Diani Beach Road', 'Diane', 'Galu Kinondo Beach', 'Shimoni',
    'Wasini Island', 'Mombasa ', 'Mombasa Kenya, Box 42961-80100',
    'Kikuyu', 'Limuru', 'Kaijado', 'Tigoni', 'Juja', 'Olooloitikosh',
    'Tigoni Dam', 'Kiserian, Rift Valley, KE', 'Karen Nairobi',
    'Karen', ' Mombasa Road', 'Githurai', 'Ngong Hills', 'Karen/Hardy',
    'Embakasi', 'Ngenda', 'Magadi', 'Limuru Town', 'Thika', 'Ndenderu',
    'Ruaka Town', 'Kahawa Sukari', 'Underpass', 'Rironi',
    'Banana Hill', 'Nyamira', 'Tatu City', 'Limuru Town.', 'Mnagei',
    'Makutano', 'Lokichar', 'Mount Kenya', 'Nanyuki - Timau',
    'Archers Post', 'Ruiri Town', 'Malili', 'Kathonzweni', 'Kimana',
    'Merrueshi', 'Sultan Hamud', 'Nkubu', "Murang'a", 'Kutus',
    'Gaichanjiru', 'Gitugi', 'Kabati', 'Tuthu', 'Kagio', 'Sagana',
    'Ndakaini', 'Syokimau', 'Jacaranda Kenia ', 'Langata',
    'Rusinga Islands', 'MIrogi', 'Mbita', 'Kendu Bay', 'Homa Bay',
    'Kagan', 'Homa Bay Town', 'Muhuru', 'Nyangweso', 'Rongo', 'Suneka',
    'Sare', 'Migori', 'Sindo', 'Mfangano Island', 'Bungoma', 'Malaba',
    'Siaya', 'Chwele', 'Shianda', 'Mumias', 'Miendo', 'Webuye',
    'Kisian', 'Gucha', 'Kilimani', 'Batians Lane', 'Nyeri', 'Kerugoya',
    'Kiriani', 'Kiganjo', 'Iria-Ini', 'Karatina', 'Embu', 'Kimunye',
    'Kibugu', 'Kirinyaga District', 'Runyenjes Town', 'Runyenjes',
    'Matayos', 'Ugunja', 'Busia', 'Funyula', 'Tarime', 'Kericho',
    'Longisa', 'Usenge', 'Bondo', 'Kendu bay', 'Rusinga East',
    'Asembo', 'Cedar Hill', 'Kisaju', 'South B',
    'Hurlingham kilimani Nairobi', 'Starehe', 'Muchatha', 'Elgon Road',
    'Kawaida', 'Wangige', 'Athi River ', ' Athi River',
    'Kenyatta Road', 'Isinya', 'Kangundo', 'Nairobi ', 'Tuala',
    'Mwala', 'NAIROBI ', ' Vipingo', 'Mtwapa ', 'Casuarina', 'Vipingo',
    'Kilifi County', 'Watamu ', 'Uyombo', 'Ватаму', 'Takaungu',
    'kilifi creek', 'Kenya', 'Takaungu Creek', 'Kilifi ',
    'Kilifi, Watamu ', 'Mayungu', 'NYANDARUA ', 'Aberdare Range']



# Numerical pipeline: fill missing values with the median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical pipeline: fill missing values with the most frequent value,
# encode each category as its index in the supplied list, then standardise.
# The category lists are given in the same order as categorical_cols.
# NOTE(review): city and state are nominal, so the integer codes impose an
# arbitrary order; consider one-hot encoding if the models should not see one.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'ordinalencoder',
            OrdinalEncoder(
                categories=[
                    roomType_categories,
                    city_categories,
                    state_categories,
                    bathroomType_categories,
                ],
                # Without this, any value missing from the lists above makes
                # fit/transform raise. Unseen values are encoded as -1, which
                # cannot collide with a list index.
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
        ),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group through its pipeline.
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])

In [153]:
# Load the training set and separate the target from the features.
train_df = pd.read_csv("datafiles/train.csv")

target_column_name = 'price'
# NOTE(review): this rebinds `drop_columns`, which the cleaning cell defines as
# a function; re-running that cell after this one fails until the function is
# redefined. Consider a different name if no later cell depends on this one.
drop_columns = [target_column_name]

y = train_df[target_column_name]
X = train_df.drop(columns=drop_columns)



In [154]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; random_state fixes the shuffle.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [155]:
# Show the feature columns (the target has been removed).
X.columns

Index(['numberOfGuests', 'roomType', 'maxNights', 'minNights', 'city', 'state',
       'latitude', 'longitude', 'numberOfBedsAvailable', 'numberOfBedrooms',
       'allowsChildren', 'allowsEvents', 'allowsPets', 'allowsSmoking',
       'allowsInfants', 'review_count', 'amenities', 'regular_bed',
       'relaxing_bed', 'kids_bed', 'numberofbathroom', 'bathroomType'],
      dtype='object')

In [156]:
# Fit the preprocessor on the training split only, then apply it to both splits.
X_train_values = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()

# NOTE(review): the rebuilt frames get a fresh 0..n-1 index, while y_train and
# y_test keep their original row labels; rows line up by position only.
X_train = pd.DataFrame(X_train_values, columns=feature_names)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names)

In [157]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [158]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression metrics for one set of predictions.

    Args:
        true: Observed target values.
        predicted: Predicted target values, aligned with ``true``.

    Returns:
        A tuple ``(mae, rmse, r2_square)``: mean absolute error, root mean
        squared error and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE is computed once here.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [159]:

# Candidate linear models, all with default hyper-parameters.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet(),
}
trained_model_list = []
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Keep the fitted estimator; this list was declared but never filled before.
    trained_model_list.append(model)

    # Predict on the held-out split and score those predictions.
    y_pred = model.predict(X_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # These metrics come from the test split, so the heading says so.
    print('Model Test Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 188.73879333934198
MAE: 72.5866673790463
R2 score 7.380813668166097


Lasso
Model Training Performance
RMSE: 188.07627103442914
MAE: 70.93749497600128
R2 score 8.029907338205678


Ridge
Model Training Performance
RMSE: 188.7350855063637
MAE: 72.57472353816009
R2 score 7.384452698533561


Elasticnet
Model Training Performance
RMSE: 188.37590497246535
MAE: 69.95839331741885
R2 score 7.736629377986503




In [160]:
# Names of the models evaluated above, in evaluation order.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']