In [1]:
import pandas as pd 

# Load the three input tables from CSV files located in the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Per the head() outputs in this notebook, both `train` and `reviews` carry a `listing_id` column.
reviews = pd.read_csv('reviews.csv')



In [2]:
train.head(2)

Unnamed: 0,name,summary,space,description,experiences_offered,neighborhood_overview,transit,house_rules,picture_url,host_id,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,reviews_per_month,listing_id
0,Attic Room in historic Greenwich,The room does not have a door but its own disc...,Double room available in historic royal Greenw...,The room does not have a door but its own disc...,business,,Parking is restricted in London. Visitors park...,no shoes on carpets no food in attic room no o...,https://a0.muscache.com/im/pictures/20045164/7...,2167992,...,95.0,10.0,10.0,10.0,10.0,10.0,10.0,strict_14_with_grace_period,2.37,9Q1RD6H7IO
1,Lovely Garden Studio Private Access,"Garden studio with private entrance, 5 minutes...","Beautiful studio with a king size bed, a sofa ...","Garden studio with private entrance, 5 minutes...",none,Crouch End is a very hip and friendly neighbou...,Walk across the street to the W3 bus stop and ...,We'd love it if you would remove your shoes at...,https://a0.muscache.com/im/pictures/8d1ddcdc-f...,16148175,...,99.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate,2.97,6XDPZPGLSR


In [3]:
reviews.head(2)

Unnamed: 0,reviewer_id,comments,review_id,listing_id
0,93896,"The flat was bright, comfortable and clean and...",30672,8Z94Z3WMCO
1,97890,We stayed with Adriano and Valerio for a week ...,32236,8Z94Z3WMCO


## EDA - feature engineering, text extraction

In [4]:
# Quick structural overview of the training frame.
print(train.shape)
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
train.info()

# Bucket columns by storage dtype: numeric (int64/float64) vs. object (text).
# Columns of any other dtype are deliberately left out of both lists.
num_cols = [c for c in train.columns if train[c].dtype in ('int64', 'float64')]
text_cols = [c for c in train.columns if train[c].dtype == 'O']

print(num_cols)
print(text_cols)

(55284, 42)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55284 entries, 0 to 55283
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         55270 non-null  object 
 1   summary                      52330 non-null  object 
 2   space                        38403 non-null  object 
 3   description                  53558 non-null  object 
 4   experiences_offered          55284 non-null  object 
 5   neighborhood_overview        35778 non-null  object 
 6   transit                      35477 non-null  object 
 7   house_rules                  31906 non-null  object 
 8   picture_url                  55284 non-null  object 
 9   host_id                      55284 non-null  int64  
 10  host_since                   55173 non-null  object 
 11  host_response_time           37482 non-null  object 
 12  host_response_rate           37482 non-null  object 
 13  host

In [5]:
# Manual grouping of the object-dtype (text) columns by how each group will be treated.
descriptive_cols = ['name', 'summary', 'space', 'description','neighborhood_overview', 'transit', 'house_rules','neighbourhood', 'neighbourhood_cleansed']
boolean_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']
categorical_cols = ['property_type', 'room_type', 'bed_type','experiences_offered','cancellation_policy']
other_cols = ['picture_url','host_since', 'host_response_time', 'host_response_rate','amenities','zipcode','listing_id']

# Sanity check: the groups above must account for every text column.
# The failure message lists the text columns that were not assigned to any group.
grouped_cols = descriptive_cols + boolean_cols + categorical_cols + other_cols
assert len(text_cols) == len(grouped_cols), f"You missed {set(text_cols).difference(set(grouped_cols))}"

In [6]:
feature_df = train.copy()

#### Treatment of boolean columns - aim is to convert them to integers

In [83]:
for c in boolean_cols:
    print(train[c].unique().tolist())

['t', 'f', nan]
['t', 'f', nan]
['f', 't', nan]


In [84]:
# Encoding applied to the 't'/'f' flag columns.
binary_dict = {'t':1,'f':0}

def to_binary(df, cols=None):
    """Encode 't'/'f' flag columns of ``df`` as 1/0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the flag columns. It is modified in place.
    cols : iterable of str, optional
        Columns to encode. When omitted, the notebook-level ``boolean_cols``
        list is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, so ``df = to_binary(df)``
        and a bare ``to_binary(df)`` both work.

    Notes
    -----
    Values that are not keys of ``binary_dict`` (including missing values)
    become NaN. Columns that are already numeric are left untouched, which
    makes the function safe to call twice on the same frame (e.g. when a
    cell is re-run): mapping 1/0 through ``binary_dict`` again would
    otherwise turn the whole column into NaN.
    """
    if cols is None:
        cols = boolean_cols
    for c in cols:
        if pd.api.types.is_numeric_dtype(df[c]):
            # Already encoded (or never textual) - re-mapping would wipe it.
            continue
        df[c] = df[c].map(binary_dict)
    return df

#### Treatment of categorical columns - aim is to find an ordinal pattern


In [144]:
for c in categorical_cols:
    print(c+" : "+",".join(train[c].unique().tolist()))

property_type : Townhouse,Guesthouse,Apartment,House,Condominium,Loft,Serviced apartment,Cabin,Other,Guest suite,Bed and breakfast,Cottage,Tiny house,Houseboat,Parking Space,Bungalow,Farm stay,Hotel,Camper/RV,Boutique hotel,Villa,Boat,Hostel,Yurt,Aparthotel,Hut,Earth house,Chalet,Plane,Barn,Treehouse,Island,Bus,Dome house,Windmill,Campsite,Tent,Lighthouse
room_type : Private room,Entire home/apt,Shared room,Hotel room
bed_type : Pull-out Sofa,Real Bed,Futon,Couch,Airbed
experiences_offered : business,none,romantic,family,social
cancellation_policy : strict_14_with_grace_period,moderate,flexible,super_strict_60,super_strict_30,strict,luxury_moderate,luxury_super_strict_95


In [18]:
train.house_rules.value_counts()

- The lead guest will be required to sign an Occupation Agreement prior to arrival. This is for insurance purposes to say that the guests will look after the property and not smoke or have parties - Please note that the lead guest must be over 25 years old - Normal check in time is between 3pm and 9pm. Any time outside of this period may incur an additional fee - No pets, no parties or smoking, please                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [14]:
# Ordinal encoding for room_type; the values order the categories as
# Shared room (0) < Hotel room (1) < Private room (2) < Entire home/apt (3).
room_type_dict = {'Private room':2,'Entire home/apt':3,'Shared room':0,'Hotel room':1}
# TODO: no mapping has been defined for bed_type yet - this dict is still empty.
bed_type_dict = {}

55284

#### Treatment of other columns - aim is to create as many meaningful features as possible

In [26]:
### host related
import re
for c in ['host_response_time', 'host_response_rate']:
    print(train[c].unique())

# train.host_response_time.value_counts()
# Binary encoding of host_response_time: 1 for the fastest bucket, 0 for all
# slower ones. Each category needs its own key - a list cannot be used as a
# dict key (lists are unhashable and raise TypeError).
host_response_time_dict = {
    'within an hour': 1,
    'within a few hours': 0,
    'within a day': 0,
    'a few days or more': 0,
}

def encode_host_response_rate(df, threshold=90):
    """Flag hosts whose response rate is at or above ``threshold`` percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``host_response_rate`` column holding strings such as
        ``'95%'``.
    threshold : int, default 90
        Minimum percentage (inclusive) that is encoded as 1.

    Returns
    -------
    list
        One entry per row, in row order: 1 if the first run of digits in the
        value is >= ``threshold``, 0 if it is lower. Values that cannot be
        parsed (missing values, strings without digits) are passed through
        unchanged.
    """
    encoded = []
    for v in df.host_response_rate.tolist():
        try:
            pct = int(re.search(r'\d+', v).group(0))
        except (TypeError, AttributeError):
            # TypeError: v is not a string (e.g. NaN for a missing rate).
            # AttributeError: the string contains no digits, so search() gave None.
            encoded.append(v)
            continue
        encoded.append(1 if pct >= threshold else 0)
    return encoded


['within an hour' 'within a few hours' nan 'within a day'
 'a few days or more']
['100%' nan '90%' '70%' '86%' '89%' '95%' '99%' '94%' '96%' '0%' '71%'
 '80%' '98%' '75%' '93%' '97%' '92%' '83%' '91%' '67%' '40%' '82%' '79%'
 '55%' '88%' '50%' '81%' '57%' '56%' '47%' '60%' '85%' '84%' '43%' '29%'
 '74%' '73%' '20%' '87%' '44%' '78%' '38%' '33%' '30%' '25%' '63%' '69%'
 '54%' '58%' '64%' '68%' '46%' '72%' '31%' '65%' '10%' '23%' '41%' '77%'
 '17%' '13%' '61%' '76%' '62%' '14%' '11%' '22%' '27%' '42%']


In [48]:
### host_since

import datetime

def to_recency(df, reference_year=2022):
    """Convert ``host_since`` dates into the number of years elapsed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``host_since`` column of ``'YYYY-MM-DD'`` strings.
    reference_year : int, default 2022
        Year the sign-up year is subtracted from.

    Returns
    -------
    list
        One entry per row, in row order: ``reference_year - signup_year`` for
        parseable dates. Values that cannot be parsed (missing values,
        strings in another format) are passed through unchanged.
    """
    years_hosting = []
    for d in df.host_since.tolist():
        try:
            signup_year = datetime.datetime.strptime(d, '%Y-%m-%d').year
        except (TypeError, ValueError):
            # TypeError: d is not a string (e.g. NaN for a missing date).
            # ValueError: the string does not match the expected date format.
            years_hosting.append(d)
            continue
        years_hosting.append(reference_year - signup_year)
    return years_hosting



In [71]:
### amenities
from nltk.probability import FreqDist

def has_top_five_amenities(df, top_n=5):
    """Flag listings that offer every one of the most common amenities.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column holding strings of the form
        ``'{TV,Wifi,"Air conditioning"}'``.
    top_n : int, default 5
        How many of the most frequent amenities (counted over ``df`` itself)
        a listing must offer to be flagged.

    Returns
    -------
    list of int
        One entry per row, in row order: 1 if the listing has all ``top_n``
        most common amenities, otherwise 0. Rows with a missing or empty
        amenities value get 0.
    """
    # Local import keeps the function self-contained; Counter offers the same
    # most_common() API that was previously taken from nltk's FreqDist.
    from collections import Counter

    def _parse(raw):
        # Split one raw amenities string into a list of clean amenity names.
        if not isinstance(raw, str):
            return []  # missing value (NaN/None)
        cleaned = raw.replace('{', '').replace('}', '').replace('"', '')
        return [item.strip() for item in cleaned.split(',') if item.strip()]

    parsed = [_parse(v) for v in df['amenities'].tolist()]
    counts = Counter(item for items in parsed for item in items)
    top = {item for item, _ in counts.most_common(top_n)}
    if not top:
        # No amenities anywhere: nothing can qualify.
        return [0] * len(parsed)
    return [1 if top.issubset(items) else 0 for items in parsed]


#### Treatment of descriptive columns

In [75]:
list(descriptive_cols)

['name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'transit',
 'house_rules',
 'neighbourhood',
 'neighbourhood_cleansed',
 'cancellation_policy']

In [None]:
import numpy as np

def is_transit_detailed(df, min_words=3):
    """Flag listings whose transit description is longer than ``min_words`` words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``transit`` column of free-text strings (may contain
        missing values).
    min_words : int, default 3
        A description must have strictly more than this many
        whitespace-separated words to be flagged.

    Returns
    -------
    list of int
        One entry per row, in row order: 1 if the description has more than
        ``min_words`` words, otherwise 0 (including missing values).
    """
    # `and` short-circuits, so .split() is never called on a missing value
    # (the bitwise `&` evaluates both sides and fails on NaN).
    return [
        1 if isinstance(text, str) and len(text.split()) > min_words else 0
        for text in df.transit.tolist()
    ]



In [150]:
train.space[1]

'Beautiful studio with a king size bed, a sofa & coffee table.  Little kitchen area, equipped with a toaster, kettle, fridge, and cups, bowls, cutlery plates & biscuits! Light, self-service breakfast provided. Nice lighting and art, and a place for your suitcase and to hang your clothes. Blackout curtains for peaceful sleep.'

In [10]:
train.description[1]

Garden studio with private entrance, 5 minutes from Crouch End & its many shops & restaurants. All modern amenities - free high speed wifi, FreeviewTV, High pressure shower, comfy King bed, toaster/kettle/fridge. Lots of room and very quiet! Beautiful studio with a king size bed, a sofa & coffee table.  Little kitchen area, equipped with a toaster, kettle, fridge, and cups, bowls, cutlery plates & biscuits! Light, self-service breakfast provided. Nice lighting and art, and a place for your suitcase and to hang your clothes. Blackout curtains for peaceful sleep. Once you have checked in to collect keys, you will have private access to your studio via a side entrance gate to our garden. We will be here to greet you and get you set up in your temporary home.  Happy to recommend local eateries and provide transport tips. Crouch End is a very hip and friendly neighbourhood.  It is filled with endless options for eating out and lots of lovely shops! Great Vietnamese restaurant Khoai Cafe, is

In [126]:
train.summary[1]

'Garden studio with private entrance, 5 minutes from Crouch End & its many shops & restaurants. All modern amenities - free high speed wifi, FreeviewTV, High pressure shower, comfy King bed, toaster/kettle/fridge. Lots of room and very quiet!'

In [127]:
train.name[1]

'Lovely Garden Studio Private Access'

In [130]:
train.neighborhood_overview[1]

"Crouch End is a very hip and friendly neighbourhood.  It is filled with endless options for eating out and lots of lovely shops! Great Vietnamese restaurant Khoai Cafe, is a favourite, as is Irvin (Italian & Scottish Tapas bar!) run by the former drummer of Lloyd Cole & the Commotions. New, amazing  ice cream shop La Gelatiera will have you licking your lips and coming back for more! - but you might have to queue! Nearby Priory Road (with a lovely leafy Park just 1 min walk away) The Crouch End Broadway and surrounding streets offer a good selection of shops, trendy cafés and restaurants to choose from. Banner’s is a particularly favoured American-style restaurant on Park Road, with a dimly lit dining area and cool choice of music. The King’s Head is a popular gastro pub with a downstairs comedy club, a popular choice with local residents. Owner's favourites: The Haberdashery is a lovely cafe on Middle Lane (5 min walk, reminiscent of NYC's Greenwich Village, which serves Italian-styl

In [7]:
temp = ' '.join(train.description.dropna().tolist())
len(temp.split('.'))

403585

In [15]:
train.neighborhood_overview.isnull().sum()

19506

In [16]:
train[train.description=='']

Unnamed: 0,name,summary,space,description,experiences_offered,neighborhood_overview,transit,house_rules,picture_url,host_id,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,reviews_per_month,listing_id
