In [1]:
import pandas as pd 

# Load the three input tables (train listings, test listings, guest reviews)
# from UTF-8 encoded CSV files in the current working directory.
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')
reviews = pd.read_csv('reviews.csv', encoding='utf-8')



In [2]:
# Number of training rows without a price value.
train['price'].isna().sum()

0

In [3]:
# Preview the first two training listings to see the column layout.
train.head(2)

Unnamed: 0,name,summary,space,description,experiences_offered,neighborhood_overview,transit,house_rules,picture_url,host_id,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,reviews_per_month,listing_id
0,Attic Room in historic Greenwich,The room does not have a door but its own disc...,Double room available in historic royal Greenw...,The room does not have a door but its own disc...,business,,Parking is restricted in London. Visitors park...,no shoes on carpets no food in attic room no o...,https://a0.muscache.com/im/pictures/20045164/7...,2167992,...,95.0,10.0,10.0,10.0,10.0,10.0,10.0,strict_14_with_grace_period,2.37,9Q1RD6H7IO
1,Lovely Garden Studio Private Access,"Garden studio with private entrance, 5 minutes...","Beautiful studio with a king size bed, a sofa ...","Garden studio with private entrance, 5 minutes...",none,Crouch End is a very hip and friendly neighbou...,Walk across the street to the W3 bus stop and ...,We'd love it if you would remove your shoes at...,https://a0.muscache.com/im/pictures/8d1ddcdc-f...,16148175,...,99.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate,2.97,6XDPZPGLSR


In [4]:
# Preview the first two review rows to see the column layout.
reviews.head(2)

Unnamed: 0,reviewer_id,comments,review_id,listing_id
0,93896,"The flat was bright, comfortable and clean and...",30672,8Z94Z3WMCO
1,97890,We stayed with Adriano and Valerio for a week ...,32236,8Z94Z3WMCO


## EDA - feature engineering, text extraction

In [5]:
# Overview of the training frame: dimensions, then the per-column dtype and
# non-null report.
print(train.shape)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
train.info()

# Partition the columns by dtype. Columns that are neither int64/float64 nor
# object are deliberately left out of both lists.
num_cols = [c for c in train.columns if train[c].dtypes in ['int64', 'float64']]
text_cols = [c for c in train.columns if train[c].dtypes == 'O']
print(num_cols)
print(text_cols)

(55284, 42)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55284 entries, 0 to 55283
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         55270 non-null  object 
 1   summary                      52330 non-null  object 
 2   space                        38403 non-null  object 
 3   description                  53558 non-null  object 
 4   experiences_offered          55284 non-null  object 
 5   neighborhood_overview        35778 non-null  object 
 6   transit                      35477 non-null  object 
 7   house_rules                  31906 non-null  object 
 8   picture_url                  55284 non-null  object 
 9   host_id                      55284 non-null  int64  
 10  host_since                   55173 non-null  object 
 11  host_response_time           37482 non-null  object 
 12  host_response_rate           37482 non-null  object 
 13  host

In [6]:
# Manual grouping of the object-dtype columns by how they will be treated.
descriptive_cols = ['name', 'summary', 'space', 'description','neighborhood_overview', 'transit', 'house_rules','neighbourhood', 'neighbourhood_cleansed']
boolean_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']
categorical_cols = ['property_type', 'room_type', 'bed_type','experiences_offered','cancellation_policy']
other_cols = ['picture_url','host_since', 'host_response_time', 'host_response_rate','amenities','zipcode','listing_id']

# Sanity check: every object column must be assigned to one of the groups.
# The message previously referenced an undefined name (`binary_cols`), so a
# failing check raised NameError instead of reporting the missing columns.
grouped_cols = descriptive_cols + boolean_cols + categorical_cols + other_cols
assert len(text_cols) == len(grouped_cols), f"You missed {set(text_cols).difference(set(grouped_cols))}"

In [7]:
# Work on copies so the originally loaded `train` / `test` frames stay unmodified.
feature_df = train.copy()
test_df = test.copy()

### Fix Text Columns

In [8]:
# Listing identifier column.
# NOTE(review): id_col is not referenced in the cells visible here — confirm it is used later.
id_col = ['listing_id']
# Columns that prepare_text_data drops after the encoding steps have run.
cols_to_remove = ['picture_url','zipcode','neighbourhood','neighbourhood_cleansed','transit','house_rules','experiences_offered']

#### Treatment of boolean columns - aim is to convert them to integers

In [9]:
def encode_binary_var(df):
    """Recode the 't'/'f' flag columns listed in ``boolean_cols`` as 1/0.

    The frame is modified in place and also returned. Values other than
    't' or 'f' (including missing values) become NaN, because ``Series.map``
    yields NaN for keys that are absent from the mapping.
    """
    flag_codes = dict(t=1, f=0)
    for column in boolean_cols:
        recoded = df[column].map(flag_codes)
        df[column] = recoded
    return df

#### Treatment of categorical columns - aim is to find an ordinal pattern


In [10]:
# List the distinct labels of every categorical column, one line per column.
for col in categorical_cols:
    labels = train[col].unique().tolist()
    print(f"{col} : {','.join(labels)}")

property_type : Townhouse,Guesthouse,Apartment,House,Condominium,Loft,Serviced apartment,Cabin,Other,Guest suite,Bed and breakfast,Cottage,Tiny house,Houseboat,Parking Space,Bungalow,Farm stay,Hotel,Camper/RV,Boutique hotel,Villa,Boat,Hostel,Yurt,Aparthotel,Hut,Earth house,Chalet,Plane,Barn,Treehouse,Island,Bus,Dome house,Windmill,Campsite,Tent,Lighthouse
room_type : Private room,Entire home/apt,Shared room,Hotel room
bed_type : Pull-out Sofa,Real Bed,Futon,Couch,Airbed
experiences_offered : business,none,romantic,family,social
cancellation_policy : strict_14_with_grace_period,moderate,flexible,super_strict_60,super_strict_30,strict,luxury_moderate,luxury_super_strict_95


In [11]:
#### Encode Ordinal - Interpretable

# Hand-assigned ordinal codes for room_type.
room_type_dict = {'Private room':2,'Entire home/apt':3,'Shared room':0,'Hotel room':1}

# Hand-assigned ordinal codes for cancellation_policy, grouped by strictness:
# super-strict policies -> 0, strict policies -> 1, moderate/flexible -> 2.
cancellation_policy_dict = {'strict_14_with_grace_period':1,
                        'moderate':2,
                        'flexible':2,
                        'super_strict_60':0,
                        'super_strict_30':0,
                        'strict':1,
                        'luxury_moderate':2,
                        'luxury_super_strict_95':0
                        }


#### Encode Nominal - Relative Frequency Encoded
#### Combining Them

def encode_categorical_var(df, columns=None):
    """Encode the categorical columns of ``df`` in place and return the frame.

    ``room_type`` and ``cancellation_policy`` are replaced by their ordinal
    codes from ``room_type_dict`` / ``cancellation_policy_dict``. Every other
    column is replaced by the relative frequency of each label, computed on
    ``df`` itself (so train and test frames get their own frequencies).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode; modified in place.
    columns : list of str, optional
        Columns to encode. Defaults to the module-level ``categorical_cols``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the listed columns encoded. Labels missing from
        an ordinal mapping, and missing values, become NaN.
    """
    ordinal_maps = {
        'room_type': room_type_dict,
        'cancellation_policy': cancellation_policy_dict,
    }
    if columns is None:
        columns = categorical_cols
    for c in columns:
        if c in ordinal_maps:
            # Map each ordinal column exactly once with its own dictionary.
            # The earlier version mapped both ordinal columns on every hit, so
            # the second hit re-mapped the integer codes and left only NaN.
            df[c] = df[c].map(ordinal_maps[c])
        else:
            # value_counts() ignores NaN by default, so the shares are relative
            # to the non-missing rows and missing values stay NaN after map().
            df[c] = df[c].map(df[c].value_counts(normalize=True).to_dict())
    return df


#### Treatment of other columns - aim is to create as many meaningful features as possible

In [12]:
# Show one raw amenities value with the braces and double quotes stripped.
train.loc[1, 'amenities'].translate(str.maketrans('', '', '{}"'))

'TV,Wifi,Free parking on premises,Breakfast,Free street parking,Heating,First aid kit,Fire extinguisher,Essentials,Shampoo,Hangers,Hair dryer,Iron,Laptop friendly workspace,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50,Hot water,Coffee maker,Refrigerator,Dishes and silverware,Host greets you'

In [13]:
### amenities

from nltk.probability import FreqDist

def has_top_five_amenities(df):
    """Flag listings that offer all of the five most common amenities.

    The five most frequent amenity names are determined from ``df`` itself.
    A listing gets 1 when its ``amenities`` value contains every one of them
    and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column of strings such as
        ``'{TV,Wifi,"Hot water"}'``. Non-string values (e.g. NaN) are treated
        as "no amenities".

    Returns
    -------
    list of int
        One 0/1 flag per row, in row order.
    """
    # Function-scope import: Counter.most_common is the method the previous
    # FreqDist-based code relied on, available from the standard library.
    from collections import Counter

    def _split(raw):
        # Strip the set braces and quotes, then split into amenity names.
        # Empty tokens (from '{}' or stray commas) are not amenities.
        if not isinstance(raw, str):
            return set()
        cleaned = raw.replace('{', '').replace('}', '').replace('"', '')
        return {item for item in cleaned.split(',') if item}

    amenities = [_split(raw) for raw in df['amenities'].tolist()]
    counts = Counter(item for items in amenities for item in items)
    top_five = [item for (item, _) in counts.most_common(5)]
    print(len(amenities))
    # The previous check, all([x for x in top_five if x in v]), only filtered
    # the list and was therefore true for nearly every listing; test the
    # membership of each top amenity instead.
    return [
        1 if top_five and all(item in items for item in top_five) else 0
        for items in amenities
    ]


In [14]:
import datetime

def encode_host_since(df, reference_year=2022):
    """Convert ``host_since`` dates into the number of years a host has been active.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``host_since`` column holding 'YYYY-MM-DD' strings.
    reference_year : int, optional
        Year the tenure is measured against (default 2022, the value that was
        previously hard-coded).

    Returns
    -------
    list
        ``reference_year - year(host_since)`` per row; NaN where the value is
        missing or is not a valid 'YYYY-MM-DD' string.
    """
    temp = []
    for d in df.host_since.tolist():
        try:
            year = reference_year - datetime.datetime.strptime(d, '%Y-%m-%d').year
        except (TypeError, ValueError):
            # TypeError: missing value (NaN/None); ValueError: malformed date.
            # Emit NaN in both cases so no raw string ends up in a numeric
            # feature, and so unrelated errors are no longer swallowed.
            year = float('nan')
        temp.append(year)
    return temp


def encode_host_response_time(df):
    """Binarise ``host_response_time``.

    Returns a list with 1 for 'within an hour', NaN for missing values and
    0 for every other value, in row order.
    """
    def _flag(value):
        if value == 'within an hour':
            return 1
        return np.nan if pd.isna(value) else 0

    return [_flag(value) for value in df.host_response_time.tolist()]



def encode_host_response_rate(df, threshold=90):
    """Flag hosts whose response rate reaches ``threshold`` percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``host_response_rate`` column of strings such as '95%'.
    threshold : int, optional
        Minimum percentage that counts as a high response rate (default 90,
        the value that was previously hard-coded).

    Returns
    -------
    list
        1 when the first run of digits in the value is >= ``threshold``,
        0 when it is lower, NaN when the value is missing or has no digits.
    """
    temp = []
    for v in df.host_response_rate.tolist():
        # Only strings can carry a percentage; anything else counts as missing.
        # This replaces a bare `except`, which also hid unrelated failures
        # (e.g. a NameError if `re` had not been imported yet) and passed
        # digit-less strings through unchanged.
        match = re.search(r'\d+', v) if isinstance(v, str) else None
        if match is None:
            temp.append(float('nan'))
        elif int(match.group(0)) >= threshold:
            temp.append(1)
        else:
            temp.append(0)
    return temp




def encode_other_var(df):
    """Replace the host/amenity columns with their engineered encodings.

    Each listed column is overwritten in place with the output of its encoder
    (which receives the whole frame); the frame is then returned.
    """
    encoders = (
        ('host_since', encode_host_since),
        ('host_response_rate', encode_host_response_rate),
        ('host_response_time', encode_host_response_time),
        ('amenities', has_top_five_amenities),
    )
    for column, encoder in encoders:
        df[column] = encoder(df)
    return df





##### Treatment of Descriptive Columns

In [15]:
# Display the names of the free-text columns handled in this section.
[*descriptive_cols]

['name',
 'summary',
 'space',
 'description',
 'neighborhood_overview',
 'transit',
 'house_rules',
 'neighbourhood',
 'neighbourhood_cleansed']

In [16]:
import numpy as np

def is_transit_detailed(df):
    """Flag listings whose ``transit`` text has more than three words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``transit`` column of free text (may contain NaN).

    Returns
    -------
    list of int
        1 when the value is a string with more than three whitespace-separated
        words, otherwise 0 (missing values included).
    """
    # `and` short-circuits, so split() is never called on a missing value.
    # The previous `pd.notnull(x) & (...)` evaluated both sides and raised
    # AttributeError for every NaN in the column.
    return [
        1 if isinstance(text, str) and len(text.split()) > 3 else 0
        for text in df.transit.tolist()
    ]



In [17]:
# Example `space` text (listing with index label 1).
train.loc[1, 'space']

'Beautiful studio with a king size bed, a sofa & coffee table.  Little kitchen area, equipped with a toaster, kettle, fridge, and cups, bowls, cutlery plates & biscuits! Light, self-service breakfast provided. Nice lighting and art, and a place for your suitcase and to hang your clothes. Blackout curtains for peaceful sleep.'

In [18]:
# Example `description` text (listing with index label 1).
train.loc[1, 'description']

'Garden studio with private entrance, 5 minutes from Crouch End & its many shops & restaurants. All modern amenities - free high speed wifi, FreeviewTV, High pressure shower, comfy King bed, toaster/kettle/fridge. Lots of room and very quiet! Beautiful studio with a king size bed, a sofa & coffee table.  Little kitchen area, equipped with a toaster, kettle, fridge, and cups, bowls, cutlery plates & biscuits! Light, self-service breakfast provided. Nice lighting and art, and a place for your suitcase and to hang your clothes. Blackout curtains for peaceful sleep. Once you have checked in to collect keys, you will have private access to your studio via a side entrance gate to our garden. We will be here to greet you and get you set up in your temporary home.  Happy to recommend local eateries and provide transport tips. Crouch End is a very hip and friendly neighbourhood.  It is filled with endless options for eating out and lots of lovely shops! Great Vietnamese restaurant Khoai Cafe, i

In [19]:
# Example `summary` text (listing with index label 1).
train.loc[1, 'summary']

'Garden studio with private entrance, 5 minutes from Crouch End & its many shops & restaurants. All modern amenities - free high speed wifi, FreeviewTV, High pressure shower, comfy King bed, toaster/kettle/fridge. Lots of room and very quiet!'

In [20]:
# Example `name` text (listing with index label 1).
train.loc[1, 'name']

'Lovely Garden Studio Private Access'

In [21]:
# Example `neighborhood_overview` text (listing with index label 1).
train.loc[1, 'neighborhood_overview']

"Crouch End is a very hip and friendly neighbourhood.  It is filled with endless options for eating out and lots of lovely shops! Great Vietnamese restaurant Khoai Cafe, is a favourite, as is Irvin (Italian & Scottish Tapas bar!) run by the former drummer of Lloyd Cole & the Commotions. New, amazing  ice cream shop La Gelatiera will have you licking your lips and coming back for more! - but you might have to queue! Nearby Priory Road (with a lovely leafy Park just 1 min walk away) The Crouch End Broadway and surrounding streets offer a good selection of shops, trendy cafés and restaurants to choose from. Banner’s is a particularly favoured American-style restaurant on Park Road, with a dimly lit dining area and cool choice of music. The King’s Head is a popular gastro pub with a downstairs comedy club, a popular choice with local residents. Owner's favourites: The Haberdashery is a lovely cafe on Middle Lane (5 min walk, reminiscent of NYC's Greenwich Village, which serves Italian-styl

In [22]:
# Rough sentence count: number of pieces obtained by cutting the concatenated
# non-missing descriptions at every full stop.
temp = ' '.join(train['description'].dropna())
temp.count('.') + 1

403585

In [23]:
# Percentage of missing values in each descriptive column.
train[descriptive_cols].isna().sum().div(len(train)).mul(100)


name                       0.025324
summary                    5.343318
space                     30.535055
description                3.122061
neighborhood_overview     35.283265
transit                   35.827726
house_rules               42.287099
neighbourhood              0.265900
neighbourhood_cleansed     0.000000
dtype: float64

In [24]:
# Listings where `space` is missing but `summary` is present, restricted to
# the other free-text fields.
tmp = train.loc[
    train['space'].isna() & train['summary'].notna(),
    ['name', 'description', 'summary', 'neighborhood_overview'],
]
# Print the fields of the second such listing, separated by blank lines.
print(
    *(tmp[field].iloc[1] for field in ('description', 'summary', 'name', 'neighborhood_overview')),
    sep='\n\n',
)


A perfectly situated, light-filled, modern flat is located, just off the Kings Road, overlooking Duke of York square. The recently refurbished flat is comprised of a spacious double bedroom and a living room with an open kitchen and a modern bathroom (with a bath and power shower).  The flat has plenty of storage, with a wall of floor to ceiling wardrobes providing plenty of hanging and shelving space - Sky TV and Internet included - 10 night minimum stay. On the King's Road, minutes on foot from Harrods, Knightsbridge, Sloane Street, you are in shoppers heaven with endless wonderful restaurants and cafes right on your doorstep.  Sloane Square and South Kensington tubes are just five minutes walk. Buckingham Palace, several museums and art galleries are also in walking distance so this makes the perfect base to explore all the wonderful delights that London has to offer. THE LOCATION: * 5 min walk to Sloane Square station  * Luxurious shops, supermarkets, leisure centres and bars/resta

In [25]:
import re
import nltk
import string
import gensim
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# Shared NLP resources used by prepare_cleaned_text. Both rely on NLTK corpora
# being installed (see the commented-out nltk.download calls above).
lemmatizer = nltk.stem.WordNetLemmatizer()
# NOTE(review): presumably a plain list, which makes every `t not in stopwords`
# test a linear scan — consider a set if nothing else needs list behaviour.
stopwords = nltk.corpus.stopwords.words('english')

def prepare_cleaned_text(text):
    """Tokenise and lemmatise one text value.

    Missing values are returned unchanged. Otherwise the text is lower-cased,
    tokenised with gensim's ``simple_preprocess``, stripped of the tokens
    found in ``stopwords`` and lemmatised; the resulting list of tokens is
    returned.
    """
    if not pd.notna(text):
        return text
    tokens = gensim.utils.simple_preprocess(text.lower())
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords]

def preprocess_descriptive_cols(df):
    """Tokenise the free-text columns and drop ``description``.

    ``summary``, ``neighborhood_overview``, ``space`` and ``name`` are
    replaced in place by the output of ``prepare_cleaned_text`` (a token list,
    or the original missing value). ``description`` is then dropped and the
    resulting frame is returned.
    """
    # Apply the cleaner to each column directly. The previous row-wise
    # DataFrame.apply(axis=1) built a Series for every row of the whole frame,
    # once per column, only to read a single field from it.
    for column in ('summary', 'neighborhood_overview', 'space', 'name'):
        df[column] = df[column].apply(prepare_cleaned_text)
    df = df.drop(columns=['description'])
    return df


In [26]:
#### Combine all the text features

def prepare_text_data(df):
    """Run every text-column treatment in order and drop the unused columns.

    Order: boolean flags, categorical encodings, host/amenity features,
    free-text tokenisation, then removal of ``cols_to_remove``.
    """
    return (
        df.pipe(encode_binary_var)
        .pipe(encode_categorical_var)
        .pipe(encode_other_var)
        .pipe(preprocess_descriptive_cols)
        .drop(columns=cols_to_remove)
    )

### Fix Numerics

In [27]:
# Split the numeric columns into: columns to discard, the target, the review
# metrics, and everything that remains.
cols_to_drop = ['host_id', 'latitude', 'longitude']
label_col = ['price']
review_cols = list(filter(lambda col: 'review' in col, num_cols))
remaining_cols = [
    col for col in num_cols
    if col not in cols_to_drop and col not in label_col and col not in review_cols
]
print(remaining_cols)

['host_total_listings_count', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included']


In [28]:
# Numeric columns whose name contains "review", and their first ten rows.
review_cols = list(filter(lambda col: 'review' in col, num_cols))
train.loc[:, review_cols].head(10)

Unnamed: 0,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,95.0,10.0,10.0,10.0,10.0,10.0,10.0,2.37
1,99.0,10.0,10.0,10.0,10.0,10.0,10.0,2.97
2,84.0,9.0,9.0,10.0,10.0,9.0,8.0,1.73
3,,,,,,,,
4,100.0,10.0,10.0,10.0,10.0,10.0,10.0,2.32
5,70.0,10.0,5.0,10.0,10.0,9.0,8.0,0.23
6,,,,,,,,0.47
7,,,,,,,,
8,,,,,,,,
9,96.0,10.0,10.0,10.0,10.0,9.0,10.0,2.31


In [29]:
# Missing-value count for each of the remaining numeric columns.
train.loc[:, remaining_cols].isna().sum()


host_total_listings_count    111
accommodates                   0
bathrooms                     70
bedrooms                      62
beds                         262
guests_included                0
dtype: int64

In [30]:
from statistics import mean, median, mode

def preprocess_review_scores(df, review_cols=None):
    """Impute missing review metrics and return the frame.

    ``reviews_per_month`` is filled with the column mean; every other review
    column is filled with the column median. Statistics are computed on ``df``
    itself, and the columns are overwritten in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the review columns.
    review_cols : list of str, optional
        Columns to impute. Defaults to the entries of the module-level
        ``num_cols`` whose name contains 'review'.

    Returns
    -------
    pandas.DataFrame
        The same frame with the review columns imputed. A column with no
        observed value at all is left untouched.
    """
    if review_cols is None:
        review_cols = [c for c in num_cols if 'review' in c]
    for c in review_cols:
        observed = df[c].dropna()
        if observed.empty:
            # Nothing to derive a fill value from (statistics.mean/median
            # would raise StatisticsError here).
            continue
        imputed = observed.mean() if c == 'reviews_per_month' else observed.median()
        # Assign the result back. `df[c].fillna(..., inplace=True)` operates on
        # a temporary column object and is silently lost when pandas
        # Copy-on-Write is active.
        df[c] = df[c].fillna(imputed)
    return df

def prepare_numeric_data(df):
    """Clean the numeric columns of ``df``.

    Drops ``cols_to_drop`` from the passed frame in place, imputes the review
    columns via ``preprocess_review_scores`` and fills the missing values of
    each column in ``remaining_cols`` with that column's median. Returns the
    frame.
    """
    df.drop(columns=cols_to_drop, inplace=True)
    df = preprocess_review_scores(df)
    for column in remaining_cols:
        fill_value = median(df[column].dropna())
        df[column] = df[column].fillna(fill_value)
    return df

    


### Combine all features

In [42]:
# Start again from fresh copies of the loaded frames before running the full pipeline.
feature_df = train.copy()
test_df = test.copy()

In [44]:
def preprocess(df):
    """Full feature pipeline: numeric cleaning, then text/categorical encoding.

    ``room_type`` and ``cancellation_policy`` are dropped from the result.
    """
    return (
        prepare_numeric_data(df)
        .pipe(prepare_text_data)
        .drop(columns=['room_type', 'cancellation_policy'])
    )


# Run the pipeline on both frames and tag each row with its origin.
feature_df = preprocess(feature_df).assign(flag='train')
test_df = preprocess(test_df).assign(flag='test')


55284
29769


In [46]:
# Check the dtypes and non-null counts of the processed training frame.
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55284 entries, 0 to 55283
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         55270 non-null  object 
 1   summary                      52330 non-null  object 
 2   space                        38403 non-null  object 
 3   neighborhood_overview        35778 non-null  object 
 4   host_since                   55173 non-null  float64
 5   host_response_time           37482 non-null  float64
 6   host_response_rate           37482 non-null  float64
 7   host_is_superhost            55173 non-null  float64
 8   host_total_listings_count    55284 non-null  float64
 9   host_has_profile_pic         55173 non-null  float64
 10  host_identity_verified       55173 non-null  float64
 11  property_type                55284 non-null  float64
 12  accommodates                 55284 non-null  int64  
 13  bathrooms       

In [59]:
# Give the test frame a `price` column (constant 0) and reorder its columns
# to match the processed training frame.
test_df['price'] = 0
cols = list(feature_df.columns)
test_df = test_df.loc[:, cols]

In [60]:
# Stack the train and test rows into one frame; the 'flag' column tells them apart.
# NOTE(review): ignore_index is not passed, so the index labels of the two frames
# are kept and may repeat — confirm nothing downstream relies on a unique index.
data_df = pd.concat([feature_df,test_df])
# Persist the combined frame in pandas' pickle format.
data_df.to_pickle('preprocess_data.pkl')

In [41]:
# Also export the combined frame as CSV, without the index column.
# NOTE(review): this cell's execution count (41) is lower than that of the cell
# that defines data_df (60) — confirm it runs in order on a fresh kernel.
data_df.to_csv('preprocess_data.csv',index=False)

In [61]:
import pandas as pd
# Reload the pickled frame written earlier in this notebook and summarise it:
# overall shape, then missing-value counts per column, largest first.
data_df = pd.read_pickle('preprocess_data.pkl')
print(data_df.shape)
print(data_df.isnull().sum().sort_values(ascending=False))

(85053, 30)
neighborhood_overview          30057
host_response_time             27374
host_response_rate             27374
space                          25938
summary                         4531
host_has_profile_pic             176
host_identity_verified           176
host_since                       176
host_is_superhost                176
name                              24
review_scores_rating               0
reviews_per_month                  0
review_scores_value                0
review_scores_location             0
review_scores_communication        0
listing_id                         0
review_scores_checkin              0
review_scores_cleanliness          0
review_scores_accuracy             0
beds                               0
guests_included                    0
price                              0
amenities                          0
bed_type                           0
bedrooms                           0
bathrooms                          0
accommodates              