In [1]:
import numpy as np
import pandas as pd
from selenium import webdriver
from time import time, sleep
from bs4 import BeautifulSoup
import requests
import itertools

# Data Cleaning

## First I will clean the ratings column to obtain two separate columns: the rating and the number of reviews

In [24]:
lettings = pd.read_csv('lettings_with_price3')

In [85]:
lettings.dropna(subset=['Rating'], inplace=True)

In [86]:
lettings.shape

(1795, 6)

In [88]:
lettings['rating'] = lettings['Rating'].str.split('out of',n=1).str[0]

In [89]:
# A rating fragment that still contains a newline is replaced by None;
# every other fragment is kept unchanged.
new_ratings = [None if '\n' in fragment else fragment for fragment in lettings['rating']]

In [90]:
lettings['new_ratings'] = new_ratings

In [91]:
lettings.dropna(subset=['new_ratings'], inplace=True)

In [93]:
lettings.shape

(1582, 8)

In [94]:
# The score is the first space-delimited token of `Rating`,
# e.g. '4.87' in '4.87 out of 5 stars from 164 reviews'.
rating_tokens = lettings['Rating'].str.split(' ', n=1)
lettings['rating'] = rating_tokens.str.get(0)

In [96]:
lettings['rating'] = pd.to_numeric(lettings['rating'])

In [101]:
lettings.Rating[2]

'4.87 out of 5 stars from 164 reviews\n4.87 (164 reviews)'

In [105]:
# In '4.87 out of 5 stars from 164 reviews' the review count is the
# seventh space-delimited token (index 6).
review_tokens = lettings['Rating'].str.split(' ', n=7)
lettings['number_of_reviews'] = review_tokens.str.get(6)

In [106]:
lettings['number_of_reviews'] = pd.to_numeric(lettings['number_of_reviews'])

In [111]:
lettings.to_csv('lettings_with_price4', index=False)

## Next, I will clean the info column, to extract the room type, number of bedrooms, bathrooms and guests

### Accommodation Type

In [116]:
lettings.head()

Unnamed: 0,Title,info,Rating,reviews,price_per_night,no_amenities,rating,new_ratings,number_of_reviews
0,The Studio in West Ealing London,Entire guest suite hosted by Tania\n2 guests ·...,4.69 out of 5 stars from 275 reviews\n4.69 (27...,['Jessica\nApril 2020\nA very compact space cl...,£45/ night,Show all 28 amenities,4.69,4.69,275.0
2,LAST MINUTE - SINGLE ROOM,Private room in house hosted by Jelena\n1 gues...,4.87 out of 5 stars from 164 reviews\n4.87 (16...,['Richard\nMarch 2020\nAll you need for a stay...,£29/ night,Show all 25 amenities,4.87,4.87,164.0
3,Homely room in leafy Ealing.,Private room in house hosted by Tina\n1 guest ...,4.75 out of 5 stars from 333 reviews\n4.75 (33...,['Brian\nMarch 2020\nSecond time staying with ...,£29/ night,Show all 17 amenities,4.75,4.75,333.0
4,"Small single room, West London, budget travel!",Private room in house hosted by Rashpal\n1 gue...,4.68 out of 5 stars from 28 reviews\n4.68 (28 ...,"[""Czarek\nFebruary 2020\nAnother great stay at...",£21/ night,Show all 26 amenities,4.68,4.68,28.0
5,Cosy Double Room,Private room in house hosted by Jesika\n1 gues...,4.78 out of 5 stars from 39 reviews\n4.78 (39 ...,['Dory\nMarch 2020\nPerfect\nDory\nFebruary 20...,£21/ night,Show all 17 amenities,4.78,4.78,39.0


In [118]:
# The listing type is the text in front of "hosted"
# (e.g. 'Entire guest suite hosted by Tania' -> 'Entire guest suite').
# Splitting leaves the space that preceded "hosted" at the end of the label,
# so strip it to keep the category values free of trailing whitespace.
lettings['accomodation_type'] = (
    lettings['info'].str.split('hosted', n=1).str[0].str.strip()
)

In [122]:
lettings.accomodation_type.value_counts()

Private room in house                      413
Entire flat                                392
Private room in flat                       320
Room in hostel                              57
Private room in townhouse                   56
Shared room in hostel                       35
Entire house                                34
Entire serviced apartment                   26
Private room in guest suite                 22
Private room in condominium                 19
Private room in bungalow                    18
Entire apartment                            18
Entire guest house                          17
Room in bed and breakfast                   15
Private room in bed and breakfast           14
Room in boutique hotel                      13
Entire guest suite                          13
Entire loft                                 12
Private room in guest house                 10
Entire townhouse                             9
Room in hotel                                9
Shared room i

In [127]:
lettings.to_csv('lettings_with_price4', index=False)

### Cleaning the number of guests

In [112]:
import re

In [146]:
lettings['info'][0]

'Entire guest suite hosted by Tania\n2 guests · Studio · 1 bed · 1 bathroom'

In [128]:
# Matches "<number> guest" (also the prefix of "guests"), e.g. '2 guest' in
# '2 guests · Studio'. A raw string is used so that \d, \s and \. reach the
# regex engine untouched instead of being treated as (invalid) string escapes.
guest_pattern = re.compile(r'(-?\d+(?:\.\d+)?\sguest)')

In [129]:
# One entry per listing: the list of "<number> guest" matches found in `info`.
# Entries that are not strings (e.g. NaN for a missing description) cannot be
# searched, so they are recorded as NaN explicitly rather than relying on a
# bare `except`, which would also hide unrelated errors and interrupts.
number_of_guests = [
    re.findall(guest_pattern, text) if isinstance(text, str) else np.nan
    for text in lettings['info']
]

In [130]:
lettings['number_of_guests'] = number_of_guests

In [131]:
lettings['number_of_guests'].astype(str)

0       ['2 guest']
2       ['1 guest']
3       ['1 guest']
4       ['1 guest']
5       ['1 guest']
           ...     
2206    ['2 guest']
2208    ['2 guest']
2209    ['3 guest']
2211    ['2 guest']
2217    ['2 guest']
Name: number_of_guests, Length: 1582, dtype: object

In [132]:
lettings.dropna(subset=['number_of_guests'], inplace=True)

In [133]:
# An empty match list means the pattern found nothing: mark it as missing.
lettings.number_of_guests = lettings.number_of_guests.apply(
    lambda matches: matches if len(matches) != 0 else np.nan
)

In [134]:
lettings.dropna(subset=['number_of_guests'], inplace=True)

In [135]:
# Keep exactly one value per listing (its first match). Flattening every match
# would yield a list longer than the frame as soon as one listing produced more
# than one match, which would break the column assignment that follows.
# Rows with an empty match list have already been dropped, so [0] is safe.
number_guests = [matches[0] for matches in lettings.number_of_guests]

In [136]:
lettings['num_guests'] = number_guests

In [137]:
lettings['num_guests'].astype(str)

0       2 guest
2       1 guest
3       1 guest
4       1 guest
5       1 guest
         ...   
2206    2 guest
2208    2 guest
2209    3 guest
2211    2 guest
2217    2 guest
Name: num_guests, Length: 1577, dtype: object

In [138]:
# '2 guest' -> '2': keep the leading whitespace-delimited token.
guest_tokens = lettings['num_guests'].str.split(n=1)
lettings['guests'] = guest_tokens.str.get(0)

In [139]:
lettings.drop(['number_of_guests','num_guests'], axis=1, inplace=True)

In [140]:
lettings['guests'] = pd.to_numeric(lettings['guests'])

In [142]:
lettings.guests.value_counts()

2     738
1     369
4     164
3     143
6      58
5      39
8      21
7      17
10     10
9       8
12      7
13      2
15      1
Name: guests, dtype: int64

In [148]:
lettings.shape

(1577, 11)

In [143]:
lettings.to_csv('lettings_with_price4', index=False)

In [180]:
lettings = pd.read_csv('lettings_with_price4')

### Cleaning the number of bedrooms

In [181]:
lettings.head()

Unnamed: 0,Title,info,Rating,reviews,price_per_night,no_amenities,rating,new_ratings,number_of_reviews,accomodation_type,guests
0,The Studio in West Ealing London,Entire guest suite hosted by Tania\n2 guests ·...,4.69 out of 5 stars from 275 reviews\n4.69 (27...,['Jessica\nApril 2020\nA very compact space cl...,£45/ night,Show all 28 amenities,4.69,4.69,275.0,Entire guest suite,2
1,LAST MINUTE - SINGLE ROOM,Private room in house hosted by Jelena\n1 gues...,4.87 out of 5 stars from 164 reviews\n4.87 (16...,['Richard\nMarch 2020\nAll you need for a stay...,£29/ night,Show all 25 amenities,4.87,4.87,164.0,Private room in house,1
2,Homely room in leafy Ealing.,Private room in house hosted by Tina\n1 guest ...,4.75 out of 5 stars from 333 reviews\n4.75 (33...,['Brian\nMarch 2020\nSecond time staying with ...,£29/ night,Show all 17 amenities,4.75,4.75,333.0,Private room in house,1
3,"Small single room, West London, budget travel!",Private room in house hosted by Rashpal\n1 gue...,4.68 out of 5 stars from 28 reviews\n4.68 (28 ...,"[""Czarek\nFebruary 2020\nAnother great stay at...",£21/ night,Show all 26 amenities,4.68,4.68,28.0,Private room in house,1
4,Cosy Double Room,Private room in house hosted by Jesika\n1 gues...,4.78 out of 5 stars from 39 reviews\n4.78 (39 ...,['Dory\nMarch 2020\nPerfect\nDory\nFebruary 20...,£21/ night,Show all 17 amenities,4.78,4.78,39.0,Private room in house,1


In [196]:
lettings['info'][0]

'Entire guest suite hosted by Tania\n2 guests · Studio · 1 bed · 1 bathroom'

In [197]:
# Matches "<number> bed" / "<number> beds" but not "<number> bedroom(s)".
# Fixes over the previous pattern:
#   * raw string, so \d, \s and \b are regex escapes rather than string escapes;
#   * `beds?` followed by a word boundary replaces the character class `[s|\s]`,
#     which also accepted a literal '|' and required a trailing character, so a
#     description ending in "1 bed" was not matched.
# The captured text no longer includes a trailing space ('1 bed', '2 beds').
bed_pattern = re.compile(r'(-?\d+(?:\.\d+)?\sbeds?)\b')

In [198]:
# One entry per listing: the list of bed-count matches found in `info`.
# Non-string entries (e.g. NaN) cannot be searched and are recorded as NaN
# explicitly instead of being caught by a bare `except`.
number_of_beds = [
    re.findall(bed_pattern, text) if isinstance(text, str) else np.nan
    for text in lettings['info']
]

In [200]:
len(number_of_beds)

1577

In [201]:
lettings['number_of_beds'] = number_of_beds

In [202]:
# Mark listings without a usable match as missing. The isinstance check
# matters because the extraction step stores NaN (a float, which has no len())
# for descriptions it could not search; only non-empty match lists are kept.
lettings.number_of_beds = lettings.number_of_beds.apply(
    lambda matches: matches if isinstance(matches, list) and len(matches) > 0 else np.nan
)

In [203]:
lettings.dropna(subset=['number_of_beds'], inplace=True)

In [204]:
# Keep exactly one value per listing (its first match) so the list always has
# the same length as the frame; flattening every match would not guarantee that.
# Rows with an empty match list have already been dropped, so [0] is safe.
number_beds = [matches[0] for matches in lettings.number_of_beds]

In [205]:
lettings['num_beds'] = number_beds

lettings['num_beds'].astype(str)

0       1 bed 
1       1 bed 
2       1 bed 
3       1 bed 
4       1 bed 
         ...  
1572    1 bed 
1573    2 beds
1574    3 beds
1575    2 beds
1576    1 bed 
Name: num_beds, Length: 1572, dtype: object

In [206]:
# '2 beds' -> '2': keep the leading whitespace-delimited token.
bed_tokens = lettings['num_beds'].str.split(n=1)
lettings['beds'] = bed_tokens.str.get(0)

In [207]:
lettings.drop(['number_of_beds','num_beds'], axis=1, inplace=True)

In [208]:
lettings['beds'] = pd.to_numeric(lettings['beds'])

In [209]:
lettings.shape

(1572, 12)

In [210]:
lettings.to_csv('lettings_with_price4', index=False)

### Cleaning the number of bathrooms

In [273]:
# Captures the number (integer or decimal, e.g. '1' or '1.5') that is followed,
# after at most one arbitrary character, by "shared"/"share ", "bath" or "private".
# Fixes over the previous pattern:
#   * raw string, so \d, \. and \s are regex escapes rather than string escapes;
#   * the decimal point is an escaped '.', so the capture no longer swallows the
#     character after the number ('1 bathroom' now yields '1', not '1 ');
#   * the character class is `[d\s]`; the old `[d|\s]` also accepted a literal '|'.
bath_pattern = re.compile(r'(\d+(?:\.\d+)?)(?=.?share[d\s]|.?bath|.?private)')

In [274]:
lettings['info'][15]

'Entire flat hosted by Robert\n5 guests · 2 bedrooms · 2 beds · 1.5 bathrooms'

In [275]:
# One entry per listing: the list of bathroom-count matches found in `info`.
# Non-string entries (e.g. NaN) cannot be searched and are recorded as NaN
# explicitly instead of being caught by a bare `except`.
number_of_baths = [
    re.findall(bath_pattern, text) if isinstance(text, str) else np.nan
    for text in lettings['info']
]

In [277]:
len(number_of_baths)

1572

In [278]:
lettings['bathrooms'] = number_of_baths

In [279]:
# Mark listings without a usable match as missing. The isinstance check
# matters because the extraction step stores NaN (a float, which has no len())
# for descriptions it could not search; only non-empty match lists are kept.
lettings.bathrooms = lettings.bathrooms.apply(
    lambda matches: matches if isinstance(matches, list) and len(matches) > 0 else np.nan
)

In [280]:
lettings.dropna(subset=['bathrooms'], inplace=True)

In [281]:
# Keep exactly one value per listing (its first match) so the list always has
# the same length as the frame; flattening every match would not guarantee that.
# Rows with an empty match list have already been dropped, so [0] is safe.
number_baths = [matches[0] for matches in lettings.bathrooms]

In [282]:
len(number_baths)

1563

In [283]:
lettings['bathrooms'] = number_baths

In [284]:
lettings['bathrooms'] = pd.to_numeric(lettings['bathrooms'])

In [285]:
lettings.head()

Unnamed: 0,Title,info,Rating,reviews,price_per_night,no_amenities,rating,new_ratings,number_of_reviews,accomodation_type,guests,beds,bathrooms
0,The Studio in West Ealing London,Entire guest suite hosted by Tania\n2 guests ·...,4.69 out of 5 stars from 275 reviews\n4.69 (27...,['Jessica\nApril 2020\nA very compact space cl...,£45/ night,Show all 28 amenities,4.69,4.69,275.0,Entire guest suite,2,1,1.0
1,LAST MINUTE - SINGLE ROOM,Private room in house hosted by Jelena\n1 gues...,4.87 out of 5 stars from 164 reviews\n4.87 (16...,['Richard\nMarch 2020\nAll you need for a stay...,£29/ night,Show all 25 amenities,4.87,4.87,164.0,Private room in house,1,1,2.0
2,Homely room in leafy Ealing.,Private room in house hosted by Tina\n1 guest ...,4.75 out of 5 stars from 333 reviews\n4.75 (33...,['Brian\nMarch 2020\nSecond time staying with ...,£29/ night,Show all 17 amenities,4.75,4.75,333.0,Private room in house,1,1,1.0
3,"Small single room, West London, budget travel!",Private room in house hosted by Rashpal\n1 gue...,4.68 out of 5 stars from 28 reviews\n4.68 (28 ...,"[""Czarek\nFebruary 2020\nAnother great stay at...",£21/ night,Show all 26 amenities,4.68,4.68,28.0,Private room in house,1,1,1.0
4,Cosy Double Room,Private room in house hosted by Jesika\n1 gues...,4.78 out of 5 stars from 39 reviews\n4.78 (39 ...,['Dory\nMarch 2020\nPerfect\nDory\nFebruary 20...,£21/ night,Show all 17 amenities,4.78,4.78,39.0,Private room in house,1,1,1.5


In [286]:
lettings.isnull().sum()

Title                0
info                 0
Rating               0
reviews              0
price_per_night      0
no_amenities         0
rating               0
new_ratings          0
number_of_reviews    1
accomodation_type    0
guests               0
beds                 0
bathrooms            0
dtype: int64

In [287]:
lettings.dropna(inplace=True)

In [288]:
lettings.isnull().sum()

Title                0
info                 0
Rating               0
reviews              0
price_per_night      0
no_amenities         0
rating               0
new_ratings          0
number_of_reviews    0
accomodation_type    0
guests               0
beds                 0
bathrooms            0
dtype: int64

In [289]:
lettings.dtypes

Title                 object
info                  object
Rating                object
reviews               object
price_per_night       object
no_amenities          object
rating               float64
new_ratings           object
number_of_reviews    float64
accomodation_type     object
guests                 int64
beds                   int64
bathrooms            float64
dtype: object

### Cleaning the Price column

In [293]:
lettings.head()

Unnamed: 0,Title,info,Rating,reviews,price_per_night,no_amenities,rating,new_ratings,number_of_reviews,accomodation_type,guests,beds,bathrooms
0,The Studio in West Ealing London,Entire guest suite hosted by Tania\n2 guests ·...,4.69 out of 5 stars from 275 reviews\n4.69 (27...,['Jessica\nApril 2020\nA very compact space cl...,£45/ night,Show all 28 amenities,4.69,4.69,275.0,Entire guest suite,2,1,1.0
1,LAST MINUTE - SINGLE ROOM,Private room in house hosted by Jelena\n1 gues...,4.87 out of 5 stars from 164 reviews\n4.87 (16...,['Richard\nMarch 2020\nAll you need for a stay...,£29/ night,Show all 25 amenities,4.87,4.87,164.0,Private room in house,1,1,2.0
2,Homely room in leafy Ealing.,Private room in house hosted by Tina\n1 guest ...,4.75 out of 5 stars from 333 reviews\n4.75 (33...,['Brian\nMarch 2020\nSecond time staying with ...,£29/ night,Show all 17 amenities,4.75,4.75,333.0,Private room in house,1,1,1.0
3,"Small single room, West London, budget travel!",Private room in house hosted by Rashpal\n1 gue...,4.68 out of 5 stars from 28 reviews\n4.68 (28 ...,"[""Czarek\nFebruary 2020\nAnother great stay at...",£21/ night,Show all 26 amenities,4.68,4.68,28.0,Private room in house,1,1,1.0
4,Cosy Double Room,Private room in house hosted by Jesika\n1 gues...,4.78 out of 5 stars from 39 reviews\n4.78 (39 ...,['Dory\nMarch 2020\nPerfect\nDory\nFebruary 20...,£21/ night,Show all 17 amenities,4.78,4.78,39.0,Private room in house,1,1,1.5


In [296]:
# '£45/ night' -> '45/ night': keep what follows the pound sign.
pound_parts = lettings['price_per_night'].str.split('£', n=2)
lettings['remove_pound'] = pound_parts.str.get(1)

In [297]:
# '45/ night' -> '45': keep what precedes the first slash.
slash_parts = lettings['remove_pound'].str.split('/', n=2)
lettings['price'] = slash_parts.str.get(0)

In [304]:
lettings.drop('remove_pound', axis=1, inplace=True)

In [318]:
lettings['price'] = pd.to_numeric(lettings['price'])

### Cleaning the number of amenities

In [306]:
# Matches each run of digits, e.g. '28' in 'Show all 28 amenities'.
# Raw string so that \d is a regex escape rather than an (invalid) string escape.
amenity_pattern = re.compile(r'\d+')

In [307]:
# One entry per listing: the list of digit runs found in `no_amenities`.
# Non-string entries (e.g. NaN) cannot be searched and are recorded as NaN
# explicitly instead of being caught by a bare `except`.
number_of_amenities = [
    re.findall(amenity_pattern, text) if isinstance(text, str) else np.nan
    for text in lettings['no_amenities']
]

In [310]:
lettings['amenities'] = number_of_amenities

In [311]:
# Mark listings without a usable match as missing. The isinstance check
# matters because the extraction step stores NaN (a float, which has no len())
# for values it could not search; only non-empty match lists are kept.
lettings.amenities = lettings.amenities.apply(
    lambda matches: matches if isinstance(matches, list) and len(matches) > 0 else np.nan
)

In [312]:
lettings.dropna(subset=['amenities'], inplace=True)

In [314]:
# Keep exactly one value per listing (its first match) so the list always has
# the same length as the frame; flattening every match would not guarantee that.
# Rows with an empty match list have already been dropped, so [0] is safe.
num_amen = [matches[0] for matches in lettings.amenities]

In [315]:
lettings['amenities'] = num_amen

In [319]:
lettings['amenities'] = pd.to_numeric(lettings['amenities'])

In [321]:
lettings.dtypes

Title                 object
info                  object
Rating                object
reviews               object
price_per_night       object
no_amenities          object
rating               float64
new_ratings           object
number_of_reviews    float64
accomodation_type     object
guests                 int64
beds                   int64
bathrooms            float64
price                  int64
amenities              int64
dtype: object

In [327]:
lettings.rename(columns={'Title': 'title'}, inplace=True)

In [140]:
lettings.dropna(inplace=True)

### Removing the \\n in reviews

In [4]:
lettings.head()

Unnamed: 0,title,reviews,rating,number_of_reviews,accomodation_type,guests,beds,bathrooms,price,amenities
0,The Studio in West Ealing London,['Jessica\nApril 2020\nA very compact space cl...,4.69,275.0,Entire guest suite,2,1,1.0,45,28
1,LAST MINUTE - SINGLE ROOM,['Richard\nMarch 2020\nAll you need for a stay...,4.87,164.0,Private room in house,1,1,2.0,29,25
2,Homely room in leafy Ealing.,['Brian\nMarch 2020\nSecond time staying with ...,4.75,333.0,Private room in house,1,1,1.0,29,17
3,"Small single room, West London, budget travel!","[""Czarek\nFebruary 2020\nAnother great stay at...",4.68,28.0,Private room in house,1,1,1.0,21,26
4,Cosy Double Room,['Dory\nMarch 2020\nPerfect\nDory\nFebruary 20...,4.78,39.0,Private room in house,1,1,1.5,21,17


In [5]:
# Replace each literal backslash-n sequence (two characters) in the review
# text with a single space.
reviews = [review_text.replace('\\n', ' ') for review_text in lettings.reviews]

In [7]:
lettings['reviews'] = reviews

### clean lettings saved to a new csv

In [142]:
lettings.to_csv('lettings_with_price4', index=False)

In [27]:
lettings = pd.read_csv('lettings_with_price4')

I will now take only the relevant columns, and save them to a new file, which I will use from now onwards.

In [28]:
# Columns carried forward into the cleaned dataset, in output order.
relevant_columns = [
    'title',
    'reviews',
    'rating',
    'number_of_reviews',
    'accomodation_type',
    'guests',
    'beds',
    'bathrooms',
    'price',
    'amenities',
]
lettings = lettings[relevant_columns]

In [10]:
lettings.to_csv('clean_lettings_with_price', index=False)