In [133]:
%reset

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re

from bs4 import BeautifulSoup as bs
from itertools import islice #to start iteration from a different line number

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [134]:
#Initialize variables
ID = []
location = []
price = []
property_type = []
area = []
pool = []
garage = []
df = []

#gather data from these files
#taking a file for each month from August - October 2018
files = ['20180806_prop_for_sale_tom_classifieds.html','20180905_prop_for_sale_tom_classifieds.html','20181008_prop_for_sale_tom_classifieds.html']

#open the files of interest
for i in range(0,3):
    print('Scanning file ' + str(i) + ': ' + files[i])
    with open('./data/raw/'+files[i]) as fp:
        #read the html file using Beautiful Soup
        soup = bs(fp)
        #extract all the property listings in that file
        #uls = soup.find_all('ul',atrrs={'class':'classified_list'})
        listings = soup.find_all('li')
        #For every list tag:
        for tag in listings:
            #Skip the tags that do not contain Property
            if not ('Property For Sale' in tag.text):
                continue
            
            text = tag.text.lower()
            
            #Get ID
            try:
                temp = tag['name']
            except KeyError:
                #print('Error:')
                #print(text)
                continue
            ID.append(temp)
            
            #Get Location
            temp = re.search('\n(.+?(?=\.))\.',text)
            if temp is None or len(temp.group(1)) > 25:
                location.append('NA')
            else:
                location.append(temp.group(1))
                
            #Get Price
            temp = re.search('\.(.*\d+\,\d+)\.',text)#.group(1)
            if temp is None:
                price.append(-1) #indicates missing data point
            elif temp.group(1) is None:
                price.append(-1)
            else:
                temp = re.search('([\d+|\,]+$)',temp.group(1))
                price.append(int(temp.group(1).replace(',','')))
                
            #Get Property Type
            types = ['house','penthouse','maisonette','apartment','flat',
                    'farmhouse','villa','house of character','block']
            if any(j in text for j in types):
                for _type in types:
                    if _type in text:
                        property_type.append(_type)
                        break
            else:
                property_type.append('undefined')
                
            #Get Property Area
            temp = re.search('(\d+)sqm',text)
            if temp is None:
                area.append(-1)
            else:
                area.append(int(temp.group(1)))
            
            #Get Pool Data
            if 'pool' in text:
                pool.append(1)
            else:
                pool.append(0)
            
            #Get Garage Data
            if 'garage optional' in text:
                garage.append(2)
            elif 'garage' in text:
                garage.append(1)
            else:
                garage.append(0)
        #Save to dataframe
        #print(len(ID),len(location),len(price),len(property_type),len(area),len(garage),len(pool))
        df.append(pd.DataFrame({'ID':ID, 'location':location,
                              'price':price,'type':property_type,
                              'area':area,'pool':pool,
                              'garage':garage}))
        
            

Scanning file 0: 20180806_prop_for_sale_tom_classifieds.html
Scanning file 1: 20180905_prop_for_sale_tom_classifieds.html
Scanning file 2: 20181008_prop_for_sale_tom_classifieds.html


In [135]:
data = pd.concat([df[0], df[1], df[2]],sort=True)
data

Unnamed: 0,ID,area,garage,location,pool,price,type
0,1245018,-1,0,madliena,0,470000,apartment
1,1245019,-1,0,sliema,0,375000,house
2,1244500,-1,0,,0,-1,undefined
3,1244501,180,0,,0,495000,house
4,1244502,192,0,attard,0,350000,apartment
5,1244503,210,0,attard,1,500000,villa
6,1244504,-1,0,attard,1,-1,flat
7,1244505,-1,0,attard,0,-1,villa
8,1244506,-1,0,attard,0,425000,maisonette
9,1244507,-1,0,attard,0,239000,maisonette


In [136]:
#Fix the Categories:
data.loc[data.location == 'attard, misraħ kola', 'location'] = 'attard'
data.loc[data.location == 'bahar iċ-ċagħaq', 'location'] = 'baħar iċ-ċagħaq'
data.loc[data.location == 'baħrija ', 'location'] = 'baħrija'
data.loc[data.location == 'birkirkara, mrieħel', 'location'] = 'birkirkara'
data.loc[data.location == 'birkirkara, ta\' paris', 'location'] = 'birkirkara'
data.loc[data.location == 'birkirkara, villa area', 'location'] = 'birkirkara'
data.loc[data.location == 'birzebuġġia', 'location'] = 'birżebbuġa'
data.loc[data.location == 'birżebbuġa, qajjenza', 'location'] = 'birżebbuġa'
data.loc[data.location == 'birżebbuġa, qajjenza area', 'location'] = 'birżebbuġa'
data.loc[data.location == 'buġibba (square area)', 'location'] = 'buġibba'
data.loc[data.location == 'cospicua (bormla)', 'location'] = 'cospicua'
data.loc[data.location == 'duplex maisonette', 'location'] = 'NA'
data.loc[data.location == 'gozo, ghajnsielem', 'location'] = 'gozo, għajnsielem'
data.loc[data.location == 'gozo, għajnsielem / qala', 'location'] = 'gozo, għajnsielem'
data.loc[data.location == 'gozo, marsalforn, qbajjar', 'location'] = 'gozo, marsalforn'
data.loc[data.location == 'gozo, nadur the crystal', 'location'] = 'gozo, nadur'
data.loc[data.location == 'gozo qala', 'location'] = 'gozo, qala'
data.loc[data.location == 'gozo, san lawrence', 'location'] = 'gozo, san lawrenz'
data.loc[data.location == 'gozo, victoria, outskirts', 'location'] = 'gozo, victoria'
data.loc[data.location == 'guardamangia', 'location'] = 'gwardamangia'
data.loc[data.location == 'għarghur', 'location'] = 'għargħur'
data.loc[data.location == 'gżira / sliema, border', 'location'] = 'gżira'
data.loc[data.location == 'gżira, near national pool', 'location'] = 'gżira'
data.loc[data.location == 'gżira, off the promenade', 'location'] = 'gżira'
data.loc[data.location == 'gżira,close to seafront', 'location'] = 'gżira'
data.loc[data.location == 'lija, tal-mirakli', 'location'] = 'lija'
data.loc[data.location == 'manikata, rural village', 'location'] = 'manikata'
data.loc[data.location == 'mellieħa, (near ghadira)', 'location'] = 'mellieħa'
data.loc[data.location == 'mellieħa bay', 'location'] = 'mellieħa'
data.loc[data.location == 'mellieħa, kortin area', 'location'] = 'mellieħa'
data.loc[data.location == 'mosta (speranza)', 'location'] = 'mosta'
data.loc[data.location == 'mosta, żokrija area', 'location'] = 'mosta'
data.loc[data.location == 'msida circus', 'location'] = 'msida'
data.loc[data.location == 'msida, circus', 'location'] = 'msida'
data.loc[data.location == 'msida, msida circus', 'location'] = 'msida'
data.loc[data.location == 'msida, rue d\' argens', 'location'] = 'msida'
data.loc[data.location == 'msida, university heights', 'location'] = 'msida'
data.loc[data.location == 'naxxar, village core', 'location'] = 'naxxar'
data.loc[data.location == 'pendergardens', 'location'] = 'pender gardens'
data.loc[data.location == 'pietà/ ħamrun', 'location'] = 'pietà'
data.loc[data.location == 'qajjenza, birżebbuġa', 'location'] = 'birżebbuġa'
data.loc[data.location == 'qawra, fra ben', 'location'] = 'qawra'
data.loc[data.location == 'qawra point', 'location'] = 'qawra'
data.loc[data.location == 'qormi, limits of luqa', 'location'] = 'qormi'
data.loc[data.location == 'qormi, tal-ħandaq', 'location'] = 'qormi'
data.loc[data.location == 'qormi, mrieħel', 'location'] = 'qormi'
data.loc[data.location == 'santa lucija', 'location'] = 'santa luċija'
data.loc[data.location == 'senglea (l-isla)', 'location'] = 'senglea'
data.loc[data.location == 'sliema (dingli street)', 'location'] = 'sliema'
data.loc[data.location == 'sliema (tower road)', 'location'] = 'sliema'
data.loc[data.location == 'sliema, best part', 'location'] = 'sliema'
data.loc[data.location == 'sliema, capua', 'location'] = 'sliema'
data.loc[data.location == 'sliema, fond ghadir', 'location'] = 'sliema'
data.loc[data.location == 'sliema, fond għadir', 'location'] = 'sliema'
data.loc[data.location == 'sliema, fort cambridge', 'location'] = 'sliema'
data.loc[data.location == 'sliema, qui-si-sana', 'location'] = 'sliema'
data.loc[data.location == 'sliema, sunny side', 'location'] = 'sliema'
data.loc[data.location == 'sliema, tigné', 'location'] = 'sliema'
data.loc[data.location == 'sliema, tigné area', 'location'] = 'sliema'
data.loc[data.location == 'st julian\' s', 'location'] = 'st julians'
data.loc[data.location == 'st julians', 'location'] = 'st julians'
data.loc[data.location == 'st julians, balluta', 'location'] = 'st julians'
data.loc[data.location == 'st julians, balluta bay', 'location'] = 'st julians'
data.loc[data.location == 'st julians, portomaso', 'location'] = 'st julians'
data.loc[data.location == 'st julians, spinola', 'location'] = 'st julians'
data.loc[data.location == 'st julians, spinola bay', 'location'] = 'st julians'
data.loc[data.location == 'st julians, ta\' giorni', 'location'] = 'st julians'
data.loc[data.location == 'st julians, the podium', 'location'] = 'st julians'
data.loc[data.location == 'st julians, the village', 'location'] = 'st julians'
data.loc[data.location == 'st paul\' s', 'location'] = 'san pawl il-baħar'
data.loc[data.location == 'st paul\' s bay', 'location'] = 'san pawl il-baħar'
data.loc[data.location == 'swatar area', 'location'] = 'swatar'
data.loc[data.location == 'ta\' xbiex, seafront', 'location'] = 'ta\' xbiex'
data.loc[data.location == 'tal-ibraġ, church area', 'location'] = 'tal-ibraġ'
data.loc[data.location == 'tal-ibraġ, st andrews', 'location'] = 'tal-ibraġ'
data.loc[data.location == 'valletta / floriana', 'location'] = 'valletta'
data.loc[data.location == 'valletta, upper', 'location'] = 'valletta'
data.loc[data.location == 'vittoriosa (birgu)', 'location'] = 'vittoriosa'
data.loc[data.location == 'xemxija heights', 'location'] = 'xemxija'
data.loc[data.location == 'ħamrun / st venera', 'location'] = 'ħamrun'
data.loc[data.location == 'ħamrun, central area', 'location'] = 'ħamrun'
data.loc[data.location == 'żabbar, outskirts', 'location'] = 'żabbar'

In [131]:
data['location'] = data['location'].astype('category')
data['location'].cat.categories

Index(['NA', 'attard', 'balluta bay', 'balzan', 'baħar iċ-ċagħaq', 'baħrija',
       'bidnija', 'birguma', 'birkirkara', 'birżebbuġa', 'blata l-bajda',
       'buskett', 'buġibba', 'cospicua', 'dingli', 'fgura', 'floriana', 'gozo',
       'gozo, fontana', 'gozo, għajnsielem', 'gozo, għarb', 'gozo, għasri',
       'gozo, kerċem', 'gozo, marsalforn', 'gozo, munxar', 'gozo, nadur',
       'gozo, qala', 'gozo, san lawrenz', 'gozo, sannat', 'gozo, victoria',
       'gozo, xagħra', 'gozo, xewkija', 'gozo, xlendi', 'gozo, żebbuġ',
       'gudja', 'gwardamangia', 'għajn tuffieħa', 'għargħur', 'għaxaq',
       'gżira', 'iklin', 'kalkara', 'kappara', 'kirkop', 'lija', 'luqa',
       'madliena', 'manikata', 'marsa', 'marsascala', 'marsaxlokk', 'mellieħa',
       'mensija', 'monterosa garden' s', 'mosta', 'mqabba', 'mrieħel', 'msida',
       'mġarr', 'naxxar', 'paola', 'pembroke', 'pender gardens', 'pietà',
       'portomaso', 'qawra', 'qormi', 'qrendi', 'rabat', 'safi',
       'san pawl il-baħar'

In [137]:
#remove duplicates:
print(len(data))
data = data.drop_duplicates()
print(len(data))

2765
1422


In [138]:
#Change the -1s to Nan
data = data.replace(-1,np.NaN)