In [1]:
# load packages
import os # establish the interaction between the user and the operating system
import glob # allows for Unix-style pathname pattern 

import re #check whether a given string matches a given pattern
from bs4 import BeautifulSoup # pull data out of HTML and XML files

import pandas as pd #pulling data out of HTML and XML files
#check currect directory
path = os.getcwd()
print ("The current working directory is %s" % path)
#reading files
indices = glob.glob('./data/accom/*.html')
len(indices)
#loading the data from local
local_files = list()
for fname in glob.glob('./data/accom/*.html'):
    local_files.append(fname.split('/')[-1])
# Please, use the following list as a column name
column = ["Region", "Price", "Type", "Size", "Floor", "Construction", "Condition", "Elevator", "Longitude", "Latitude"]
#creating dictionary of lists with pre-specified columns/keys
accom = dict()
for col in column:
    accom[col] = list()
for fname in indices:
    with open(fname) as f:
        content = f.read()
    soup = BeautifulSoup(content, 'html.parser')
    page_content = soup.find('div', {'class': 'container'})
    #getting the price
    price = page_content.select_one('.basic-info .profile_price').text
    price = price.split(" ")[0].strip()
    accom["Price"].append(price)
    # get all info on space
    about_space = page_content.select('.about_space_basic > *')
    for info in about_space:
    # get the region
        if "Region" in info.get_text():
            region_text = info.find('p').get_text().split(" ")[-1]
            region = region_text.strip('.').strip(',')
            accom["Region"].append(region)
    # get the apartment type
        if "Apartment type" in info.get_text():
            type_text = info.find('p').get_text()
            type = type_text.strip('.').strip(',')
            accom["Type"].append(type)
    # get the apartment size
        if "Square meters" in info.get_text():
            area_text = info.find('p').get_text()
            area = re.search(r'(\d+(\.\d+)?)', area_text).group(1)
            accom["Size"].append(area)
    # get the apartment floor
        if "floor" in info.get_text():
            floor_text = info.find('p').get_text()
            floor = re.search(r'(\d+(\.\d+)?)', floor_text).group(1)
            accom["Floor"].append(floor)
    # get the apartment construction year
        if "Construction year" in info.get_text():
            year_text = info.find('p').get_text()
            year = re.search(r'(\d{4})', year_text).group(1)
            accom["Construction"].append(year)
    # get the apartment status
        if "Status:" in info.get_text():
            status_text = info.find('p').get_text()
            condition_word = re.search(r'\b(\w+)\s+condition\b', status_text).group(1)
            accom["Condition"].append(condition_word)

    # get the availability of the elevator
    offers_section = page_content.find('div', class_='offers')
    if offers_section:
        offers_table = offers_section.find('table', class_='offers')
        if offers_table:
            for row in offers_table.find_all('tr'):
                cells = row.find_all('td')
                for cell in cells:
                    if "not" in cell.get_text().lower():
                        accom["Elevator"].append("No")
                        break
                else:
                    continue
                break
            else:
                # If no "not" is found, assume Yes
                accom["Elevator"].append("Yes")
        else:
            # If there's no table, assume no elevator
            accom["Elevator"].append("No")
    else:
        # If there's no offers section, assume no elevator
        accom["Elevator"].append("No")

    # TODO: get the longitude
    map_section = page_content.find('div', class_='map')
    if map_section:
        # Find the paragraph containing the longitude and latitude information
        map_info = map_section.find('p')
        if map_info:
            # Extract longitude and latitude from the text
            coordinates_text = map_info.get_text()
            longitude_match = re.search(r'longitude of ([-+]?\d*\.\d+)', coordinates_text)
            if longitude_match:
                longitude = longitude_match.group(1)
                accom["Longitude"].append(longitude)
            
            latitude_match = re.search(r'latitude of ([-+]?\d*\.\d+)', coordinates_text)
            if latitude_match:
                latitude = latitude_match.group(1)
                accom["Latitude"].append(latitude)
    
#constructing a DataFrame from a dict
df = pd.DataFrame.from_dict(accom)

The current working directory is c:\Users\pinsk\Documents\yliopisto\Joda\viikko4


Access

In [2]:
#loading pickle file and getting the first five rows
df.head()

Unnamed: 0,Region,Price,Type,Size,Floor,Construction,Condition,Elevator,Longitude,Latitude
0,Niemenranta,â‚¬300,Two rooms,50.0,2,2020,good,Yes,23.69660557450159,61.52426921143939
1,Leinola,â‚¬255,Three rooms,80.0,0,1975,unknown,No,23.910149658010493,61.48946062565256
2,HÃ¤rmÃ¤lÃ¤nranta,â‚¬360,Two rooms,54.0,2,2022,unknown,No,23.72300425566757,61.47551363767057
3,Amuri,â‚¬432,Two rooms,48.0,1,2023,unknown,Yes,23.741643263744766,61.49968017394604
4,Kissanmaa,â‚¬153,Studio apartment,28.0,3,1959,good,No,23.82277363783996,61.50021477921354


In [3]:
#loading pickle file and getting the last five rows
df.tail()

Unnamed: 0,Region,Price,Type,Size,Floor,Construction,Condition,Elevator,Longitude,Latitude
1075,HÃ¤rmÃ¤lÃ¤nranta,â‚¬330,Two rooms,54.0,7,2018,good,Yes,23.727866061687656,61.47573308469892
1076,Amuri,â‚¬324,Two rooms,37.0,3,2023,unknown,Yes,23.74267272233312,61.49705474306858
1077,RantaperkiÃ¶,â‚¬372,Three rooms,86.5,1,2006,good,No,23.75351047076308,61.47293778780631
1078,Keskusta,â‚¬225,Studio apartment,48.0,4,1929,satisfactory,No,24.058603556628885,61.46460833687141
1079,Ratina,â‚¬396,Two rooms,44.0,5,2011,good,Yes,23.767109944707475,61.495878988285774


In [4]:
# gettign a summary of the data set
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Region        1080 non-null   object
 1   Price         1080 non-null   object
 2   Type          1080 non-null   object
 3   Size          1080 non-null   object
 4   Floor         1080 non-null   object
 5   Construction  1080 non-null   object
 6   Condition     1080 non-null   object
 7   Elevator      1080 non-null   object
 8   Longitude     1080 non-null   object
 9   Latitude      1080 non-null   object
dtypes: object(10)
memory usage: 42.2+ KB


In [5]:
#check null values in Price feature
df[df['Price'].isnull()]

Unnamed: 0,Region,Price,Type,Size,Floor,Construction,Condition,Elevator,Longitude,Latitude


In [6]:
#check duplicated values
df[df.duplicated()]

Unnamed: 0,Region,Price,Type,Size,Floor,Construction,Condition,Elevator,Longitude,Latitude


In [7]:
#gettign a summary statistics of the construction feature
df[["Construction"]].describe()

Unnamed: 0,Construction
count,1080
unique,99
top,2022
freq,92


In [8]:
#gettign a series of unique values of apartment Type in feature
df["Type"].value_counts()

Type
Two rooms             463
Three rooms           262
Studio apartment      204
Four rooms or more    151
Name: count, dtype: int64

Clean

In [9]:
#Make a copy of this object’s indices and data.
df_clean = df.copy()
#change the data format

#changing the data format of floor feature
df_clean["Floor"] = pd.to_numeric(df_clean["Floor"])
#change the data format of Size feature
df_clean["Size"] = pd.to_numeric(df_clean["Size"])
# TODO: change the data format of Longitude feature
df_clean["Longitude"] = pd.to_numeric(df_clean["Longitude"])
# TODO: change the data format of Latitude feature
df_clean["Latitude"] = pd.to_numeric(df_clean["Latitude"])
# TODO: change the data format of Construction feature
df_clean["Construction"] = pd.to_numeric(df_clean["Construction"])
#check changes
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Region        1080 non-null   object 
 1   Price         1080 non-null   object 
 2   Type          1080 non-null   object 
 3   Size          1080 non-null   float64
 4   Floor         1080 non-null   int64  
 5   Construction  1080 non-null   int64  
 6   Condition     1080 non-null   object 
 7   Elevator      1080 non-null   object 
 8   Longitude     1080 non-null   float64
 9   Latitude      1080 non-null   float64
dtypes: float64(3), int64(2), object(5)
memory usage: 63.3+ KB


In [10]:
#save and load latest changes
df_clean.to_pickle("TampereBNB.pkl")
unpickled_df = pd.read_pickle("TampereBNB.pkl")
unpickled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Region        1080 non-null   object 
 1   Price         1080 non-null   object 
 2   Type          1080 non-null   object 
 3   Size          1080 non-null   float64
 4   Floor         1080 non-null   int64  
 5   Construction  1080 non-null   int64  
 6   Condition     1080 non-null   object 
 7   Elevator      1080 non-null   object 
 8   Longitude     1080 non-null   float64
 9   Latitude      1080 non-null   float64
dtypes: float64(3), int64(2), object(5)
memory usage: 63.3+ KB
