# Importing and preparing rental apartments data

## Libraries and settings

In [1]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

/workspaces/data_analytics/Week_02


## Importing data

In [2]:
# Show .csv - files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df = pd.read_csv('apartments_data_zuerich.csv', sep=',', encoding='utf-8')

# Show first records of data frame
df.head()

apartments_data_winterthur.csv
apartments_data_prepared.csv
apartments_data_zuerich.csv


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw
0,1693998201-1,https://www.immoscout24.ch/de/immobilien/miete...,"3 Zimmer, 49 m², CHF 1441.—","Neuhusstrasse 6, 8630 Rüti ZH, ZH",CHF 1441.—,«Gemütliche Wohnung im Grünen»,"3 Zimmer, 49 m², CHF 1441.—Neuhusstrasse 6, 86..."
1,1693998201-2,https://www.immoscout24.ch/de/immobilien/miete...,"3,5 Zimmer, 65 m², CHF 1850.—","Zürcherstrasse 1, 8173 Neerach, ZH",CHF 1850.—,«Attraktive 3.5-Zimmer-EG-Wohnung in Neerach»,"3,5 Zimmer, 65 m², CHF 1850.—Zürcherstrasse 1,..."
2,1693998201-3,https://www.immoscout24.ch/de/immobilien/miete...,"19 m², CHF 2686.—","Cramerstrasse 8-12, 8004 Zürich, ZH",CHF 2686.—,«Studio Apartment Junior Balcony»,"19 m², CHF 2686.—Cramerstrasse 8-12, 8004 Züri..."
3,1693998201-4,https://www.immoscout24.ch/de/immobilien/miete...,"2 Zimmer, 54 m², CHF 4853.—","Cramerstrasse 8-12, 8004 Zürich, ZH",CHF 4853.—,«2 Bedroom Apartment Senior Balcony»,"2 Zimmer, 54 m², CHF 4853.—Cramerstrasse 8-12,..."
4,1693998201-5,https://www.immoscout24.ch/de/immobilien/miete...,"2 Zimmer, 49 m², CHF 4335.—","Rotachstrasse 33, 8003 Zürich, ZH",CHF 4335.—,«2 Bedroom Apartment Junior Terrace»,"2 Zimmer, 49 m², CHF 4335.—Rotachstrasse 33, 8..."


## Count number of rows and columns in the data frame

In [3]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (1008, 7)
Number of rows: 1008
Number of columns: 7


## Get data types (raw-format from web scraping)

In [4]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
rooms_area_price_raw     object
address_raw              object
price_raw                object
description_raw          object
text_raw                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract number of rooms

In [5]:
# Extract values from 'rooms_area_price_raw' strings
rooms = []
for i in df['rooms_area_price_raw']:
    d1 = re.findall('(.*)Zimmer', i)
    try:
        d2 = d1[0].strip().replace(',', '.')
    except:
        d2 = None
    rooms.append(d2)

# Save as new variable in the pandas data frame
df['rooms'] = pd.Series(rooms, dtype="float64")
    
# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5))

0      3 Zimmer, 49 m², CHF 1441.—
1    3,5 Zimmer, 65 m², CHF 1850.—
2                19 m², CHF 2686.—
3      2 Zimmer, 54 m², CHF 4853.—
4      2 Zimmer, 49 m², CHF 4335.—
Name: rooms_area_price_raw, dtype: object 

0    3.0
1    3.5
2    NaN
3    2.0
4    2.0
Name: rooms, dtype: float64


### Extract living area

In [6]:
# Extract values from 'rooms_area_price_raw' strings
area = []
for i in df['rooms_area_price_raw']:
    d1 = re.findall('Zimmer, (.*)m²', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    area.append(d2)

# Save as new variable in the pandas data frame
df['area'] = pd.Series(area, dtype="Int64")

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5))

0      3 Zimmer, 49 m², CHF 1441.—
1    3,5 Zimmer, 65 m², CHF 1850.—
2                19 m², CHF 2686.—
3      2 Zimmer, 54 m², CHF 4853.—
4      2 Zimmer, 49 m², CHF 4335.—
Name: rooms_area_price_raw, dtype: object 

0      49
1      65
2    <NA>
3      54
4      49
Name: area, dtype: Int64


### Extract rental price

In [7]:
# Extract values from 'price_raw' strings
price = []
for i in df['price_raw']:
    d1 = re.findall('[0-9]+', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    price.append(d2)

# Save as new variable in the pandas data frame
df['price'] = pd.Series(price, dtype="Int64")

# Print first 5 values
print(df['price_raw'].head(5), '\n')
print(df['price'].head(5))

0    CHF 1441.—
1    CHF 1850.—
2    CHF 2686.—
3    CHF 4853.—
4    CHF 4335.—
Name: price_raw, dtype: object 

0    1441
1    1850
2    2686
3    4853
4    4335
Name: price, dtype: Int64


### Get data types of all variables including the new ones

In [8]:
df.dtypes

web-scraper-order         object
web-scraper-start-url     object
rooms_area_price_raw      object
address_raw               object
price_raw                 object
description_raw           object
text_raw                  object
rooms                    float64
area                       Int64
price                      Int64
dtype: object

## Count and identify missing values (if any)

In [9]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values
df[df.isna().any(axis=1)].head()

web-scraper-order          0
web-scraper-start-url      0
rooms_area_price_raw       0
address_raw                0
price_raw                  0
description_raw            0
text_raw                   0
rooms                     45
area                     135
price                     15
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price
2,1693998201-3,https://www.immoscout24.ch/de/immobilien/miete...,"19 m², CHF 2686.—","Cramerstrasse 8-12, 8004 Zürich, ZH",CHF 2686.—,«Studio Apartment Junior Balcony»,"19 m², CHF 2686.—Cramerstrasse 8-12, 8004 Züri...",,,2686
5,1693998201-6,https://www.immoscout24.ch/de/immobilien/miete...,"34 m², CHF 3205.—","Binzmühlestr. 50, 8050 Zürich, ZH",CHF 3205.—,«Studio Apartment Senior Terrace»,"34 m², CHF 3205.—Binzmühlestr. 50, 8050 Zürich...",,,3205
6,1693998201-7,https://www.immoscout24.ch/de/immobilien/miete...,"25 m², CHF 3308.—","Rotachstrasse 33, 8003 Zürich, ZH",CHF 3308.—,«1 Bedroom Apartment Junior Terrace»,"25 m², CHF 3308.—Rotachstrasse 33, 8003 Zürich...",,,3308
7,1693998201-8,https://www.immoscout24.ch/de/immobilien/miete...,"43 m², CHF 4241.—","Militärstrasse 24, 8004 Zürich, ZH",CHF 4241.—,«2 Bedroom Apartment Junior»,"43 m², CHF 4241.—Militärstrasse 24, 8004 Züric...",,,4241
8,1693998201-9,https://www.immoscout24.ch/de/immobilien/miete...,"19 m², CHF 2582.—","Cramerstrasse 8-12, 8004 Zürich, ZH",CHF 2582.—,«Studio Apartment Junior»,"19 m², CHF 2582.—Cramerstrasse 8-12, 8004 Züri...",,,2582


## Count and identify duplicated values (if any)

In [10]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values, e.g.:
df[df[['web-scraper-order', 'price_raw', 'address_raw']].duplicated()]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price


### Save data to file

In [11]:
df.to_csv('apartments_data_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [12]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.8.0-1014-azure
Datetime: 2024-09-25 13:19:26
Python Version: 3.11.10
-----------------------------------
