# Importing and preparing rental apartments data

## Libraries and settings

In [1]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
# NOTE(review): the "ignore" action is installed without a category or module
# restriction, so it applies to every warning raised by the cells that run
# after this one. Narrow or remove the filter when a warning is needed for
# debugging.
import warnings
warnings.filterwarnings("ignore")

## Importing data

In [2]:
# Report the directory the notebook is currently running in
print(os.getcwd())

# List the CSV files (and only those) found in that directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for file_name in flist:
    print(file_name)

# Load the scraped apartment listings into a pandas data frame
df = pd.read_csv('apartments_data_zuerich.csv', sep=',', encoding='utf-8')

/Users/miroduman/Desktop/data analytics/Woche_2
supermarkets_data_prepared.csv
apartments_data_adliswil.csv
apartments_data_prepared.csv
apartments_data_preparedA.csv
apartments_data_zuerich.csv


## Count number of rows and columns in the data frame

In [3]:
# Split the shape tuple once and reuse both parts below
n_rows, n_cols = df.shape

# Dimension (rows, columns)
print('Dimension:', (n_rows, n_cols))

# Number of rows
print('Number of rows:', n_rows)

# Number of columns
print('Number of columns:', n_cols)

Dimension: (1008, 7)
Number of rows: 1008
Number of columns: 7


## Get data types (raw format from web scraping)

In [4]:
# Get data types (note that in pandas, a string is referred to as 'object')
# As the last expression of the cell, the per-column dtype listing is shown
# as the cell's output.
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
rooms_area_price_raw     object
address_raw              object
price_raw                object
description_raw          object
text_raw                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract number of rooms

In [5]:
# Extract the number of rooms from the 'rooms_area_price_raw' strings
def _extract_rooms(text):
    """Return the room count found in `text` as a float, or None.

    The room count is the number (decimal comma or decimal point allowed)
    that stands directly in front of the word 'Zimmer', e.g.
    '3,5 Zimmer, 122 m², CHF 3180.—' -> 3.5.

    None is returned when `text` is not a string (for example NaN from an
    empty CSV field) or when no such number is present, so a single odd
    listing cannot abort the whole extraction.
    """
    if not isinstance(text, str):
        return None
    match = re.search(r'(\d+(?:[.,]\d+)?)\s*Zimmer', text)
    if match is None:
        return None
    # Decimal comma -> decimal point so that float() can parse the value
    return float(match.group(1).replace(',', '.'))


rooms = [_extract_rooms(text) for text in df['rooms_area_price_raw']]

# Save as new variable in the pandas data frame. Reusing the frame's own
# index keeps the values aligned by position even if the index is not the
# default 0..n-1 range.
df['rooms'] = pd.Series(rooms, index=df.index, dtype="float64")

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5))

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    3.5
1    2.5
2    5.5
3    3.5
4    5.5
Name: rooms, dtype: float64


### Extract living area

In [6]:
# Extract the living area (in m²) from the 'rooms_area_price_raw' strings
def _extract_area(text):
    """Return the living area found in `text` as an int, or None.

    The area is the whole number that stands directly in front of 'm²',
    e.g. '3,5 Zimmer, 122 m², CHF 3180.—' -> 122. The match does not depend
    on a preceding 'Zimmer, ' part, so listings that state an area but no
    room count keep their area.

    None is returned when `text` is not a string (for example NaN from an
    empty CSV field), when no 'm²' value is present, or when the number in
    front of 'm²' is not a plain integer (the look-behind refuses to pick
    the tail of something like '45,5' or "1'200" as the area).
    """
    if not isinstance(text, str):
        return None
    match = re.search(r"(?<![\d.,'])(\d+)\s*m²", text)
    if match is None:
        return None
    return int(match.group(1))


area = [_extract_area(text) for text in df['rooms_area_price_raw']]

# Save as new variable in the pandas data frame. The nullable 'Int64' dtype
# stores the None entries as <NA>; reusing the frame's own index keeps the
# values aligned by position even if the index is not the default range.
df['area'] = pd.Series(area, index=df.index, dtype="Int64")

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5))

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    122
1     78
2    115
3     74
4    195
Name: area, dtype: Int64


### Extract rental price

In [7]:
# Extract the rental price from the 'price_raw' strings
def _extract_price(text):
    """Return the first amount found in `text` as an int, or None.

    The amount is the first run of digits in the string, e.g.
    'CHF 3180.—' -> 3180. Apostrophe-separated thousands groups that follow
    directly (as in "1'250") are treated as part of the same number instead
    of cutting the value off at the first separator.

    None is returned when `text` is not a string (for example NaN from an
    empty CSV field) or contains no digits (e.g. 'Preis auf Anfrage').
    """
    if not isinstance(text, str):
        return None
    match = re.search(r"\d+(?:['’]\d{3})*", text)
    if match is None:
        return None
    # Drop the thousands separators before converting to an integer
    return int(re.sub(r"['’]", '', match.group(0)))


price = [_extract_price(text) for text in df['price_raw']]

# Save as new variable in the pandas data frame. The nullable 'Int64' dtype
# stores the None entries as <NA>; reusing the frame's own index keeps the
# values aligned by position even if the index is not the default range.
df['price'] = pd.Series(price, index=df.index, dtype="Int64")

# Print first 5 values
print(df['price_raw'].head(5), '\n')
print(df['price'].head(5))

0    CHF 3180.—
1    CHF 3760.—
2    CHF 2860.—
3    CHF 2165.—
4    CHF 6900.—
Name: price_raw, dtype: object 

0    3180
1    3760
2    2860
3    2165
4    6900
Name: price, dtype: Int64


### Get data types of all variables including the new ones

In [8]:
# Show the dtype of every column in df; as the last expression of the cell,
# the listing is displayed as the cell's output.
df.dtypes

web-scraper-order         object
web-scraper-start-url     object
rooms_area_price_raw      object
address_raw               object
price_raw                 object
description_raw           object
text_raw                  object
rooms                    float64
area                       Int64
price                      Int64
dtype: object

## Count and identify missing values (if any)

In [9]:
# Boolean frame marking every missing entry
missing_mask = df.isna()

# Number of missing values per column
print(missing_mask.sum())

# Rows that contain at least one missing value
df.loc[missing_mask.any(axis=1)]

web-scraper-order         0
web-scraper-start-url     0
rooms_area_price_raw      0
address_raw               0
price_raw                 0
description_raw           0
text_raw                  0
rooms                    10
area                     86
price                     8
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price
36,1662023723-650,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5 Zimmer, CHF 2495.—","Grossackerstrasse 102, 8041 Zürich, ZH",CHF 2495.—,«5-Zimmer-Wohnung im Grünen»,"5 Zimmer, CHF 2495.—Grossackerstrasse 102, 804...",5.0,,2495
45,1662023711-562,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3 Zimmer, CHF 1250.—","Trüllergasse 15, 8245 Feuerthalen, ZH",CHF 1250.—,«Wohnen am Rhein»,"3 Zimmer, CHF 1250.—Trüllergasse 15, 8245 Feue...",3.0,,1250
50,1662023701-502,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3 Zimmer, CHF 1850.—","Zürcherstrasse 71, 8102 Oberengstringen, ZH",CHF 1850.—,«Sanierte Wohnung am Stadtrand von Zürich»,"3 Zimmer, CHF 1850.—Zürcherstrasse 71, 8102 Ob...",3.0,,1850
58,1662023711-557,https://www.immoscout24.ch/de/wohnung/mieten/k...,"2 Zimmer, CHF 1700.—","8113 Boppelsen, ZH",CHF 1700.—,«GROSSZÜGIG MIT GARTENSITZPLATZ»,"2 Zimmer, CHF 1700.—8113 Boppelsen, ZH«GROSSZÜ...",2.0,,1700
67,1662023752-877,https://www.immoscout24.ch/de/wohnung/mieten/k...,CHF 1500.—,"Flurstrasse 12/14, 8302 Kloten, ZH",CHF 1500.—,«Ihr Business Apartment möbliert und per sofor...,"CHF 1500.—Flurstrasse 12/14, 8302 Kloten, ZH«I...",,,1500
...,...,...,...,...,...,...,...,...,...,...
950,1662023742-794,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5,5 Zimmer, CHF 2100.—","8632 Tann, ZH",CHF 2100.—,"«New price! LETZTE WOHNUNG: HELL, RENOVIERT, Z...","5,5 Zimmer, CHF 2100.—8632 Tann, ZH«New price!...",5.5,,2100
952,1662023780-1098,https://www.immoscout24.ch/de/wohnung/mieten/k...,"4,5 Zimmer, CHF 1900.—","Gertrudstrasse 7, 8953 Dietikon, ZH",CHF 1900.—,«Gemütliche 4.5-Zimmer Wohnung»,"4,5 Zimmer, CHF 1900.—Gertrudstrasse 7, 8953 D...",4.5,,1900
958,1662023727-696,https://www.immoscout24.ch/de/wohnung/mieten/k...,"2,5 Zimmer, 67 m², Preis auf Anfrage","Hagenholzstrasse 55, 8050 Zürich, ZH",Preis auf Anfrage,«Exklusive Apartments im Tower55 - Oerlikon»,"2,5 Zimmer, 67 m², Preis auf AnfrageHagenholzs...",2.5,67,
979,1662023723-649,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3,5 Zimmer, 86 m², Preis auf Anfrage","Hagenholzstrasse 55, 8050 Zürich, ZH",Preis auf Anfrage,«Ihr neuer Wohntraum im Tower55 - Oerlikon»,"3,5 Zimmer, 86 m², Preis auf AnfrageHagenholzs...",3.5,86,


## Count and identify duplicated values (if any)

In [10]:
# Number of rows that are complete duplicates of an earlier row
n_duplicates = df.duplicated().sum()
print(n_duplicates)

# Rows that repeat an earlier combination of selected columns, e.g.:
key_columns = ['web-scraper-order', 'price_raw', 'address_raw']
df.loc[df.duplicated(subset=key_columns)]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price


### Save data to file

In [11]:
# Write the data frame, without its index column, to a comma-separated UTF-8 file
df.to_csv('apartments_data_prepared.csv', sep=",", encoding='utf-8', index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [12]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Gather the pieces first, then print them framed by two separator lines
rule = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(rule)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(rule)

-----------------------------------
POSIX
Darwin | 21.6.0
Datetime: 2022-09-30 16:20:28
Python Version: 3.9.13
-----------------------------------
