# Importing and preparing rental apartments data

## Libraries and settings

In [1]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

/workspaces/data_analytics/Week_02


## Importing data

In [2]:
# List the .csv files found in the current directory
dir_entries = os.listdir('.')
flist = fnmatch.filter(dir_entries, '*.csv')
for csv_name in flist:
    print(csv_name)

# Load the Winterthur apartments data into a pandas data frame
df = pd.read_csv(
    'apartments_data_winterthur.csv',
    sep=',',
    encoding='utf-8',
)

# Display the first records of the data frame
df.head()

apartments_data_winterthur.csv
apartments_data_prepared.csv
apartments_data_zuerich.csv


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw
0,1693993818-1,https://www.immoscout24.ch/de/wohnung/mieten/o...,"6,5 Zimmer, 143 m², CHF 3017.—","Am Eulachpark 25, 8404 Winterthur, ZH",CHF 3017.—,«Sie suchen die spezielle Maisonettewohnung?»,"6,5 Zimmer, 143 m², CHF 3017.—Am Eulachpark 25..."
1,1693993818-2,https://www.immoscout24.ch/de/wohnung/mieten/o...,"1 Zimmer, 132 m², CHF 3260.—","Katharina Sulzer Platz 2, 8400 Winterthur, ZH",CHF 3260.—,«In Loft-iger Höhe MIETEN OHNE KAUTION»,"1 Zimmer, 132 m², CHF 3260.—Katharina Sulzer P..."
2,1693993818-3,https://www.immoscout24.ch/de/wohnung/mieten/o...,"4,5 Zimmer, 117 m², CHF 3782.—","8400 Winterthur, ZH",CHF 3782.—,"«MÖBLIERT, TEMPORÄR: 4½ ZI-WOHNUNG IN WINTERTH...","4,5 Zimmer, 117 m², CHF 3782.—8400 Winterthur,..."
3,1693993818-4,https://www.immoscout24.ch/de/wohnung/mieten/o...,"3,5 Zimmer, 88 m², CHF 2244.—","Untere Briggerstrasse 66, 8406 Winterthur, ZH",CHF 2244.—,«Modernes Leben im EG mit Gartensitzplatz»,"3,5 Zimmer, 88 m², CHF 2244.—Untere Briggerstr..."
4,1693993818-5,https://www.immoscout24.ch/de/wohnung/mieten/o...,"3,5 Zimmer, 80 m², CHF 1980.—","Wülflingerstrasse 25, 8400 Winterthur, ZH",CHF 1980.—,«Schöne 3.5-Zimmerwohnung mit Balkon zu vermie...,"3,5 Zimmer, 80 m², CHF 1980.—Wülflingerstrasse..."


## Count number of rows and columns in the data frame

In [3]:
# Unpack the data frame dimensions once, then report them
n_rows, n_cols = df.shape

# Dimension (rows, columns)
print('Dimension:', (n_rows, n_cols))

# Number of rows
print('Number of rows:', n_rows)

# Number of columns
print('Number of columns:', n_cols)

Dimension: (120, 7)
Number of rows: 120
Number of columns: 7


## Get data types (raw-format from web scraping)

In [4]:
# Show the dtype of every column (note that in pandas, a string is referred to as 'object').
# This runs before any numeric columns are derived, so it reflects the data as read from the .csv file.
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
rooms_area_price_raw     object
address_raw              object
price_raw                object
description_raw          object
text_raw                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract number of rooms

In [5]:
# Extract the number of rooms from the 'rooms_area_price_raw' strings.
#
# The pattern captures only a number (optionally with a decimal comma or
# point) that directly precedes 'Zimmer'. Anchoring on the number means that
# any other text in front of 'Zimmer' can no longer end up in the captured
# value and break the numeric conversion. Rows without a match, or with a
# non-string value, become NaN instead of raising or being hidden by a bare
# 'except'.
rooms_pattern = r'(\d+(?:[.,]\d+)?)\s*Zimmer'
rooms_text = (
    df['rooms_area_price_raw']
    .str.extract(rooms_pattern, expand=False)
    # German decimal comma -> decimal point, e.g. '6,5' -> '6.5'
    .str.replace(',', '.', regex=False)
)

# Save as new variable in the pandas data frame (always float64, as before)
df['rooms'] = pd.to_numeric(rooms_text).astype('float64')

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5))

0    6,5 Zimmer, 143 m², CHF 3017.—
1      1 Zimmer, 132 m², CHF 3260.—
2    4,5 Zimmer, 117 m², CHF 3782.—
3     3,5 Zimmer, 88 m², CHF 2244.—
4     3,5 Zimmer, 80 m², CHF 1980.—
Name: rooms_area_price_raw, dtype: object 

0    6.5
1    1.0
2    4.5
3    3.5
4    3.5
Name: rooms, dtype: float64


### Extract living area

In [6]:
# Extract the living area (in m²) from the 'rooms_area_price_raw' strings.
#
# The pattern anchors on the integer directly in front of 'm²' rather than on
# the preceding 'Zimmer, ' text, so a listing that has an area but no room
# count still yields its area. The negative lookbehind rejects a digit run
# that is only the tail of a longer number (e.g. the '5' in '85.5 m²'); such
# rows become missing instead of being silently truncated. Rows without a
# match, or with a non-string value, become <NA>.
area_pattern = r'(?<![\d.,])(\d+)\s*m²'
area_text = df['rooms_area_price_raw'].str.extract(area_pattern, expand=False)

# Save as new variable in the pandas data frame (nullable integer, as before)
df['area'] = pd.to_numeric(area_text).astype('Int64')

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5))

0    6,5 Zimmer, 143 m², CHF 3017.—
1      1 Zimmer, 132 m², CHF 3260.—
2    4,5 Zimmer, 117 m², CHF 3782.—
3     3,5 Zimmer, 88 m², CHF 2244.—
4     3,5 Zimmer, 80 m², CHF 1980.—
Name: rooms_area_price_raw, dtype: object 

0    143
1    132
2    117
3     88
4     80
Name: area, dtype: Int64


### Extract rental price

In [7]:
# Extract the rental price from the 'price_raw' strings.
#
# The pattern takes the first run of digits and additionally accepts
# apostrophe-separated thousands groups (e.g. "1'980"), which would otherwise
# be cut off at the first separator. For plain values such as 'CHF 3017.—'
# the result is the same as taking the first run of digits. Rows without any
# digits (e.g. 'Preis auf Anfrage'), or with a non-string value, become <NA>.
price_pattern = r"(\d+(?:['’]\d{3})*)"
price_text = (
    df['price_raw']
    .str.extract(price_pattern, expand=False)
    # Drop thousands separators before the numeric conversion
    .str.replace(r"['’]", '', regex=True)
)

# Save as new variable in the pandas data frame (nullable integer, as before)
df['price'] = pd.to_numeric(price_text).astype('Int64')

# Print first 5 values
print(df['price_raw'].head(5), '\n')
print(df['price'].head(5))

0    CHF 3017.—
1    CHF 3260.—
2    CHF 3782.—
3    CHF 2244.—
4    CHF 1980.—
Name: price_raw, dtype: object 

0    3017
1    3260
2    3782
3    2244
4    1980
Name: price, dtype: Int64


### Get data types of all variables including the new ones

In [8]:
# Show the dtype of every column, now including the derived 'rooms', 'area' and 'price' columns
df.dtypes

web-scraper-order         object
web-scraper-start-url     object
rooms_area_price_raw      object
address_raw               object
price_raw                 object
description_raw           object
text_raw                  object
rooms                    float64
area                       Int64
price                      Int64
dtype: object

## Count and identify missing values (if any)

In [9]:
# Number of missing values per column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Show the first rows that contain at least one missing value
has_missing = df.isna().any(axis=1)
df[has_missing].head()

web-scraper-order         0
web-scraper-start-url     0
rooms_area_price_raw      0
address_raw               0
price_raw                 0
description_raw           0
text_raw                  0
rooms                     1
area                     11
price                     1
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price
6,1693993818-7,https://www.immoscout24.ch/de/wohnung/mieten/o...,"3,5 Zimmer, 171 m², Preis auf Anfrage","Ernst-Jung-Gasse 16B, 8400 Winterthur, ZH",Preis auf Anfrage,«Erstbezug in 3.5 Zimmer-Maisonette-Wohnung in...,"3,5 Zimmer, 171 m², Preis auf AnfrageErnst-Jun...",3.5,171.0,
13,1693993818-14,https://www.immoscout24.ch/de/wohnung/mieten/o...,"1 Zimmer, CHF 800.—","Walkestrasse 5, 8400 Winterthur, ZH",CHF 800.—,«Schönes 1 Zimmer Studio (möbliert) im zentral...,"1 Zimmer, CHF 800.—Walkestrasse 5, 8400 Winter...",1.0,,800.0
20,1693993818-21,https://www.immoscout24.ch/de/wohnung/mieten/o...,"4,5 Zimmer, CHF 2200.—","Tösstalstrasse 234, 8405 Winterthur, ZH",CHF 2200.—,«4.5 Zimmerwohnung zu vermieten in Winterthur ...,"4,5 Zimmer, CHF 2200.—Tösstalstrasse 234, 8405...",4.5,,2200.0
27,1693993822-28,https://www.immoscout24.ch/de/wohnung/mieten/o...,"3 Zimmer, CHF 1640.—","Auwiesenstrasse 13, 8406 Winterthur, ZH",CHF 1640.—,"«Auwiesenstrasse 13, 8406 Winterthur»","3 Zimmer, CHF 1640.—Auwiesenstrasse 13, 8406 W...",3.0,,1640.0
34,1693993822-35,https://www.immoscout24.ch/de/wohnung/mieten/o...,"5,5 Zimmer, CHF 2960.—","Schlosstalstrasse 219, 8408 Winterthur, ZH",CHF 2960.—,«Wohnen an der Töss»,"5,5 Zimmer, CHF 2960.—Schlosstalstrasse 219, 8...",5.5,,2960.0


## Count and identify duplicated values (if any)

In [10]:
# Number of rows that are exact copies of an earlier row
n_duplicated_rows = df.duplicated().sum()
print(n_duplicated_rows)

# Rows that repeat an earlier row in the selected columns, e.g.:
key_columns = ['web-scraper-order', 'price_raw', 'address_raw']
is_repeated = df[key_columns].duplicated()
df[is_repeated]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price


### Save data to file

In [11]:
# Write the prepared data frame to a .csv file, leaving out the row index
df.to_csv(
    'apartments_data_prepared.csv',
    index=False,
    sep=",",
    encoding='utf-8',
)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [12]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Gather the environment details shown in the footer
rule = '-----------------------------------'
system_name = platform.system()
system_release = platform.release()
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Print the footer: OS family, system and release, run time, Python version
print(rule)
print(os.name.upper())
print(system_name, '|', system_release)
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(rule)

-----------------------------------
POSIX
Linux | 6.8.0-1014-azure
Datetime: 2024-09-25 13:18:18
Python Version: 3.11.10
-----------------------------------
