# Importing and preparing rental apartments data

## Libraries and settings

In [1]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Importing data

In [2]:
# Show the directory the notebook is running in
working_dir = os.getcwd()
print(working_dir)

# Show all CSV files in that directory, one file name per line
flist = fnmatch.filter(os.listdir('.'), '*.csv')
if flist:
    print('\n'.join(flist))

# Load the scraped apartment listings into a pandas data frame
df = pd.read_csv(
    'apartments_data_adliswil.csv',
    sep=',',
    encoding='utf-8',
)

/Users/miroduman/Desktop/data analytics/Woche_2
supermarkets_data_prepared.csv
apartments_data_adliswil.csv
apartments_data_prepared.csv
apartments_data_preparedA.csv


## Count number of rows and columns in the data frame

In [3]:
# df.shape is a tuple of the form (rows, columns); unpack it once
n_rows, n_cols = df.shape

# Dimension (rows, columns)
print('Dimension:', (n_rows, n_cols))

# Number of rows
print('Number of rows:', n_rows)

# Number of columns
print('Number of columns:', n_cols)

Dimension: (23, 7)
Number of rows: 23
Number of columns: 7


## Get data types (raw format from web scraping)

In [4]:
# Get data types (note that in pandas, a string is referred to as 'object').
# The bare expression below is the last statement of the cell, so the notebook
# displays its value without an explicit print().
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
rooms_area_price_raw     object
address_raw              object
price_raw                object
description_raw          object
text_raw                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract number of rooms

In [5]:
# Extract values from 'rooms_area_price_raw' strings.
# Everything in front of the literal word 'Zimmer' is taken as the room count,
# which the listings write with a decimal comma (e.g. '2,5 Zimmer, 71 m², ...').
# The pattern is a raw string and is compiled once, outside the loop.
_ROOMS_PATTERN = re.compile(r'(.*)Zimmer')


def _extract_rooms(text):
    """Return the room count contained in ``text`` as a float.

    Returns None when ``text`` is not a string (for example a NaN from an
    empty cell) or when it does not contain the word 'Zimmer'.

    Raises ValueError when the text in front of 'Zimmer' is not a number, so
    that an unexpected format is noticed instead of being silently dropped.
    """
    if not isinstance(text, str):
        return None
    match = _ROOMS_PATTERN.search(text)
    if match is None:
        return None
    # Swap the decimal comma for a point so that float() accepts the value
    return float(match.group(1).strip().replace(',', '.'))


rooms = [_extract_rooms(text) for text in df['rooms_area_price_raw']]

# Save as new variable in the pandas data frame. The series is built on the
# index of df so that every value stays attached to its own row even when df
# does not carry the default 0..n-1 index.
df['rooms'] = pd.Series(rooms, index=df.index, dtype="float64")

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5))

0    2,5 Zimmer, 71 m², CHF 1500.—
1      4 Zimmer, 90 m², CHF 2030.—
2       1 Zimmer, 22 m², CHF 770.—
3    2,5 Zimmer, 70 m², CHF 1740.—
4    3,5 Zimmer, 70 m², CHF 1950.—
Name: rooms_area_price_raw, dtype: object 

0    2.5
1    4.0
2    1.0
3    2.5
4    3.5
Name: rooms, dtype: float64


### Extract living area

In [6]:
# Extract values from 'rooms_area_price_raw' strings.
# The living area is the text between 'Zimmer, ' and 'm²'
# (e.g. '2,5 Zimmer, 71 m², CHF 1500.—' -> 71). Listings without an area
# (e.g. '4,5 Zimmer, CHF 3020.—') do not match and end up as missing values.
# The pattern is a raw string and is compiled once, outside the loop.
_AREA_PATTERN = re.compile(r'Zimmer, (.*)m²')


def _extract_area(text):
    """Return the living area in m² contained in ``text`` as an int.

    Returns None when ``text`` is not a string (for example a NaN from an
    empty cell) or when it does not contain 'Zimmer, <area> m²'.

    Raises ValueError when the captured text is not a whole number, so that
    an unexpected format is noticed instead of being silently dropped.
    """
    if not isinstance(text, str):
        return None
    match = _AREA_PATTERN.search(text)
    if match is None:
        return None
    return int(match.group(1).strip())


area = [_extract_area(text) for text in df['rooms_area_price_raw']]

# Save as new variable in the pandas data frame. The nullable 'Int64' dtype
# keeps whole numbers while still allowing missing values. The series is built
# on the index of df so that every value stays attached to its own row.
df['area'] = pd.Series(area, index=df.index, dtype="Int64")

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5))

0    2,5 Zimmer, 71 m², CHF 1500.—
1      4 Zimmer, 90 m², CHF 2030.—
2       1 Zimmer, 22 m², CHF 770.—
3    2,5 Zimmer, 70 m², CHF 1740.—
4    3,5 Zimmer, 70 m², CHF 1950.—
Name: rooms_area_price_raw, dtype: object 

0    71
1    90
2    22
3    70
4    70
Name: area, dtype: Int64


### Extract rental price

In [7]:
# Extract values from 'price_raw' strings.
# The price is the first run of digits in the string
# (e.g. 'CHF 1500.—' -> 1500).
# The pattern is a raw string and is compiled once, outside the loop.
_PRICE_PATTERN = re.compile(r'[0-9]+')


def _extract_price(text):
    """Return the first run of digits in ``text`` as an int.

    Returns None when ``text`` is not a string (for example a NaN from an
    empty cell) or when it contains no digits at all.
    """
    if not isinstance(text, str):
        return None
    match = _PRICE_PATTERN.search(text)
    if match is None:
        return None
    return int(match.group(0))


price = [_extract_price(text) for text in df['price_raw']]

# Save as new variable in the pandas data frame. The nullable 'Int64' dtype
# keeps whole numbers while still allowing missing values. The series is built
# on the index of df so that every value stays attached to its own row.
df['price'] = pd.Series(price, index=df.index, dtype="Int64")

# Print first 5 values
print(df['price_raw'].head(5), '\n')
print(df['price'].head(5))

0    CHF 1500.—
1    CHF 2030.—
2     CHF 770.—
3    CHF 1740.—
4    CHF 1950.—
Name: price_raw, dtype: object 

0    1500
1    2030
2     770
3    1740
4    1950
Name: price, dtype: Int64


### Get data types of all variables including the new ones

In [8]:
# Data types of all columns, including the newly created 'rooms', 'area' and
# 'price'. As the last expression of the cell, the notebook displays its value.
df.dtypes

web-scraper-order         object
web-scraper-start-url     object
rooms_area_price_raw      object
address_raw               object
price_raw                 object
description_raw           object
text_raw                  object
rooms                    float64
area                       Int64
price                      Int64
dtype: object

## Count and identify missing values (if any)

In [9]:
# Boolean frame of the same shape as df: True wherever a value is missing
missing_mask = df.isna()

# Count missing values per column
print(missing_mask.sum())

# Identify rows with at least one missing value
df.loc[missing_mask.any(axis=1)]

web-scraper-order        0
web-scraper-start-url    0
rooms_area_price_raw     0
address_raw              0
price_raw                0
description_raw          0
text_raw                 0
rooms                    0
area                     2
price                    0
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price
17,1662029560-20,https://www.immoscout24.ch/de/immobilien/miete...,"4,5 Zimmer, CHF 3020.—","Salamanderweg 2, 8134 Adliswil, ZH",CHF 3020.—,«Nur noch wenige freie 4.5-Zimmer-Wohnungen»,"4,5 Zimmer, CHF 3020.—Salamanderweg 2, 8134 Ad...",4.5,,3020
22,1662029560-19,https://www.immoscout24.ch/de/immobilien/miete...,"3,5 Zimmer, CHF 2710.—","Salamanderweg 2, 8134 Adliswil, ZH",CHF 2710.—,«Moderne Wohnungen mit spannenden Ausbaulinien»,"3,5 Zimmer, CHF 2710.—Salamanderweg 2, 8134 Ad...",3.5,,2710


## Count and identify duplicated values (if any)

In [10]:
# Count rows that are complete duplicates of an earlier row
n_duplicated_rows = df.duplicated().sum()
print(n_duplicated_rows)

# Identify rows that repeat an earlier row in the selected columns, e.g.:
key_columns = ['web-scraper-order', 'price_raw', 'address_raw']
df.loc[df.duplicated(subset=key_columns)]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,rooms,area,price


### Save data to file

In [11]:
# Write the prepared data frame to a comma-separated UTF-8 file; the row index
# is left out of the file
export_options = {
    'sep': ",",
    'encoding': 'utf-8',
    'index': False,
}
df.to_csv('apartments_data_preparedA.csv', **export_options)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [12]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS family, system name and release, current timestamp and Python
# version, framed by two divider lines
divider = '-----------------------------------'
footer_lines = [
    divider,
    os.name.upper(),
    f"{platform.system()} | {platform.release()}",
    f"Datetime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Python Version: {python_version()}",
    divider,
]
for footer_line in footer_lines:
    print(footer_line)

-----------------------------------
POSIX
Darwin | 21.6.0
Datetime: 2022-09-30 16:12:27
Python Version: 3.9.13
-----------------------------------
