# Combining & organizing data

## Libraries and settings

In [2]:
# Libraries
import os
import pandas as pd
import numpy as np
import fnmatch

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Show current working directory
print(os.getcwd())

/workspaces/data_analytics/Week_03


## Importing the prepared rental apartments data

In [3]:
# Read data to pandas data frame
df_orig = pd.read_csv('apartments_data_prepared.csv', 
                      sep=',', 
                      encoding='utf-8')

# Copy of data with selected colums
columns = [ 'web-scraper-order', 
            'address_raw',
            'rooms', 
            'area', 
            'luxurious',
            'price',
            'price_per_m2']
df = df_orig[columns]

# Get number of rows and columns
print(df.shape)

# Show first records
df.head(5)

(839, 7)


Unnamed: 0,web-scraper-order,address_raw,rooms,area,luxurious,price,price_per_m2
0,1693998201-1,"Neuhusstrasse 6, 8630 Rüti ZH, ZH",3.0,49.0,0,1441.0,29.41
1,1693998201-2,"Zürcherstrasse 1, 8173 Neerach, ZH",3.5,65.0,0,1850.0,28.46
2,1693998201-4,"Cramerstrasse 8-12, 8004 Zürich, ZH",2.0,54.0,0,4853.0,89.87
3,1693998201-5,"Rotachstrasse 33, 8003 Zürich, ZH",2.0,49.0,0,4335.0,88.47
4,1693998201-16,"Wolframplatz 1, 8045 Zürich, ZH",2.0,32.0,0,3515.0,109.84


## Combining data from different sources

### Reading rental apartment data with geocoded addresses

In [4]:
# Meaning of variables
# lat: geographical latitude
# lon: geographical longitude
# bfs_number: official municipality id
# bfs_name: official municipality name

# Geocoded data (i.e. data with latitude and longitude)
df_geo = pd.read_csv('apartments_data_geocoded.csv', 
                     sep=',', 
                     encoding='utf-8')

# Rename columns
df_geo = df_geo.rename(columns={'web-scrape':'web-scraper-order',
                                'address_ra':'address_raw',
                                'BFS_NUMMER':'bfs_number', 
                                'NAME':'bfs_name'})

# Show data
df_geo[['web-scraper-order',
        'address_raw',
        'lat', 
        'lon', 
        'bfs_number', 
        'bfs_name']].head()

Unnamed: 0,web-scraper-order,address_raw,lat,lon,bfs_number,bfs_name
0,1693998201-1,"Neuhusstrasse 6, 8630 Rüti ZH, ZH",47.252171,8.845797,118,Rüti (ZH)
1,1693998201-2,"Zürcherstrasse 1, 8173 Neerach, ZH",47.513332,8.474851,88,Neerach
2,1693998201-5,"Rotachstrasse 33, 8003 Zürich, ZH",47.370792,8.514748,261,Zürich
3,1693998201-16,"Wolframplatz 1, 8045 Zürich, ZH",47.362282,8.522193,261,Zürich
4,1693998205-25,"Badenerstrasse 67, 8953 Dietikon, ZH",47.407925,8.392561,243,Dietikon


### Join geo-information to rental apartment data using .merge()

In [5]:
df2 = df.merge(df_geo[['web-scraper-order', 
                       'lat', 
                       'lon', 
                       'bfs_number', 
                       'bfs_name']], 
               on="web-scraper-order")
df2.head()

Unnamed: 0,web-scraper-order,address_raw,rooms,area,luxurious,price,price_per_m2,lat,lon,bfs_number,bfs_name
0,1693998201-1,"Neuhusstrasse 6, 8630 Rüti ZH, ZH",3.0,49.0,0,1441.0,29.41,47.252171,8.845797,118,Rüti (ZH)
1,1693998201-2,"Zürcherstrasse 1, 8173 Neerach, ZH",3.5,65.0,0,1850.0,28.46,47.513332,8.474851,88,Neerach
2,1693998201-5,"Rotachstrasse 33, 8003 Zürich, ZH",2.0,49.0,0,4335.0,88.47,47.370792,8.514748,261,Zürich
3,1693998201-16,"Wolframplatz 1, 8045 Zürich, ZH",2.0,32.0,0,3515.0,109.84,47.362282,8.522193,261,Zürich
4,1693998205-25,"Badenerstrasse 67, 8953 Dietikon, ZH",3.5,105.0,0,3200.0,30.48,47.407925,8.392561,243,Dietikon


### Reading municipality-level data

In [6]:
# Meaning of variables:
# bfs_number: official municipality id
# bfs_name: official municipality name
# pop: number of residents (=population)
# pop_dens: population density (pop per km2)
# frg_pct: percentage foreigners
# emp: numer of employees

df_municip = pd.read_excel('municipality_data.xlsx', 
                           sheet_name='data_for_import')
df_municip.head(5)

Unnamed: 0,bfs_number,bfs_name,pop,pop_dens,frg_pct,emp,mean_taxable_income
0,1.0,Aeugst am Albis,1981.0,250.442478,14.184755,442.0,105013.570634
1,2.0,Affoltern am Albis,12303.0,1161.756374,28.700317,6920.0,71059.805603
2,3.0,Bonstetten,5572.0,749.932705,16.564968,1014.0,88927.698145
3,4.0,Hausen am Albis,3751.0,275.808824,16.022394,1021.0,86300.455137
4,5.0,Hedingen,3778.0,578.56049,16.410799,1478.0,90811.20533


### Join municipality data to rental apartment data using .merge()

In [7]:
# Merge needs a key which must be identical in both data sets (here the key is 'bfs_number')
df3 = df2.merge(df_municip[['bfs_number', 
                            'pop', 
                            'pop_dens', 
                            'frg_pct', 
                            'emp',
                            'mean_taxable_income']], 
                on="bfs_number")
df3.head(10)

Unnamed: 0,web-scraper-order,address_raw,rooms,area,luxurious,price,price_per_m2,lat,lon,bfs_number,bfs_name,pop,pop_dens,frg_pct,emp,mean_taxable_income
0,1693998201-1,"Neuhusstrasse 6, 8630 Rüti ZH, ZH",3.0,49.0,0,1441.0,29.41,47.252171,8.845797,118,Rüti (ZH),12286.0,1221.272366,24.841283,5053.0,79184.009112
1,1693998201-2,"Zürcherstrasse 1, 8173 Neerach, ZH",3.5,65.0,0,1850.0,28.46,47.513332,8.474851,88,Neerach,3175.0,525.662252,13.322835,641.0,85590.272374
2,1693998201-5,"Rotachstrasse 33, 8003 Zürich, ZH",2.0,49.0,0,4335.0,88.47,47.370792,8.514748,261,Zürich,420217.0,4778.994655,32.458468,491193.0,65617.333884
3,1693998201-16,"Wolframplatz 1, 8045 Zürich, ZH",2.0,32.0,0,3515.0,109.84,47.362282,8.522193,261,Zürich,420217.0,4778.994655,32.458468,491193.0,65617.333884
4,1693998205-25,"Badenerstrasse 67, 8953 Dietikon, ZH",3.5,105.0,0,3200.0,30.48,47.407925,8.392561,243,Dietikon,27746.0,2970.663812,46.226483,18911.0,101569.727626
5,1693998205-27,"Kanzleistrasse 15, 8418 Schlatt ZH, ZH",4.5,104.0,0,2700.0,25.96,47.464199,8.845925,226,Schlatt (ZH),789.0,87.959866,7.984791,153.0,77400.242131
6,1693998205-29,"Bolletweg 14, 8934 Knonau, ZH",5.5,150.0,0,3390.0,22.6,47.226044,8.461561,7,Knonau,2378.0,367.542504,18.292683,566.0,79514.421053
7,1693998205-30,"Im Zauner 24, 8352 Elsau, ZH",3.0,70.0,0,2450.0,35.0,47.501671,8.805103,219,Elsau,3647.0,451.920694,17.329312,1052.0,77981.56682
8,1693998205-32,"Maneggstrasse 87, 8041 Zürich, ZH",3.5,98.0,0,3121.0,31.85,47.337914,8.520034,261,Zürich,420217.0,4778.994655,32.458468,491193.0,65617.333884
9,1693998205-33,"Haldenstrasse 67, 8602 Wangen b. Dübendorf, ZH",4.5,118.0,0,3390.0,28.73,47.415516,8.635633,200,Wangen-Brüttisellen,7967.0,1005.934343,27.325217,5859.0,72544.690265


### Export data to file

In [8]:
# Remove missing values which may have ocured from the merging process
df3 = df3.dropna()

# Check for missing values per column
print(df3.isna().sum())

# Count number of rows and columns
print(df3.shape)

# Export apartment data to file
df3.to_csv('apartments_data_enriched.csv',
            sep=',',
            encoding='utf-8')

web-scraper-order      0
address_raw            0
rooms                  0
area                   0
luxurious              0
price                  0
price_per_m2           0
lat                    0
lon                    0
bfs_number             0
bfs_name               0
pop                    0
pop_dens               0
frg_pct                0
emp                    0
mean_taxable_income    0
dtype: int64
(792, 16)


### Sorting data

In [9]:
# Sorting data by 'price' and 'area' with highest price above (ascending=False)
df3.sort_values(by=['price', 'area'], 
                ascending=False).head()

Unnamed: 0,web-scraper-order,address_raw,rooms,area,luxurious,price,price_per_m2,lat,lon,bfs_number,bfs_name,pop,pop_dens,frg_pct,emp,mean_taxable_income
24,1693998210-53,"Bergstrasse 16, 8955 Oetwil an der Limmat, ZH",5.5,282.0,0,6000.0,21.28,47.428917,8.398645,246,Oetwil an der Limmat,2515.0,907.942238,18.131213,295.0,69490.671885
236,1693998261-347,"8032 Zürich, ZH",4.5,146.0,0,5900.0,40.41,47.365864,8.552139,261,Zürich,420217.0,4778.994655,32.458468,491193.0,65617.333884
140,1693998236-209,"Eidmattstrasse 38, 8032 Zürich, ZH",2.5,70.0,0,5900.0,84.29,47.364975,8.560164,261,Zürich,420217.0,4778.994655,32.458468,491193.0,65617.333884
139,1693998236-208,"Rosengartenstrasse 55, 8037 Zürich, ZH",2.5,65.0,0,5900.0,90.77,47.395889,8.529163,261,Zürich,420217.0,4778.994655,32.458468,491193.0,65617.333884
85,1693998224-143,"Mühlebachstr. 28, 8008 Zürich, ZH",4.5,128.0,0,5710.0,44.61,47.364857,8.550126,261,Zürich,420217.0,4778.994655,32.458468,491193.0,65617.333884


### Aggregation of data

In [10]:
# Aggregation using .groupby()
df3[['rooms', 'price']].groupby(['rooms']).mean()

Unnamed: 0_level_0,price
rooms,Unnamed: 1_level_1
1.0,1306.346154
1.5,2092.054054
2.0,2138.775
2.5,2321.765957
3.0,1950.434783
3.5,2566.928571
4.0,2798.2
4.5,2762.58125
5.0,2773.75
5.5,3302.48


## Reshaping data

### Reshaping data using .stack() and .unstack()

In [11]:
df_sub = df3[['bfs_name', 'rooms', 'price', 'area']][:5]
print('Original shape')
print(df_sub, '\n')

df_sub_stacked = df_sub.stack()
print('Stacked')
print(df_sub_stacked, '\n')

# Using unstack
print('Unstacked (= back to original shape)')
print(df_sub_stacked.unstack())

Original shape
    bfs_name  rooms   price   area
0  Rüti (ZH)    3.0  1441.0   49.0
1    Neerach    3.5  1850.0   65.0
2     Zürich    2.0  4335.0   49.0
3     Zürich    2.0  3515.0   32.0
4   Dietikon    3.5  3200.0  105.0 

Stacked
0  bfs_name    Rüti (ZH)
   rooms             3.0
   price          1441.0
   area             49.0
1  bfs_name      Neerach
   rooms             3.5
   price          1850.0
   area             65.0
2  bfs_name       Zürich
   rooms             2.0
   price          4335.0
   area             49.0
3  bfs_name       Zürich
   rooms             2.0
   price          3515.0
   area             32.0
4  bfs_name     Dietikon
   rooms             3.5
   price          3200.0
   area            105.0
dtype: object 

Unstacked (= back to original shape)
    bfs_name rooms   price   area
0  Rüti (ZH)   3.0  1441.0   49.0
1    Neerach   3.5  1850.0   65.0
2     Zürich   2.0  4335.0   49.0
3     Zürich   2.0  3515.0   32.0
4   Dietikon   3.5  3200.0  105.0


### Reshaping data using .melt()

In [12]:
df_sub = df3[['rooms', 'price', 'area']][:5]
print('Original shape')
print(df_sub, '\n')

print('Reshaped using .melt()')
print(pd.melt(df, id_vars=['rooms'], value_vars=['price', 'area']))

Original shape
   rooms   price   area
0    3.0  1441.0   49.0
1    3.5  1850.0   65.0
2    2.0  4335.0   49.0
3    2.0  3515.0   32.0
4    3.5  3200.0  105.0 

Reshaped using .melt()
      rooms variable   value
0       3.0    price  1441.0
1       3.5    price  1850.0
2       2.0    price  4853.0
3       2.0    price  4335.0
4       2.0    price  3515.0
...     ...      ...     ...
1673    3.5     area    82.0
1674    4.0     area    73.0
1675    4.5     area   110.0
1676    3.0     area    68.0
1677    4.5     area   114.0

[1678 rows x 3 columns]


### Pivoting data using .pivot_table()

In [13]:
# Using pivot_table to reshape the data and calculate means 
pd.pivot_table(df3[['rooms', 'price', 'area', 'price_per_m2']],
               index=['rooms'],
               values=['price', 'area', 'price_per_m2'],
               aggfunc=(np.mean)
               #aggfunc=('count')
               )

Unnamed: 0_level_0,area,price,price_per_m2
rooms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,32.057692,1306.346154,50.168077
1.5,35.972973,2092.054054,64.345676
2.0,54.475,2138.775,42.20175
2.5,67.177305,2321.765957,36.047872
3.0,67.956522,1950.434783,28.567609
3.5,88.704762,2566.928571,29.10181
4.0,96.9,2798.2,29.482
4.5,112.325,2762.58125,24.71625
5.0,103.5,2773.75,26.5875
5.5,152.88,3302.48,21.9006


**ERKLÄRUNG:**

Da der Preis für eine Immobilie nicht proportional zur Fläche steigt, sondern viele Faktoren wie Lage und Ausstattung eine Rolle spielen, sinkt der Preis pro Quadratmeter, wenn die Fläche grösser wird. Das bedeutet, dass Käufer bei grösseren Wohnungen oft weniger pro Quadratmeter zahlen, weil die zusätzlichen Kosten für Bau und Grundstück auf mehr Fläche verteilt werden und diese Wohnungen nicht so stark nachgefragt sind wie kleinere Einheiten.

In [14]:
# Using pivot_table to reshape the data and calculate means 
pd.pivot_table(df3[['rooms', 'price']],
               index=['rooms'],
               values=['price'],
               aggfunc=('count')
               )

Unnamed: 0_level_0,price
rooms,Unnamed: 1_level_1
1.0,52
1.5,37
2.0,40
2.5,141
3.0,46
3.5,210
4.0,30
4.5,160
5.0,4
5.5,50


**ERKLÄRUNG:**

Die Verteilung der Wohnungen zeigt, dass im Preisbereich von 3,0 bis 3,5 die höchste Anzahl an Wohnungen mit 256 Einheiten vorhanden ist, was auf eine starke Nachfrage in diesem Segment hinweist. Auch der Bereich von 2,5 bis 3,0 ist mit 187 Wohnungen stark nachgefragt, während im Preisbereich von 7,5 bis 8,0 nur 4 Wohnungen zu finden sind, was auf ein geringeres Interesse an teureren Wohnungen hindeutet.

In [15]:
# Using pivot_table to reshape the data and calculate means 
pd.pivot_table(df3[['bfs_name', 'rooms', 'price', 'area', 'price_per_m2']],
               index=['bfs_name'],
               values=['rooms', 'price', 'area', 'price_per_m2'],
               aggfunc=(np.mean)).sort_values(by=['price', 'area'], 
                                               ascending=False).head()

Unnamed: 0_level_0,area,price,price_per_m2,rooms
bfs_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nürensdorf,506.5,4500.0,17.605,6.0
Maur,138.5,4250.0,30.445,4.5
Obfelden,127.5,4095.0,31.91,5.5
Brütten,248.5,4070.0,16.315,6.0
Zollikon,101.4,4060.8,40.254,4.0


### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [16]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-10-04 20:28:03
Python Version: 3.11.10
-----------------------------------
