# Importing and preparing supermarkets data

## Libraries and settings

In [1]:
# Libraries
import os
import fnmatch
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print('Current working directory:', os.getcwd())

# Show .json files in the current working directory
flist = fnmatch.filter(os.listdir('.'), '*.json')
for i in flist:
    print(i)

Current working directory: /workspaces/data_analytics/Week_02
supermarkets.json


## Importing data

In [2]:
# Read the data to a pandas data frame
df1 = pd.read_json('supermarkets.json', encoding='utf-8')
df1.head(5)

Unnamed: 0,type,id,lat,lon,tags
0,node,33126515,47.155616,9.037915,"{'brand': 'Spar', 'brand:wikidata': 'Q610492',..."
1,node,36726161,47.226191,8.980329,"{'addr:city': 'Uznach', 'addr:housenumber': '2..."
2,node,39768209,47.225069,8.969981,"{'addr:city': 'Uznach', 'addr:postcode': '8730..."
3,node,39947904,47.376732,8.542161,"{'addr:city': 'Zürich', 'addr:country': 'CH', ..."
4,node,48932835,47.37502,8.522895,"{'addr:city': 'Zürich', 'addr:housenumber': '7..."


## Count number of rows and columns in the data frame

In [3]:
# Dimension (rows, columns)
print('Dimension:', df1.shape)

# Number of rows
print('Number of rows:', df1.shape[0])

# Number of columns
print('Number of columns:', df1.shape[1])

Dimension: (3392, 5)
Number of rows: 3392
Number of columns: 5


## Column 'tags' is a pandas Series with dictionaries -> change to data frame

In [4]:
# Type of the first item of column tags
print(type(df1.tags))
print(type(df1.tags[0]))

# Content of the first item of column tags
print(df1.tags[0].keys())

# Change to data frame
df2 = pd.DataFrame.from_records(df1.tags)
df2 = df2[['brand', 'shop', 'addr:city', 'addr:street', 'addr:housenumber', 'addr:postcode']]

# Rename selected columns
df2 = df2.rename(columns={'addr:city': 'city',
                          'addr:street':'street',
                          'addr:housenumber': 'housenumber',
                          'addr:postcode': 'postcode'})

# Show first records of data frame
df2.head()

<class 'pandas.core.series.Series'>
<class 'dict'>
dict_keys(['brand', 'brand:wikidata', 'brand:wikipedia', 'name', 'opening_hours', 'shop'])


Unnamed: 0,brand,shop,city,street,housenumber,postcode
0,Spar,supermarket,,,,
1,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730.0
2,Coop,supermarket,Uznach,,,8730.0
3,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001.0
4,Migros,supermarket,Zürich,Wengistrasse,7.0,8004.0


## Merge df1 and df2

In [5]:
# Merge df and df2
# Merge df1 and df2 on their indices
df = pd.merge(df1[['type', 'id', 'lat', 'lon']], 
              df2[['brand', 'shop', 'city', 'street', 'housenumber', 'postcode']],
              left_index=True, 
              right_index=True)

# Display the first 5 rows of the merged dataframe
df.head(5)

Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,
1,node,36726161,47.226191,8.980329,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730.0
2,node,39768209,47.225069,8.969981,Coop,supermarket,Uznach,,,8730.0
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001.0
4,node,48932835,47.37502,8.522895,Migros,supermarket,Zürich,Wengistrasse,7.0,8004.0


## Count and identify the number of missing values (if any)

In [6]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values, e.g.:
df.loc[pd.isna(df['city'])]

type              0
id                0
lat               0
lon               0
brand          1065
shop              0
city           1777
street         1608
housenumber    1680
postcode       1709
dtype: int64


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,
5,node,60271452,47.406671,9.305450,,supermarket,,,,
6,node,70656485,47.491253,8.733981,,supermarket,,,,
10,node,81321513,47.532917,9.066408,Landi,supermarket,,,,
13,node,95582038,47.050385,9.059214,,supermarket,,,,
...,...,...,...,...,...,...,...,...,...,...
3384,node,11083317088,46.862184,9.531169,Lidl,supermarket,,,,
3386,node,11098091830,46.205111,6.130174,Coop,supermarket,,,,
3387,node,11099817248,46.928691,7.561873,,supermarket,,,,
3388,node,11103235832,46.166742,8.771970,Migros,supermarket,,,,


## Count and identify duplicated values (if any)

In [7]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values, e.g.:
df[df[['id']].duplicated()]

0


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode


## Get data types of all variables

In [8]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

type            object
id               int64
lat            float64
lon            float64
brand           object
shop            object
city            object
street          object
housenumber     object
postcode        object
dtype: object

# Additional filters on supermarkets

In [25]:
#Aufgabe 3e
df_filtered = df.loc[(df['brand'] == 'Coop') & (df['city'] == 'Zürich')]
count_zurich_coop = df_filtered.shape[0]
print('Number of supermarkets in Zürich Coop:', count_zurich_coop)

Number of supermarkets in Zürich Coop: 37


In [29]:
#Aufgabe 3f
df_filtered_migros = df.loc[(df['brand'] == 'Migros') & (df['city'] == 'Zürich')]
count_zurich = df_filtered_migros.shape[0]
print('Number of supermarkets in Zürich Migros:', count_zurich)
df_filtered_migros.head

Number of supermarkets in Zürich Migros: 30


<bound method NDFrame.head of       type          id        lat       lon   brand         shop    city  \
4     node    48932835  47.375020  8.522895  Migros  supermarket  Zürich   
11    node    83330862  47.344749  8.529981  Migros  supermarket  Zürich   
16    node   119249170  47.375255  8.536107  Migros  supermarket  Zürich   
50    node   262400822  47.364072  8.530945  Migros  supermarket  Zürich   
71    node   267346993  47.385598  8.531471  Migros  supermarket  Zürich   
82    node   270958272  47.358367  8.554074  Migros  supermarket  Zürich   
83    node   271028298  47.365678  8.548041  Migros  supermarket  Zürich   
85    node   271029206  47.364596  8.553846  Migros  supermarket  Zürich   
89    node   273942728  47.357610  8.571369  Migros  supermarket  Zürich   
192   node   310133197  47.419522  8.548286  Migros  supermarket  Zürich   
208   node   312838980  47.379200  8.508799  Migros  supermarket  Zürich   
224   node   321361643  47.392553  8.538428  Migros  super

In [31]:
#Aufgabe3g
df_filtered = df.loc[(df['brand'] == 'Coop') & (df['city'] == 'Zürich')]
zuerich_count = df_filtered.shape[0]
print("Zuerich: ", zuerich_count)
 
 
df_filtered = df.loc[(df['brand'] == 'Coop') & (df['city'] == 'Bern')]
bern_count = df_filtered.shape[0]
print("Bern: ", bern_count)
 
df_filtered = df.loc[(df['brand'] == 'Coop') & (df['city'] == 'Basel')]
basel_count = df_filtered.shape[0]
print("Basel: ", basel_count)

print("Total: ", zuerich_count + bern_count + basel_count)

Zuerich:  37
Bern:  5
Basel:  10
Total:  52


In [33]:
#Aufgabe 3h
df_filtered1 = df.loc[(df['brand'] == 'Coop') & (df['city'] == 'Zürich')]
df_table = df_filtered1[['brand', 'city', 'street', 'housenumber', 'postcode']]
df_table.head()

Unnamed: 0,brand,city,street,housenumber,postcode
3,Coop,Zürich,Bahnhofbrücke,1,8001
9,Coop,Zürich,Alte Kalchbühlstrasse,15,8038
59,Coop,Zürich,Zürichbergstrasse,75,8044
63,Coop,Zürich,Badenerstrasse,333,8003
70,Coop,Zürich,Maagplatz,1,8005


In [47]:
# Aufgabe 3i
df_filtered1 = df.loc[(df['brand'] == 'Coop') & (df['city'] == 'Zürich')]
df_table = df_filtered1[['brand', 'city', 'street', 'housenumber', 'postcode', 'opening_hours']]
print(df_table.head())

# Überprüfen, ob der DataFrame korrekt erstellt wurde
if not df_table.empty:
    print("DataFrame erfolgreich erstellt.")
else:
    print("Fehler: Der DataFrame ist leer.")

KeyError: "['opening_hours'] not in index"

### Save data to file

In [9]:
df.to_csv('supermarkets_data_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [10]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.8.0-1014-azure
Datetime: 2024-09-27 08:29:46
Python Version: 3.11.10
-----------------------------------
