# Importing and preparing supermarkets data

## Libraries and settings

In [42]:
# Libraries
import os
import fnmatch
import pandas as pd

# Ignore warnings
# NOTE(review): filterwarnings("ignore") is called without a category or
# module filter, so it silences every warning raised for the rest of the
# session; consider narrowing it if any warnings become relevant.
import warnings
warnings.filterwarnings("ignore")

# Report the directory the notebook is currently running from
cwd = os.getcwd()
print('Current working directory:', cwd)

# Collect the .json files located directly in that directory and
# print one file name per line
json_pattern = '*.json'
flist = fnmatch.filter(os.listdir('.'), json_pattern)
for json_name in flist:
    print(json_name)

Current working directory: /workspaces/data_analytics/Week_02
supermarkets.json


## Importing data

In [43]:
# Load the JSON file (read as UTF-8) into a pandas data frame
df1 = pd.read_json(path_or_buf='supermarkets.json', encoding='utf-8')

# Preview the first five records
df1.head(n=5)

Unnamed: 0,type,id,lat,lon,tags
0,node,33126515,47.155616,9.037915,"{'brand': 'Spar', 'brand:wikidata': 'Q610492',..."
1,node,36726161,47.226191,8.980329,"{'addr:city': 'Uznach', 'addr:housenumber': '2..."
2,node,39768209,47.225069,8.969981,"{'addr:city': 'Uznach', 'addr:postcode': '8730..."
3,node,39947904,47.376732,8.542161,"{'addr:city': 'Zürich', 'addr:country': 'CH', ..."
4,node,48932835,47.37502,8.522895,"{'addr:city': 'Zürich', 'addr:housenumber': '7..."


## Count number of rows and columns in the data frame

In [44]:
# Fetch the (rows, columns) tuple once and split it into its two parts
dimension = df1.shape
n_rows, n_cols = dimension

# Dimension (rows, columns)
print('Dimension:', dimension)

# Number of rows
print('Number of rows:', n_rows)

# Number of columns
print('Number of columns:', n_cols)

Dimension: (3392, 5)
Number of rows: 3392
Number of columns: 5


## Column 'tags' is a pandas Series of dictionaries -> convert it to a data frame

In [45]:
# Inspect the 'tags' column: type of the column and of its first item
tags = df1['tags']
first_tags = tags[0]
print(type(tags))
print(type(first_tags))

# Keys contained in the first item of the column
print(first_tags.keys())

# Columns to keep from the expanded tags, in display order
selected_columns = ['brand', 'shop', 'addr:city', 'addr:street',
                    'addr:housenumber', 'addr:postcode', 'opening_hours']

# Shorter names for the address columns (original key -> new name)
address_names = {'addr:city': 'city',
                 'addr:street': 'street',
                 'addr:housenumber': 'housenumber',
                 'addr:postcode': 'postcode'}

# Expand the dictionaries into a data frame, keep the selected columns
# and rename the address columns in one chain
df2 = (
    pd.DataFrame.from_records(tags)
    .loc[:, selected_columns]
    .rename(columns=address_names)
)

# Show first records of data frame
df2.head()

<class 'pandas.core.series.Series'>
<class 'dict'>
dict_keys(['brand', 'brand:wikidata', 'brand:wikipedia', 'name', 'opening_hours', 'shop'])


Unnamed: 0,brand,shop,city,street,housenumber,postcode,opening_hours
0,Spar,supermarket,,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
1,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730.0,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,Coop,supermarket,Uznach,,,8730.0,
3,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001.0,Mo-Sa 06:00-22:00
4,Migros,supermarket,Zürich,Wengistrasse,7.0,8004.0,Mo-Sa 08:00-21:00; PH off


## Merge df1 and df2

In [46]:
# Merge df1 and df2 on their row index (default inner join)
location_columns = ['type', 'id', 'lat', 'lon']
tag_columns = ['brand', 'shop', 'city', 'street', 'housenumber', 'postcode', 'opening_hours']

df = df1[location_columns].merge(df2[tag_columns],
                                 left_index=True,
                                 right_index=True)

# Preview the first five merged records
df.head(n=5)

Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode,opening_hours
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
1,node,36726161,47.226191,8.980329,Migros,supermarket,Uznach,Zürcherstrasse,25.0,8730.0,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,node,39768209,47.225069,8.969981,Coop,supermarket,Uznach,,,8730.0,
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,Bahnhofbrücke,1.0,8001.0,Mo-Sa 06:00-22:00
4,node,48932835,47.37502,8.522895,Migros,supermarket,Zürich,Wengistrasse,7.0,8004.0,Mo-Sa 08:00-21:00; PH off


## Count and identify the number of missing values (if any)

In [47]:
# Number of missing values per column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Rows in which a given column is missing, e.g. 'city':
city_is_missing = df['city'].isna()
df.loc[city_is_missing]

type                0
id                  0
lat                 0
lon                 0
brand            1065
shop                0
city             1777
street           1608
housenumber      1680
postcode         1709
opening_hours    1361
dtype: int64


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode,opening_hours
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
5,node,60271452,47.406671,9.305450,,supermarket,,,,,
6,node,70656485,47.491253,8.733981,,supermarket,,,,,
10,node,81321513,47.532917,9.066408,Landi,supermarket,,,,,"Mo-Sa 08:00-12:00, 13:30-18:00"
13,node,95582038,47.050385,9.059214,,supermarket,,,,,Mo-Su 09:00-21:00;PH off
...,...,...,...,...,...,...,...,...,...,...,...
3384,node,11083317088,46.862184,9.531169,Lidl,supermarket,,,,,
3386,node,11098091830,46.205111,6.130174,Coop,supermarket,,,,,
3387,node,11099817248,46.928691,7.561873,,supermarket,,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 07:30-17:00
3388,node,11103235832,46.166742,8.771970,Migros,supermarket,,,,,Mo-Fr 07:30-19:00; Th 07:30-20:00; Sa 07:30-18:30


## Count and identify duplicated values (if any)

In [48]:
# Number of rows that are exact duplicates of an earlier row
n_duplicated_rows = df.duplicated().sum()
print(n_duplicated_rows)

# Rows whose 'id' repeats the 'id' of an earlier row, e.g.:
id_is_duplicated = df.duplicated(subset=['id'])
df[id_is_duplicated]

0


Unnamed: 0,type,id,lat,lon,brand,shop,city,street,housenumber,postcode,opening_hours


## Get data types of all variables

In [49]:
# Get the data type of every column (note that pandas reports columns
# holding Python strings with the generic 'object' dtype)
df.dtypes

type              object
id                 int64
lat              float64
lon              float64
brand             object
shop              object
city              object
street            object
housenumber       object
postcode          object
opening_hours     object
dtype: object

### Save data to file

In [50]:
# Write the prepared data to a comma-separated, UTF-8 encoded file;
# index=False leaves the row index out of the file
df.to_csv(path_or_buf='supermarkets_data_prepared.csv', sep=',', index=False, encoding='utf-8')

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [51]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS family, platform name and release, current timestamp and
# Python version, framed by two separator lines
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(f'{platform.system()} | {platform.release()}')
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(separator)

-----------------------------------
POSIX
Linux | 6.8.0-1014-azure
Datetime: 2024-09-26 15:55:48
Python Version: 3.11.10
-----------------------------------


### Additional filters on supermarkets

In [52]:
# Boolean row masks shared by the filters below
is_migros = df['brand'] == 'Migros'
is_coop = df['brand'] == 'Coop'
in_zurich = df['city'] == 'Zürich'

# All Migros supermarkets in Zürich
df_filtered = df.loc[is_migros & in_zurich]
print(df_filtered.head())

# All Coop supermarkets in Zürich, Basel and Bern
in_selected_cities = df['city'].isin(['Zürich', 'Basel', 'Bern'])
df_filtered = df.loc[is_coop & in_selected_cities]
print(df_filtered.head())
print('Number of Coop supermarkets in Zürich, Basel and Bern:', df_filtered.shape[0])

# Migros supermarkets in Zürich, narrowed further by house number,
# postcode and opening hours (all three are compared as strings)
at_address = (df['housenumber'] == '152') & (df['postcode'] == '8005')
has_hours = df['opening_hours'] == 'Mo-Sa 07:30-20:00; PH off'
df_filtered = df.loc[is_migros & in_zurich & at_address & has_hours]
print(df_filtered.head())

    type         id        lat       lon   brand         shop    city  \
4   node   48932835  47.375020  8.522895  Migros  supermarket  Zürich   
11  node   83330862  47.344749  8.529981  Migros  supermarket  Zürich   
16  node  119249170  47.375255  8.536107  Migros  supermarket  Zürich   
50  node  262400822  47.364072  8.530945  Migros  supermarket  Zürich   
71  node  267346993  47.385598  8.531471  Migros  supermarket  Zürich   

           street housenumber postcode  \
4    Wengistrasse           7     8004   
11   Etzelstrasse           3     8038   
16   Löwenstrasse       31-35     8001   
50  Tessinerplatz          10     8002   
71  Limmatstrasse         152     8005   

                              opening_hours  
4                 Mo-Sa 08:00-21:00; PH off  
11                Mo-Sa 07:30-21:00; PH off  
16                Mo-Sa 09:00-20:00; PH off  
50  Mo-Fr 06:30-22:00; PH,Sa-Su 08:00-22:00  
71                Mo-Sa 07:30-20:00; PH off  
    type         id        lat  