# Importing and preparing supermarkets data

## Libraries and settings

In [1]:
# Libraries
import os
import fnmatch
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Importing data

In [2]:
# Get current working directory
print('Current working directory:', os.getcwd())

# Show files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.json')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df1 = pd.read_json('supermarkets_data.json', encoding='utf-8')
df1.head(5)

Current working directory: /Users/miroduman/Desktop/data analytics/Woche_2
supermarkets_data.json


Unnamed: 0,type,id,lat,lon,tags
0,node,33126515,47.155616,9.037915,"{'brand': 'Spar', 'brand:wikidata': 'Q610492',..."
1,node,36726161,47.226191,8.980329,"{'addr:city': 'Uznach', 'addr:housenumber': '2..."
2,node,39768209,47.225069,8.969981,"{'addr:city': 'Uznach', 'addr:postcode': '8730..."
3,node,39947904,47.376732,8.542161,"{'addr:city': 'Zürich', 'addr:country': 'CH', ..."
4,node,48932835,47.37502,8.522895,"{'addr:city': 'Zürich', 'addr:housenumber': '7..."


## Count number of rows and columns in the data frame

In [3]:
# Dimension (rows, columns)
print('Dimension:', df1.shape)

# Number of rows
print('Number of rows:', df1.shape[0])

# Number of columns
print('Number of columns:', df1.shape[1])

Dimension: (3260, 5)
Number of rows: 3260
Number of columns: 5


## Column 'tags' is a pandas Series with dictionaries -> change to data frame

In [4]:
# Type of the first item of column tags
print(type(df1.tags))
print(type(df1.tags[0]))

# Content of the first item of column tags
print(df1.tags[0].keys())

# Change to data frame
df2 = pd.DataFrame.from_records(df1.tags)
df2[['brand', 'shop', 'addr:city', 'addr:housenumber', 'addr:postcode', 'opening_hours']]

<class 'pandas.core.series.Series'>
<class 'dict'>
dict_keys(['brand', 'brand:wikidata', 'brand:wikipedia', 'name', 'opening_hours', 'shop'])


Unnamed: 0,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours
0,Spar,supermarket,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
1,Migros,supermarket,Uznach,25,8730,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,Coop,supermarket,Uznach,,8730,
3,Coop,supermarket,Zürich,1,8001,Mo-Sa 06:00-22:00
4,Migros,supermarket,Zürich,7,8004,Mo-Sa 08:00-21:00; PH off
...,...,...,...,...,...,...
3255,Volg,supermarket,,,,"Mo-Fr 07:00-12:20,13:30-18:30; Sa 07:00-16:00"
3256,Landi,supermarket,Rickenbach Sulz,1,8545,Mo-Fr 08:00-18.30; Sa 08:00-17:00; PH off
3257,VOI,supermarket,,,,
3258,,supermarket,,,,"Mo-Fr 09:00-19:00; Sa,Su 09:00-17:00; PH off"


## Join df1 and df2

In [5]:
# Merge df and df2
df = pd.merge(df1[['type', 'id', 'lat', 'lon']], 
              df2[['brand', 'shop', 'addr:city', 'addr:housenumber', 'addr:postcode', 'opening_hours']], 
              left_index=True, 
              right_index=True)
df.head(5)

Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
1,node,36726161,47.226191,8.980329,Migros,supermarket,Uznach,25.0,8730.0,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,node,39768209,47.225069,8.969981,Coop,supermarket,Uznach,,8730.0,
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,1.0,8001.0,Mo-Sa 06:00-22:00
4,node,48932835,47.37502,8.522895,Migros,supermarket,Zürich,7.0,8004.0,Mo-Sa 08:00-21:00; PH off


## Count and identify the number of missing values (if any)

In [6]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values, e.g.:
df.loc[pd.isna(df['addr:city'])]

type                   0
id                     0
lat                    0
lon                    0
brand               1246
shop                   0
addr:city           1808
addr:housenumber    1702
addr:postcode       1733
opening_hours       1435
dtype: int64


Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
5,node,60271452,47.406671,9.305450,,supermarket,,,,
6,node,70656485,47.491253,8.733981,,supermarket,,,,
10,node,81321513,47.532917,9.066408,Landi,supermarket,,,,"Mo-Sa 08:00-12:00, 13:30-18:00"
13,node,95582038,47.050385,9.059214,,supermarket,,,,
...,...,...,...,...,...,...,...,...,...,...
3253,node,9947587529,47.000852,8.611420,,supermarket,,,,
3255,node,9963973121,47.321761,9.426943,Volg,supermarket,,,,"Mo-Fr 07:00-12:20,13:30-18:30; Sa 07:00-16:00"
3257,node,9975876019,46.011774,8.965955,VOI,supermarket,,,,
3258,node,9978766657,47.424524,7.126737,,supermarket,,,,"Mo-Fr 09:00-19:00; Sa,Su 09:00-17:00; PH off"


## Count and identify duplicated values (if any)

In [7]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values, e.g.:
df[df[['id']].duplicated()]

0


Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours


## Get data types of all variables

In [8]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

type                 object
id                    int64
lat                 float64
lon                 float64
brand                object
shop                 object
addr:city            object
addr:housenumber     object
addr:postcode        object
opening_hours        object
dtype: object

### Save data to file

In [9]:
df.to_csv('supermarkets_data_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [10]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Darwin | 21.6.0
Datetime: 2022-09-30 15:07:40
Python Version: 3.9.13
-----------------------------------


# My Task

### Filter only Migros supermarkets in the city of Zürich.

In [11]:
df_filtered = df.loc[(df['brand'] == 'Migros') & (df['addr:city'] == 'Zürich')]
df_filtered.head(5)

Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours
4,node,48932835,47.37502,8.522895,Migros,supermarket,Zürich,7,8004,Mo-Sa 08:00-21:00; PH off
11,node,83330862,47.344749,8.529981,Migros,supermarket,Zürich,3,8038,Mo-Sa 07:30-21:00; PH off
16,node,119249170,47.375255,8.536107,Migros,supermarket,Zürich,31-35,8001,Mo-Sa 09:00-20:00; PH off
50,node,262400822,47.364072,8.530945,Migros,supermarket,Zürich,10,8002,"Mo-Fr 06:30-22:00; PH,Sa-Su 08:00-22:00"
71,node,267346993,47.385598,8.531471,Migros,supermarket,Zürich,152,8005,Mo-Sa 07:30-20:00; PH off


### Filter and count all Coop supermarkets in the cities of Zürich, Basel & Bern.

In [12]:
df_filtered = df.loc[(df['brand'] == 'Coop') & ((df['addr:city'] == 'Zürich') | (df['addr:city'] == 'Bern') | (df['addr:city'] == 'Basel'))]
display(df_filtered)

print('Amount of Coop supermarkets in Basel, Bern & Zurich:', df_filtered.shape[0])

Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,1,8001,Mo-Sa 06:00-22:00
9,node,79977755,47.34007,8.530546,Coop,supermarket,Zürich,15,8038,Mo-Sa 07:30-21:00
59,node,265776668,47.376417,8.559594,Coop,supermarket,Zürich,75,8044,Mo-Fr 07:30-21:00; Sa 07:30-20:00
75,node,268603429,47.36736,8.546174,Coop,supermarket,Zürich,18,8001,Mo-Fr 08:00-20:00; Sa 08:00-19:00
81,node,270692983,47.35794,8.554646,Coop,supermarket,Zürich,123,8008,Mo-Sa 07:00-22:00
84,node,271028686,47.36677,8.548146,Coop,supermarket,Zürich,10,8001,Mo-Sa 06:00-22:00; Su 07:00-22:00
96,node,276363821,47.418888,8.505699,Coop,supermarket,Zürich,549,8046,Mo-Sa 08:00-20:00
122,node,283103824,47.393648,8.529543,Coop,supermarket,Zürich,12,8037,Mo-Sa 07:00-21:00
123,node,283126967,47.392524,8.524519,Coop,supermarket,Zürich,22,8037,Mo-Sa 07:30-20:00
226,node,321361788,47.395725,8.540471,Coop,supermarket,Zürich,1-5,8057,Mo-Sa 07:30-21:00


Amount of Coop supermarkets in Basel, Bern & Zurich: 40


### Filter supermarkets with available opening hours. 

In [13]:
df_filtered['opening_hours'].dropna()
df_filtered.head(5)

Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,1,8001,Mo-Sa 06:00-22:00
9,node,79977755,47.34007,8.530546,Coop,supermarket,Zürich,15,8038,Mo-Sa 07:30-21:00
59,node,265776668,47.376417,8.559594,Coop,supermarket,Zürich,75,8044,Mo-Fr 07:30-21:00; Sa 07:30-20:00
75,node,268603429,47.36736,8.546174,Coop,supermarket,Zürich,18,8001,Mo-Fr 08:00-20:00; Sa 08:00-19:00
81,node,270692983,47.35794,8.554646,Coop,supermarket,Zürich,123,8008,Mo-Sa 07:00-22:00


### Filter supermarkets with available brand, city, house number and postcode. 

In [14]:
df_filtered = df[['brand', 'addr:city','addr:housenumber','addr:postcode']].dropna()
df_filtered

Unnamed: 0,brand,addr:city,addr:housenumber,addr:postcode
1,Migros,Uznach,25,8730
3,Coop,Zürich,1,8001
4,Migros,Zürich,7,8004
7,Migros,Winterthur,102,8406
8,ALDI,Zürich,81,8038
...,...,...,...,...
3211,Coop,Brienz BE,139b,3855
3222,Denner,Gampel,5,3945
3248,Denner,Echallens,2,1040
3254,Volg,Marbach SG,17,9437
