In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline

# Air_Store_Info.csv

In [2]:
# Load the AIR store metadata; the path is relative to the notebook's folder.
STORE_INFO_CSV = "../data/air_store_info.csv"
df = pd.read_csv(STORE_INFO_CSV)

In [3]:
# Overview of the frame: row count, per-column dtype and non-null count, memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829 entries, 0 to 828
Data columns (total 5 columns):
air_store_id      829 non-null object
air_genre_name    829 non-null object
air_area_name     829 non-null object
latitude          829 non-null float64
longitude         829 non-null float64
dtypes: float64(2), object(3)
memory usage: 32.5+ KB


In [4]:
# Peek at the first five rows to see what each column looks like.
df.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599


In [5]:
# Summary statistics for the numeric columns (latitude and longitude).
df.describe()

Unnamed: 0,latitude,longitude
count,829.0,829.0
mean,35.647042,137.415311
std,2.084659,3.650971
min,33.211967,130.195555
25%,34.695124,135.341564
50%,35.658068,139.685474
75%,35.694003,139.751599
max,44.020632,144.273398


In [6]:
# True if at least one cell anywhere in the frame is missing.
null_flags = df.isnull().values
null_flags.any()

False

The data is clean: it does not contain any missing values.

In [7]:
# List every row whose store id has already appeared earlier in the frame
# (the first occurrence of each id is not flagged). An empty result means
# every air_store_id is unique.
repeated_id_mask = df.duplicated(subset='air_store_id')
df.loc[repeated_id_mask]

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude


The dataset does not contain any duplicated store IDs :) That's great!

## air_genre_name

In [8]:
# Number of stores per genre, most common first.
df['air_genre_name'].value_counts()

Izakaya                         197
Cafe/Sweets                     181
Dining bar                      108
Italian/French                  102
Bar/Cocktail                     79
Japanese food                    63
Other                            27
Yakiniku/Korean food             23
Western food                     16
Okonomiyaki/Monja/Teppanyaki     14
Creative cuisine                 13
International cuisine             2
Asian                             2
Karaoke/Party                     2
Name: air_genre_name, dtype: int64

## air_area_name

In [9]:
# Number of stores per full area name, most common first.
df['air_area_name'].value_counts()

Fukuoka-ken Fukuoka-shi Daimyō                   64
Tōkyō-to Shibuya-ku Shibuya                      58
Tōkyō-to Minato-ku Shibakōen                     51
Tōkyō-to Shinjuku-ku Kabukichō                   39
Tōkyō-to Setagaya-ku Setagaya                    30
Tōkyō-to Chūō-ku Tsukiji                         29
Ōsaka-fu Ōsaka-shi Ōgimachi                      25
Hiroshima-ken Hiroshima-shi Kokutaijimachi       23
Tōkyō-to Meguro-ku Kamimeguro                    22
Hokkaidō Sapporo-shi Minami 3 Jōnishi            21
Tōkyō-to Suginami-ku Asagayaminami               21
Tōkyō-to Chiyoda-ku Kudanminami                  20
Ōsaka-fu Ōsaka-shi Kyūtarōmachi                  19
Hyōgo-ken Kōbe-shi Kumoidōri                     17
Miyagi-ken Sendai-shi Kamisugi                   17
Tōkyō-to Taitō-ku Higashiueno                    16
Fukuoka-ken Fukuoka-shi Hakata Ekimae            16
Tōkyō-to Chūō-ku Ginza                           14
Hokkaidō Asahikawa-shi 6 Jōdōri                  13
Shizuoka-ken

**INFO:** The `air_area_name` column contains three pieces of location information: prefecture, city or ward, and area. Thus we can split that information into separate columns.

In [10]:
# Split "<prefecture> <city/ward> <area>" into its three parts in a single pass.
# n=2 caps the split at two cuts, so areas that themselves contain spaces
# (e.g. "Hakata Ekimae", "Minami 3 Jōnishi") are kept whole instead of being
# truncated to their first word. With no pattern given, str.split cuts on runs
# of whitespace, so no regex (and no invalid non-raw escape sequence) is needed.
area_parts = (
    df['air_area_name']
    .str.split(n=2, expand=True)
    # Guarantee columns 0..2 exist even if some part is absent in every row.
    .reindex(columns=range(3))
)
df['air_area_name_prefecture'] = area_parts[0]
df['air_area_name_ward'] = area_parts[1]
df['air_area_name_area'] = area_parts[2]

In [11]:
# Check that the three derived location columns are present and look sensible.
df.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude,air_area_name_prefecture,air_area_name_ward,air_area_name_area
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Hyōgo-ken,Kōbe-shi,Kumoidōri
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Hyōgo-ken,Kōbe-shi,Kumoidōri
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Hyōgo-ken,Kōbe-shi,Kumoidōri
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,Hyōgo-ken,Kōbe-shi,Kumoidōri
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,Tōkyō-to,Minato-ku,Shibakōen


In [12]:
# Number of stores per prefecture, most common first.
df['air_area_name_prefecture'].value_counts()

Tōkyō-to         444
Fukuoka-ken      127
Ōsaka-fu          74
Hyōgo-ken         57
Hokkaidō          46
Hiroshima-ken     32
Shizuoka-ken      18
Miyagi-ken        17
Niigata-ken       14
Name: air_area_name_prefecture, dtype: int64

# IDEAS:
* to compute statistics on visit trends per restaurant genre
* to visualize restaurants on map. Examples:
    * https://stackoverflow.com/questions/45388412/what-is-the-fastest-way-to-plot-coordinates-on-map-inline-jupyter
    * http://bokeh.pydata.org/en/latest/docs/user_guide/geo.html
    * https://github.com/pbugnion/gmaps
* to calculate distance from the restaurant to the city center and see if this is meaningful