In [62]:
import numpy as np
import pandas as pd
import pickle

# Read file

In [63]:
# Location of the input CSV (a relative path, resolved against the working directory).
path = "files/data.csv"

In [64]:
# Read the CSV with ',' as the separator, decoding the bytes as ISO-8859-1.
df = pd.read_csv(
    path,
    sep=',',
    encoding='ISO-8859-1',
)

In [65]:
# Preview the first five rows (the default of DataFrame.head, spelled out).
df.head(n=5)

Unnamed: 0,date,recipe_id,country,region,city,categories,day,hour,season
0,29.07.2013 21:14,136996.0,DE,16.0,Berlin,"['Braten (Fleisch)', 'Fleisch', 'Milchprodukte...",29/07/2013,21:14,summer
1,29.07.2013 21:14,255239.0,DE,2.0,Munich,"['Asien', 'Japan', 'Suppen/Eintoepfe', 'Vorspe...",29/07/2013,21:14,summer
2,29.07.2013 21:14,447948.0,DE,16.0,Berlin,"['kalt', 'glutenfrei', 'laktosefrei', 'ohne We...",29/07/2013,21:14,summer
3,29.07.2013 21:14,131582.0,DE,16.0,Berlin,"['glutenfrei', 'laktosefrei', 'ohne Weizen']",29/07/2013,21:14,summer
4,29.07.2013 21:15,35361.0,DE,16.0,Berlin,"['trocknen', 'Gesund und Diaet', 'laktosefrei'...",29/07/2013,21:15,summer


# Count categories

In [66]:
def get_categories_counter(categories_column):
    """Count category occurrences in a column of stringified category lists.

    Every string entry is normalised by removing '[' , ']' and single quotes,
    turning ', ' into ',' and lower-casing; the result is split on ',' and each
    token is counted. An entry such as "[]" therefore adds one count to the
    empty-string key ''. Every non-string entry (e.g. a float NaN) is counted
    under the key 'nan'.

    Parameters
    ----------
    categories_column : sized iterable
        The entries to count, e.g. a pandas Series; must support ``len()``
        when tqdm is available (used as the progress-bar total).

    Returns
    -------
    result : dict
        Maps each category token (plus '' and 'nan' where they occur) to the
        number of times it was seen.
    nan_indexes : list of int
        0-based positions, in iteration order, of the non-string entries.
        These are positions, not df/series index labels.
    """
    # The progress bar is purely cosmetic, so a missing tqdm must not break counting.
    try:
        from tqdm import tqdm
    except ImportError:
        tqdm = None

    result = dict()
    nan_indexes = list()

    rows = categories_column
    if tqdm is not None:
        rows = tqdm(categories_column, total=len(categories_column))

    # enumerate yields 0-based positions; the previous manual counter was
    # incremented before use and therefore reported positions shifted by one.
    for position, categories in enumerate(rows):
        if isinstance(categories, str):
            cleaned = (categories.replace('[', '').replace(']', '')
                       .replace("'", "").replace(', ', ',').lower())
            for category in cleaned.split(','):
                result[category] = result.get(category, 0) + 1
        else:
            nan_indexes.append(position)
            result['nan'] = result.get('nan', 0) + 1

    return result, nan_indexes

In [67]:
# Count categories over the whole dataset and collect the positions of NaN rows.
categories_counter, nan_indexes = get_categories_counter(df['categories'])

100%|██████████| 1576832/1576832 [00:17<00:00, 87780.61it/s] 


In [68]:
# Share of rows without a usable categories string, in percent (2 decimals).
nan_share = np.round(len(nan_indexes) / len(df) * 100, 2)
print('Number of nan values in the dataset:', len(nan_indexes),
      '. It is ', nan_share, '% from the whole dataset.')

Number of nan values in the dataset: 350840 . It is  22.25 % from the whole dataset.


In [69]:
# Rows that remain once the NaN rows are discounted.
examples_left = len(df) - len(nan_indexes)
print('We have ', examples_left, ' examples left.')

We have  1225992  examples left.


## Save

In [70]:
# Persist the overall counter as a pickle (binary data despite the .txt suffix).
with open('files/to_plot/categories_count.txt', 'wb') as out_file:
    pickle.dump(categories_counter, out_file)

# Group data by season

In [71]:
# Count categories separately for every value of the 'season' column.
df_by_season = df.groupby('season')

categories_season = dict()
for name, group in df_by_season:
    print(name)
    print(len(group))
    categories_season[name], nan_list = get_categories_counter(group.categories)

    nan_share = np.round(len(nan_list) / len(group) * 100, 2)
    print('Number of nan elements:', len(nan_list), '(', nan_share, '%)')

    # .get avoids a KeyError for a group that contains no empty-category entry.
    empty_count = categories_season[name].get('', 0)
    empty_share = np.round(empty_count / len(group) * 100, 2)
    print('We have ', empty_count, '(', empty_share, '%)', 'empty categories.')

summer
913359


100%|██████████| 913359/913359 [00:09<00:00, 91828.06it/s] 


Number of nan elements: 226663 ( 24.82 %)
We have  36954 ( 4.05 %) empty categories.
winter
663473


100%|██████████| 663473/663473 [00:07<00:00, 84115.65it/s] 


Number of nan elements: 124177 ( 18.72 %)
We have  25664 ( 3.87 %) empty categories.


## Delete nan and empty values from dict
We don't want these values to appear on the charts.

In [72]:
# Drop the 'nan' bucket from every season counter. pop with a default makes the
# cell safe to re-run (a plain `del` raises KeyError the second time) and does
# not hard-code the season names.
for season_counter in categories_season.values():
    season_counter.pop('nan', None)

In [73]:
# Drop the empty-string bucket from every season counter. pop with a default
# makes the cell safe to re-run and does not hard-code the season names.
for season_counter in categories_season.values():
    season_counter.pop('', None)

## Save

In [74]:
# Persist the per-season counters as a pickle (binary data despite the .txt suffix).
with open('files/to_plot/categories_by_season.txt', 'wb') as out_file:
    pickle.dump(categories_season, out_file)

## Load

In [20]:
# Read the per-season counters back from disk.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
with open('files/to_plot/categories_by_season.txt', 'rb') as in_file:
    b = pickle.load(in_file)

# Group data by city

In [75]:
# Count categories separately for every value of the 'city' column.
df_by_city = df.groupby('city')

categories_city = dict()
for name, group in df_by_city:
    print('\n',name)
    print(len(group))

    categories_city[name], nan_list = get_categories_counter(group.categories)

    nan_share = np.round(len(nan_list) / len(group) * 100, 2)
    print('Number of nan elements:', len(nan_list), '(', nan_share, '%)')

    # .get avoids a KeyError for a city that contains no empty-category entry.
    empty_count = categories_city[name].get('', 0)
    empty_share = np.round(empty_count / len(group) * 100, 2)
    print('We have ', empty_count, '(', empty_share, '%)', 'empty categories.')


 Berlin
369769


100%|██████████| 369769/369769 [00:05<00:00, 73295.14it/s]


Number of nan elements: 0 ( 0.0 %)
We have  19849 ( 5.37 %) empty categories.

 Bremerhaven
7394


100%|██████████| 7394/7394 [00:00<00:00, 58523.44it/s]


Number of nan elements: 0 ( 0.0 %)
We have  380 ( 5.14 %) empty categories.

 Darmstadt
27743


100%|██████████| 27743/27743 [00:00<00:00, 32979.14it/s]


Number of nan elements: 0 ( 0.0 %)
We have  1540 ( 5.55 %) empty categories.

 Erlangen
19104


100%|██████████| 19104/19104 [00:00<00:00, 67643.50it/s]


Number of nan elements: 0 ( 0.0 %)
We have  960 ( 5.03 %) empty categories.

 Frankfurt_Am_Main
207472


100%|██████████| 207472/207472 [00:02<00:00, 82356.42it/s]


Number of nan elements: 0 ( 0.0 %)
We have  9958 ( 4.8 %) empty categories.

 Hamburg
206573


100%|██████████| 206573/206573 [00:02<00:00, 78143.79it/s]


Number of nan elements: 0 ( 0.0 %)
We have  10457 ( 5.06 %) empty categories.

 Koeln
116548


100%|██████████| 116548/116548 [00:01<00:00, 68789.84it/s]


Number of nan elements: 0 ( 0.0 %)
We have  5784 ( 4.96 %) empty categories.

 Munich
249118


100%|██████████| 249118/249118 [00:03<00:00, 78639.66it/s]


Number of nan elements: 0 ( 0.0 %)
We have  12461 ( 5.0 %) empty categories.

 Potsdam
9586


100%|██████████| 9586/9586 [00:00<00:00, 71803.27it/s]


Number of nan elements: 0 ( 0.0 %)
We have  564 ( 5.88 %) empty categories.

 Siegen
12685


100%|██████████| 12685/12685 [00:00<00:00, 73873.22it/s]


Number of nan elements: 0 ( 0.0 %)
We have  665 ( 5.24 %) empty categories.


## Delete empty values

In [76]:
# List the distinct city names. Missing values are removed explicitly: slicing
# `list(set(...))[1:]` drops whichever element happens to come first in an
# unordered set, which may be a real city instead of NaN. Sorting makes the
# order reproducible between runs.
cities = sorted(df.city.dropna().unique())
print(cities)

['Munich', 'Siegen', 'Koeln', 'Bremerhaven', 'Berlin', 'Darmstadt', 'Erlangen', 'Hamburg', 'Potsdam', 'Frankfurt_Am_Main']


In [77]:
# Drop the empty-string bucket from every per-city counter. Iterating over the
# counters themselves covers every grouped city, and pop with a default keeps
# the cell safe to re-run (a plain `del` raises KeyError the second time).
for city_counter in categories_city.values():
    city_counter.pop('', None)

## Save

In [78]:
# Persist the per-city counters as a pickle (binary data despite the .txt suffix).
with open('files/to_plot/categories_by_city.txt', 'wb') as out_file:
    pickle.dump(categories_city, out_file)

# Find most frequent categories

In [79]:
def get_categories(categories_column):
    """Count category occurrences in a column of stringified category lists.

    Every string entry is normalised by removing '[' , ']' and single quotes,
    turning ', ' into ',' and lower-casing; the result is split on ',' and each
    token is counted. An entry such as "[]" therefore adds one count to the
    empty-string key ''. Every non-string entry (e.g. a float NaN) is counted
    under the key 'nan'.

    Parameters
    ----------
    categories_column : sized iterable
        The entries to count, e.g. a pandas Series; must support ``len()``
        when tqdm is available (used as the progress-bar total).

    Returns
    -------
    dict
        Maps each category token (plus '' and 'nan' where they occur) to the
        number of times it was seen.
    """
    # The progress bar is purely cosmetic, so a missing tqdm must not break counting.
    try:
        from tqdm import tqdm
    except ImportError:
        tqdm = None

    result = dict()

    rows = categories_column
    if tqdm is not None:
        rows = tqdm(categories_column, total=len(categories_column))

    for categories in rows:
        if isinstance(categories, str):
            cleaned = (categories.replace('[', '').replace(']', '')
                       .replace("'", "").replace(', ', ',').lower())
            for category in cleaned.split(','):
                result[category] = result.get(category, 0) + 1
        else:
            result['nan'] = result.get('nan', 0) + 1

    return result

In [80]:
# Category counts over the whole dataset (including the '' and 'nan' buckets).
c = get_categories(df['categories'])

100%|██████████| 1576832/1576832 [00:18<00:00, 87467.29it/s] 


In [81]:
# Order the categories by descending count. A lambda key is used because the
# `operator` module is never imported in this notebook, so the previous
# `operator.itemgetter(1)` raised NameError on a fresh kernel.
sorted_desc = dict(sorted(c.items(), key=lambda item: item[1], reverse=True))

In [82]:
# Remove the empty-category bucket; the membership check keeps the cell
# safe to re-run (a plain `del` raises KeyError the second time).
if '' in sorted_desc:
    del sorted_desc['']

In [83]:
# Remove the 'nan' bucket; the membership check keeps the cell
# safe to re-run (a plain `del` raises KeyError the second time).
if 'nan' in sorted_desc:
    del sorted_desc['nan']

### Choose frequent categories

In [84]:
# Keep only the categories seen at least 100 times; the order of sorted_desc is preserved.
top_categories = {
    category: count
    for category, count in sorted_desc.items()
    if count >= 100
}

In [85]:
# Report how many categories passed the frequency filter.
n_top_categories = len(top_categories)
print('There are', n_top_categories, 'categories with number of examples above 100.')

There are 214 categories with number of examples above 100.


In [86]:
# Counts of the categories seen fewer than 100 times.
rare_counts = [count for count in sorted_desc.values() if count < 100]

cat_count = len(rare_counts)  # how many such categories there are
cat_sum = sum(rare_counts)    # how many examples they cover together

In [87]:
# Report the number of rare categories and their combined example count.
print(
    'There are ',
    cat_count,
    'categories with have number of examples below 100 and they have together',
    cat_sum,
    'examples.',
)

There are  1985 categories with have number of examples below 100 and they have together 5075 examples.


## Save

In [88]:
# Persist the frequent categories as a pickle (binary data despite the .txt suffix).
with open('files/categories_above_100_examples.txt', 'wb') as out_file:
    pickle.dump(top_categories, out_file)

# Group by season and city

In [89]:
# Count categories per city, restricted to the rows whose season is 'summer'.
df_summer = df[df.season == 'summer']

cities_summer = dict()
df_by_city = df_summer.groupby('city')
for name, group in df_by_city:
    print(name)
    print(len(group))

    cities_summer[name], nan_list = get_categories_counter(group.categories)

    nan_share = np.round(len(nan_list) / len(group) * 100, 2)
    print('Number of nan elements:', len(nan_list), '(', nan_share, '%)')

    # .get avoids a KeyError for a city that contains no empty-category entry.
    empty_count = cities_summer[name].get('', 0)
    empty_share = np.round(empty_count / len(group) * 100, 2)
    print('We have ', empty_count, '(', empty_share, '%)', 'empty categories.')

Berlin
218883


100%|██████████| 218883/218883 [00:03<00:00, 71698.93it/s]


Number of nan elements: 0 ( 0.0 %)
We have  12377 ( 5.65 %) empty categories.
Bremerhaven
4373


100%|██████████| 4373/4373 [00:00<00:00, 46748.89it/s]


Number of nan elements: 0 ( 0.0 %)
We have  270 ( 6.17 %) empty categories.
Darmstadt
18242


100%|██████████| 18242/18242 [00:00<00:00, 28915.60it/s]


Number of nan elements: 0 ( 0.0 %)
We have  1050 ( 5.76 %) empty categories.
Erlangen
10908


100%|██████████| 10908/10908 [00:00<00:00, 38665.67it/s]


Number of nan elements: 0 ( 0.0 %)
We have  560 ( 5.13 %) empty categories.
Frankfurt_Am_Main
92387


100%|██████████| 92387/92387 [00:01<00:00, 50616.90it/s]


Number of nan elements: 0 ( 0.0 %)
We have  5060 ( 5.48 %) empty categories.
Hamburg
117115


100%|██████████| 117115/117115 [00:01<00:00, 63678.13it/s]


Number of nan elements: 0 ( 0.0 %)
We have  6054 ( 5.17 %) empty categories.
Koeln
69527


100%|██████████| 69527/69527 [00:00<00:00, 70692.29it/s]


Number of nan elements: 0 ( 0.0 %)
We have  3588 ( 5.16 %) empty categories.
Munich
142046


100%|██████████| 142046/142046 [00:02<00:00, 68721.27it/s]


Number of nan elements: 0 ( 0.0 %)
We have  7239 ( 5.1 %) empty categories.
Potsdam
5883


100%|██████████| 5883/5883 [00:00<00:00, 34305.22it/s]


Number of nan elements: 0 ( 0.0 %)
We have  357 ( 6.07 %) empty categories.
Siegen
7332


100%|██████████| 7332/7332 [00:00<00:00, 38008.07it/s]


Number of nan elements: 0 ( 0.0 %)
We have  399 ( 5.44 %) empty categories.


In [90]:
# Count categories per city, restricted to the rows whose season is 'winter'.
df_winter = df[df.season == 'winter']

cities_winter = dict()
df_by_city = df_winter.groupby('city')
for name, group in df_by_city:
    print(name)
    print(len(group))

    cities_winter[name], nan_list = get_categories_counter(group.categories)

    nan_share = np.round(len(nan_list) / len(group) * 100, 2)
    print('Number of nan elements:', len(nan_list), '(', nan_share, '%)')

    # .get avoids a KeyError for a city that contains no empty-category entry.
    empty_count = cities_winter[name].get('', 0)
    empty_share = np.round(empty_count / len(group) * 100, 2)
    print('We have ', empty_count, '(', empty_share, '%)', 'empty categories.')

Berlin
150886


100%|██████████| 150886/150886 [00:02<00:00, 66632.20it/s]


Number of nan elements: 0 ( 0.0 %)
We have  7472 ( 4.95 %) empty categories.
Bremerhaven
3021


100%|██████████| 3021/3021 [00:00<00:00, 36894.56it/s]


Number of nan elements: 0 ( 0.0 %)
We have  110 ( 3.64 %) empty categories.
Darmstadt
9501


100%|██████████| 9501/9501 [00:00<00:00, 59877.03it/s]


Number of nan elements: 0 ( 0.0 %)
We have  490 ( 5.16 %) empty categories.
Erlangen
8196


100%|██████████| 8196/8196 [00:00<00:00, 45776.52it/s]


Number of nan elements: 0 ( 0.0 %)
We have  400 ( 4.88 %) empty categories.
Frankfurt_Am_Main
115085


100%|██████████| 115085/115085 [00:01<00:00, 57740.74it/s]


Number of nan elements: 0 ( 0.0 %)
We have  4898 ( 4.26 %) empty categories.
Hamburg
89458


100%|██████████| 89458/89458 [00:01<00:00, 77804.59it/s]


Number of nan elements: 0 ( 0.0 %)
We have  4403 ( 4.92 %) empty categories.
Koeln
47021


100%|██████████| 47021/47021 [00:00<00:00, 70693.27it/s]


Number of nan elements: 0 ( 0.0 %)
We have  2196 ( 4.67 %) empty categories.
Munich
107072


100%|██████████| 107072/107072 [00:01<00:00, 62029.04it/s]


Number of nan elements: 0 ( 0.0 %)
We have  5222 ( 4.88 %) empty categories.
Potsdam
3703


100%|██████████| 3703/3703 [00:00<00:00, 41637.64it/s]


Number of nan elements: 0 ( 0.0 %)
We have  207 ( 5.59 %) empty categories.
Siegen
5353


100%|██████████| 5353/5353 [00:00<00:00, 41312.74it/s]


Number of nan elements: 0 ( 0.0 %)
We have  266 ( 4.97 %) empty categories.


In [91]:
# Strip the empty-string and 'nan' buckets from every per-city counter of both
# seasons. dict.pop with a default is a no-op for a missing key, so the bare
# `except: pass` blocks are no longer needed and unrelated errors (e.g. a
# misspelled variable) are no longer silently swallowed.
for season_counters in (cities_summer, cities_winter):
    for city_counter in season_counters.values():
        city_counter.pop('', None)
        city_counter.pop('nan', None)

## Save

In [92]:
# Persist the summer per-city counters as a pickle (binary data despite the .txt suffix).
with open('files/to_plot/categories_by_cities_summer.txt', 'wb') as out_file:
    pickle.dump(cities_summer, out_file)

In [93]:
# Persist the winter per-city counters as a pickle (binary data despite the .txt suffix).
with open('files/to_plot/categories_by_cities_winter.txt', 'wb') as out_file:
    pickle.dump(cities_winter, out_file)