# **Get Data and Import Necessary Libraries**

In [1]:
# get data from github
!git clone https://github.com/AndreaJJCC/CategorySuggestion.git
  
# Import necessary libraries
import nltk
nltk.download('punkt')
import json
import pandas as pd
import re 
import os
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter

fatal: destination path 'CategorySuggestion' already exists and is not an empty directory.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Define file paths
main_dir = '/content/CategorySuggestion/'
business_dir = main_dir + 'yelp_academic_dataset_business.json'
categories_dir = main_dir + 'categories.json'

# Unzip business json file
os.chdir('/content/CategorySuggestion/')
!unzip -o /content/CategorySuggestion/yelp_academic_dataset_business.zip

Archive:  /content/CategorySuggestion/yelp_academic_dataset_business.zip
  inflating: yelp_academic_dataset_business.json  


# **Define Helper Functions**

In [0]:
# Define function to load files
# Input: file directory
# Output: dictionary of json objects
def load_data( directory):
  with open(directory) as f:
    data = []
    for line in f:
      data.append(json.loads(line))
  return data

In [0]:
# Define function to convert text to lowercase and  remove punctuation
def normalize( col ):
  return col.astype(str)\
            .str.lower()\
            .str.replace('[^a-z,0-9]', '')\
            .str.replace(',', ' ')

# Load Data

In [5]:
# Load business records as dictionaries
# and convert dictionaries to pandas dataframe
business_df = pd.DataFrame.from_dict(load_data(business_dir))
business_records = business_df.shape[0]
print('Business data\n' + 'Loaded ' + str(business_records) + ' records.')
business_df.head(2) 


Business data
Loaded 188593 records.


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV


In [6]:
with open(categories_dir) as f: 
    data = json.load(f)
categories_df = pd.DataFrame.from_dict(data)
categories_df.head(5)

Unnamed: 0,alias,country_blacklist,country_whitelist,parents,title
0,3dprinting,,,[localservices],3D Printing
1,abruzzese,,[IT],[italian],Abruzzese
2,absinthebars,,[CZ],[bars],Absinthe Bars
3,acaibowls,"[PL, IT, TR, CL, AR, MX]",,[food],Acai Bowls
4,accessories,,,[fashion],Accessories


# **Preprocess: Normalize**

In [7]:
# Convert categories column to lowercase
business_df['categories'] = normalize(business_df.categories)
print(business_df.categories[0:4])

0    tours breweries pizza restaurants food hotelst...
1    chickenwings burgers caterers streetvendors ba...
2    breakfastbrunch restaurants french sandwiches ...
3                          insurance financialservices
Name: categories, dtype: object


In [8]:
# Tokenize categories column
business_df['categories'] = business_df['categories'].apply(nltk.word_tokenize)
print(business_df.categories[0:4])

0    [tours, breweries, pizza, restaurants, food, h...
1    [chickenwings, burgers, caterers, streetvendor...
2    [breakfastbrunch, restaurants, french, sandwic...
3                       [insurance, financialservices]
Name: categories, dtype: object


In [9]:
categories_df = categories_df.drop( columns = ['country_blacklist', 'country_whitelist'])
categories_df['parents'] = normalize( categories_df.parents)
categories_df['title'] = normalize( categories_df.title)
categories_df.head(10)

Unnamed: 0,alias,parents,title
0,3dprinting,localservices,3dprinting
1,abruzzese,italian,abruzzese
2,absinthebars,bars,absinthebars
3,acaibowls,food,acaibowls
4,accessories,fashion,accessories
5,accountants,professional,accountants
6,acnetreatment,beautysvc,acnetreatment
7,active,,activelife
8,acupuncture,health,acupuncture
9,addictionmedicine,physicians,addictionmedicine


# **Explore Distribution**

In [10]:
# Create a flat list of categories from all records/businesses
# For each record, iterate over its categories list and append each category
# to 'flat_list'
flat_list = []
for sublist in business_df.categories:
    for item in sublist:
        flat_list.append(item)
print('Size of flat list is', len(flat_list))

Size of flat list is 739563


In [11]:
# Get a distribution count of categories
# For each category in 'flat_list', check if item is in 'distribution' dictionary
# if the category doesn't exist, add it to the dictionary, else add 1 to its value
distribution = dict()
for item in flat_list:
    if item not in distribution:
      distribution[item] = 1
    else:
      distribution[item] += 1

# Print the number of different categories in the distribution
print('Size of distribution list is ', len(distribution))
# Check that the number of items in distribution is same as flat_list length
print('Number of items in distribution is ', sum(distribution.values()))
print(distribution)

Size of distribution list is  1306
Number of items in distribution is  739563
{'tours': 639, 'breweries': 508, 'pizza': 6603, 'restaurants': 57173, 'food': 27118, 'hotelstravel': 5808, 'chickenwings': 2538, 'burgers': 5126, 'caterers': 2184, 'streetvendors': 304, 'barbeque': 1716, 'foodtrucks': 804, 'eventplanningservices': 9774, 'breakfastbrunch': 5023, 'french': 977, 'sandwiches': 6912, 'cafes': 3056, 'insurance': 869, 'financialservices': 2932, 'homegarden': 6020, 'nurseriesgardening': 615, 'shopping': 30231, 'localservices': 12906, 'automotive': 12656, 'electronicsrepair': 611, 'coffeetea': 6936, 'bakeries': 3509, 'thai': 1393, 'mexican': 4419, 'flowersgifts': 2058, 'giftshops': 718, 'japanese': 2566, 'cajuncreole': 274, 'southern': 493, 'bars': 10853, 'sportsbars': 2055, 'divebars': 580, 'nightlife': 12438, 'pakistani': 397, 'indian': 1417, 'middleeastern': 1182, 'beautyspas': 18967, 'barbers': 1932, 'delis': 1835, 'americantraditional': 6659, 'tapassmallplates': 428, 'poutineries

In [12]:
# Sort the list in descending order of the counts for each category label
sorted_list = sorted(distribution.items(), key=itemgetter(1), reverse = True)
sorted_distribution = dict(sorted_list)
print(sorted_distribution)

{'restaurants': 57173, 'shopping': 30231, 'food': 27118, 'beautyspas': 18967, 'homeservices': 18634, 'healthmedical': 16157, 'localservices': 12906, 'automotive': 12656, 'nightlife': 12438, 'bars': 10853, 'eventplanningservices': 9774, 'activelife': 9119, 'fashion': 7406, 'coffeetea': 6936, 'sandwiches': 6912, 'hairsalons': 6825, 'fastfood': 6812, 'americantraditional': 6659, 'pizza': 6603, 'homegarden': 6020, 'autorepair': 5877, 'hotelstravel': 5808, 'artsentertainment': 5794, 'professionalservices': 5620, 'doctors': 5450, 'realestate': 5295, 'burgers': 5126, 'breakfastbrunch': 5023, 'nailsalons': 4839, 'fitnessinstruction': 4559, 'italian': 4550, 'mexican': 4419, 'specialtyfood': 4304, 'chinese': 4247, 'americannew': 4230, 'pets': 4001, 'hairremoval': 3766, 'bakeries': 3509, 'dentists': 3436, 'grocery': 3380, 'skincare': 3230, 'cafes': 3056, 'education': 3033, 'desserts': 2991, 'financialservices': 2932, 'contractors': 2853, 'womensclothing': 2817, 'petservices': 2761, 'generaldentis

**Get Hierarchy**

In [16]:
level_1 = dict()
level_1list = []
roots_df = pd.DataFrame(columns = categories_df.columns.values)
for item in range(categories_df.shape[0]):
  if categories_df.iloc[item].parents == '':
    roots_df.loc[len(roots_df)] = categories_df.iloc[item]
    level_1[categories_df.iloc[item].title] = 0
print('Level 1 before filtering labels')
print(level_1)
##################################################
# pick categories to ignore and remove from list #
##################################################

ignore_roots = ['activelife', 'artsentertainment', 'bicycles', 'education', 'eventplanningservices', 'financialservices', 'hotelstravel', 'localflavor', 'massmedia', 'pets', 'professionalservices', 'publicservicesgovernment', 'religiousorganizations']
# Drop roots from dataframe
roots_df = roots_df[~roots_df.title.isin( ignore_roots) ]
# Remove roots from dictionary
for label in ignore_roots:
  level_1.pop(label)

level_1list = list(roots_df.title)
print('Level 1 after filtering labels')
print(level_1)
print('Level 1 list')
print(level_1list)
print('Roots dataframe')
roots_df

Level 1 before filtering labels
{'activelife': 0, 'artsentertainment': 0, 'automotive': 0, 'beautyspas': 0, 'bicycles': 0, 'education': 0, 'eventplanningservices': 0, 'financialservices': 0, 'food': 0, 'healthmedical': 0, 'homeservices': 0, 'hotelstravel': 0, 'localflavor': 0, 'localservices': 0, 'massmedia': 0, 'nightlife': 0, 'pets': 0, 'professionalservices': 0, 'publicservicesgovernment': 0, 'religiousorganizations': 0, 'restaurants': 0, 'shopping': 0}
Level 1 after filtering labels
{'automotive': 0, 'beautyspas': 0, 'food': 0, 'healthmedical': 0, 'homeservices': 0, 'localservices': 0, 'nightlife': 0, 'restaurants': 0, 'shopping': 0}
Level 1 list
['automotive', 'beautyspas', 'food', 'healthmedical', 'homeservices', 'localservices', 'nightlife', 'restaurants', 'shopping']
Roots dataframe


Unnamed: 0,alias,parents,title
2,auto,,automotive
3,beautysvc,,beautyspas
8,food,,food
9,health,,healthmedical
10,homeservices,,homeservices
13,localservices,,localservices
15,nightlife,,nightlife
20,restaurants,,restaurants
21,shopping,,shopping


In [17]:
level_2_df = categories_df.loc[categories_df['parents'].isin(level_1list)]
level_2_df.head(5)

Unnamed: 0,alias,parents,title
0,3dprinting,localservices,3dprinting
3,acaibowls,food,acaibowls
10,adoptionservices,localservices,adoptionservices
11,adult,shopping,adult
13,adultentertainment,nightlife,adultentertainment


In [21]:
children_df = pd.DataFrame(columns = categories_df.columns.values)
level1children = { parent: [] for parent in level_1.keys() }
for item in range(level_2_df.shape[0]):
  if level_2_df.iloc[item].parents in level_1:
    level_1[level_2_df.iloc[item].parents] = level_1[level_2_df.iloc[item].parents] + 1
    level1children[level_2_df.iloc[item].parents].append(level_2_df.iloc[item].title)
    children_df.loc[len(children_df)] = level_2_df.iloc[item]
print('Number of level 1 children')
print(level_1)
print(level1children)

Number of level 1 children
{'automotive': 0, 'beautyspas': 0, 'food': 198, 'healthmedical': 0, 'homeservices': 243, 'localservices': 265, 'nightlife': 42, 'restaurants': 573, 'shopping': 240}
{'automotive': [], 'beautyspas': [], 'food': ['acaibowls', 'backshop', 'bagels', 'bakeries', 'beer winespirits', 'bento', 'beveragestore', 'breweries', 'bubbletea', 'butcher', 'patisseriecakeshop', 'chimneycakes', 'churros', 'cideries', 'coffeetea', 'coffeeroasteries', 'coffeeteasupplies', 'conveniencestores', 'csa', 'cupcakes', 'customcakes', 'delicatessen', 'desserts', 'distilleries', 'doityourselffood', 'donairs', 'donuts', 'empanadas', 'ethicalgrocery', 'farmersmarket', 'fishmonger', 'fooddeliveryservices', 'foodtrucks', 'friterie', 'gelato', 'mulledwine', 'specialtyfood', 'grocery', 'hawkercentre', 'honey', 'icecreamfrozenyogurt', 'importedfood', 'internetcafes', 'internationalgrocery', 'japanesesweets', 'juicebarssmoothies', 'kombucha', 'meaderies', 'milkshakebars', 'nasilemak', 'organicstor

In [22]:
children_df

Unnamed: 0,alias,parents,title
0,3dprinting,localservices,3dprinting
1,acaibowls,food,acaibowls
2,adoptionservices,localservices,adoptionservices
3,adult,shopping,adult
4,adultentertainment,nightlife,adultentertainment
5,afghani,restaurants,afghan
6,african,restaurants,african
7,airductcleaning,localservices,airductcleaning
8,andalusian,restaurants,andalusian
9,antiques,shopping,antiques


In [0]:
# Filter/ Drop level 2 categories


# **IGNORE**

In [0]:
level1 = ['restaurants', 'shopping', 'food', 'beautyspas', 'homeservices', 'healthmedical', 'localservices', 'automotive', 'nightlife']
level2 = ['3dprinting', 'acaibowls', 'adoptionservices', 'adult', 'adultentertainment', 'afghan', 'african', 'airductcleaning', 'andalusian', 'antiques', 'appraisalservices', 'arabian', 'argentine', 'armenian', 'artificialturf', 'artinstallation', 'artrestoration', 'artscrafts', 'asianfusion', 'asturian', 'auctionhouses', 'australian', 'austrian', 'awnings', 'babygearfurniture', 'backshop', 'bagels', 'baguettes', 'bailbondsmen', 'bakeries', 'bangladeshi', 'barcrawl', 'bars', 'basque', 'batterystores', 'bavarian', 'barbeque', 'beerwinespirits', 'beergarden', 'beergardens', 'beerhall', 'beisl', 'belgian', 'bento', 'bespokeclothing', 'beveragestore', 'bikerepairmaintenance', 'biohazardcleanup', 'bistros', 'blacksea', 'shadesblinds', 'bookbinding', 'brasseries', 'brazilian', 'breakfastbrunch', 'breweries', 'brewingsupplies', 'bridal', 'british', 'bubbletea', 'buffets', 'buildingsupplies', 'bulgarian', 'burgers', 'burmese', 'busrental', 'butcher', 'cabinetry', 'cafes', 'cafeteria', 'cajuncreole', 'patisseriecakeshop', 'calligraphy', 'cambodian', 'cannabisdispensaries', 'canteen', 'caribbean', 'carpenters', 'carpetcleaning', 'carpetdyeing', 'carpeting', 'carpetinstallation', 'catalan', 'mobilephoneaccessories', 'cheesesteaks', 'chickenwings', 'chickenshop', 'childcaredaycare', 'childproofing', 'chilean', 'chimneycakes', 'chimneysweeps', 'chinese', 'chinesebazaar', 'churros', 'cideries', 'clockrepair', 'clubcrawl', 'coffeetea', 'coffeeroasteries', 'coffeeshops', 'coffeeteasupplies', 'comedyclubs', 'comfortfood', 'communitybookbox', 'communitygardens', 'computers', 'conceptshops', 'contractors', 'conveniencestores', 'printingservices', 'corsican', 'countertopinstallation', 'countrydancehalls', 'couriersdeliveryservices', 'craneservices', 'creperies', 'csa', 'cuban', 'cupcakes', 'currysausage', 'customcakes', 'customizedmerchandise', 'cypriot', 'czech', 'czechslovakian', 'damagerestoration', 'danceclubs', 'dancerestaurants', 'danish', 'decksrailing', 'delicatessen', 'delis', 'demolitionservices', 'desserts', 'diamondbuyers', 'diners', 'dinnertheater', 'discountstore', 'distilleries', 'doityourselffood', 'donairs', 'donationcenter', 'donuts', 'doorsalesinstallation', 'drones', 'drugstores', 'drywallinstallationrepair', 'dumplings', 'dutyfreeshops', 'easterneuropean', 'eldercareplanning', 'electricians', 'electronics', 'electronicsrepair', 'elevatorservices', 'empanadas', 'engraving', 'environmentalabatement', 'environmentaltesting', 'eritrean', 'ethicalgrocery', 'ethiopian', 'excavationservices', 'farmequipmentrepair', 'farmersmarket', 'farmingequipment', 'fashion', 'fasilmusic', 'fencesgates', 'filipino', 'fingerprinting', 'fireplaceservices', 'fireprotectionservices', 'firewood', 'fireworks', 'fischbroetchen', 'fishmonger', 'fishchips', 'fitnessexerciseequipment', 'flatbread', 'fleamarkets', 'flooring', 'flowersgifts', 'fondue', 'foodcourt', 'fooddeliveryservices', 'foodstands', 'foodtrucks', 'forestry', 'foundationrepair', 'freiduria', 'french', 'friterie', 'funeralservicescemeteries', 'furnitureassembly', 'furniturerepair', 'galician', 'gamemeat', 'garagedoorservices', 'gardeners', 'gastropubs', 'gelato', 'gemstonesminerals', 'generatorinstallationrepair', 'georgian', 'german', 'gestorias', 'giblets', 'glassmirrors', 'mulledwine', 'glutenfree', 'goldbuyers', 'specialtyfood', 'greek', 'grillservices', 'grocery', 'groutservices', 'guamanian', 'gunsammo', 'gunsmith', 'gutterservices', 'halal', 'handyman', 'hawaiian', 'hawkercentre', 'hazardouswastedisposal', 'headshops', 'heuriger', 'highfidelityaudioequipment', 'himalayannepalese', 'hongkongstylecafe', 'hobbyshops', 'homeinspectors', 'homeorganization', 'homegarden', 'appliancesrepair', 'homeautomation', 'homecleaning', 'homeenergyauditors', 'homenetworkinstallation', 'hometheatreinstallation', 'homewindowtinting', 'honduran', 'honey', 'horseequipmentshops', 'hotdogs', 'fastfood', 'hotpot', 'housesitters', 'hungarian', 'heatingairconditioninghvac', 'hydrojetting', 'iberian', 'icecreamfrozenyogurt', 'icedelivery', 'importedfood', 'indonesian', 'indian', 'insulationinstallation', 'interiordesign', 'international', 'internetboothscallingcenters', 'internetcafes', 'internationalgrocery', 'irish', 'irrigation', 'islandpub', 'israeli', 'italian', 'itservicescomputerrepair', 'japanese', 'jewelry', 'jewelryrepair', 'jewish', 'japanesesweets', 'juicebarssmoothies', 'junkremovalhauling', 'junkyards', 'karaoke', 'kebab', 'knifesharpening', 'knittingsupplies', 'kombucha', 'kopitiam', 'korean', 'kosher', 'kurdish', 'landscapearchitects', 'landscaping', 'laos', 'laotian', 'latinamerican', 'laundryservices', 'lightingfixturesequipment', 'livestockfeedsupply', 'keyslocksmiths', 'luggage', 'lyonnais', 'machinetoolrental', 'machineshops', 'mailboxcenters', 'malaysian', 'marketstalls', 'masonryconcrete', 'meaderies', 'meatballs', 'booksmagsmusicvideo', 'medicalsupplies', 'mediterranean', 'metaldetectorservices', 'metalfabricators', 'mexican', 'middleeastern', 'militarysurplus', 'milkbars', 'milkshakebars', 'mistingsystemservices', 'mobilehomerepair', 'mobilephones', 'modernaustralian', 'moderneuropean', 'mongolian', 'moroccan', 'motorcyclegear', 'movers', 'musicalinstrumentsteachers', 'musicalinstrumentservices', 'nannyservices', 'nasilemak', 'americannew', 'canadiannew', 'newmexicancuisine', 'newzealand', 'nicaraguan', 'nightfood', 'nikkei', 'communityservicenonprofit', 'noodles', 'norcinerie', 'traditionalnorwegian', 'notaries', 'officeequipment', 'opensandwiches', 'eyewearopticians', 'organicstores', 'oriental', 'outdoorpowerequipmentservices', 'outletstores', 'packingservices', 'packingsupplies', 'painters', 'pakistani', 'panasian', 'panzerotti', 'parma', 'patiocoverings', 'pawnshops', 'persianiranian', 'personalshopping', 'peruvian', 'pestcontrol', 'pfcomercial', 'photographystoresservices', 'piadina', 'pianobars', 'pita', 'pizza', 'plumbing', 'poke', 'polish', 'polynesian', 'poolbilliards', 'poolcleaners', 'poolhalls', 'poolhottubservice', 'popuprestaurants', 'popupshops', 'portabletoiletservices', 'portuguese', 'potatoes', 'poutineries', 'powdercoating', 'pressurewashers', 'pretzels', 'propane', 'props', 'pubfood', 'publicmarkets', 'liverawfood', 'realestate', 'recordlabels', 'recordingrehearsalstudios', 'recyclingcenter', 'refinishingservices', 'religiousitems', 'furniturerental', 'furniturereupholstery', 'rice', 'romanian', 'roofing', 'roofinspectors', 'rotisseriechicken', 'russian', 'safestores', 'safetyequipment', 'salad', 'salumerie', 'sandblasting', 'sandwiches', 'saunainstallationrepair', 'scandinavian', 'scandinaviandesign', 'schnitzel', 'scottish', 'screenprintingtshirtprinting', 'screenprinting', 'seafood', 'holidaydecoratingservices', 'securitysystems', 'selfstorage', 'septicservices', 'serbocroatian', 'sewingalterations', 'shavedice', 'shavedsnow', 'shippingcenters', 'shoerepair', 'shoeshine', 'shoppingcenters', 'shoppingpassages', 'shutters', 'signaturecuisine', 'singaporean', 'slovakian', 'smokehouse', 'smokingareas', 'snowremoval', 'snuggleservices', 'solarinstallation', 'solarpanelcleaning', 'somali', 'soulfood', 'soup', 'southern', 'souvenirshops', 'spanish', 'spiritualshop', 'sportinggoods', 'srilankan', 'steakhouses', 'stonemasons', 'streetvendors', 'structuralengineers', 'stuccoservices', 'frenchsouthwest', 'sugarshacks', 'supperclubs', 'sushibars', 'swabian', 'swedish', 'swissfood', 'syrian', 'tabernas', 'tabletopgames', 'taiwanese', 'tapasbars', 'tapassmallplates', 'tavolacalda', 'tearooms', 'teachersupplies', 'televisionserviceproviders', 'texmex', 'thai', 'thriftstores', 'tickets', 'tiling', 'tobaccoshops', 'torshi', 'tortillas', 'toystores', 'americantraditional', 'traditionalswedish', 'trattorie', 'treeservices', 'trophyshops', 'turkish', 'tvmounting', 'ukrainian', 'uniforms', 'usedbookstore', 'utilities', 'uzbek', 'vapeshops', 'vegan', 'vegetarian', 'venison', 'vietnamese', 'siding', 'vitaminssupplements', 'waffles', 'wallpapering', 'watchrepair', 'watches', 'waterdelivery', 'waterheaterinstallationrepair', 'waterproofing', 'waterpurificationservices', 'waterstores', 'welldrilling', 'wholesalestores', 'wigs', 'wildlifecontrol', 'windowsinstallation', 'windowwashing', 'wok', 'wraps', 'youthclub', 'yugoslav', 'zapiekanka']
level3 = []

In [0]:
# Plot the values of the most popular categories
label, value = zip(*sorted_list)
# get top categories
labels = np.array(label[0:9])
values = np.array(value[0:9])

indexes = np.arange(len(labels))

bar_width = 0.1

plt.bar(labels, values)

# add labels
plt.xticks(rotation = 90, fontsize = 14)
#plt.xticks(indexes + bar_width) #, labels)
plt.title('Level 1 Categories', fontsize = 20)
plt.ylabel('Number of category occurrences', fontsize = 16)
plt.xlabel('Category Label', fontsize = 16)
plt.show()

Bars is #10 on count but not a root
Fashion # on count but not a root
coffeetea # on count but not a root
sandwiches # on count but not a root
hairsalons # on count but not a root
fastfood # on count but not a root
americantraditional
pizza
homegarden
autorepair
doctors



In [0]:
for item in level1:
  if item in sorted_distribution:
    sorted_distribution.pop(item)
    
print('Size of distribution list after removing roots ', len(sorted_distribution))
print(sorted_distribution)

In [0]:
for item in level2:
  if item in sorted_distribution:
    sorted_distribution.pop(item)
print('Size of distribution list after removing level 2 ', len(sorted_distribution))
print(sorted_distribution)