In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## First Pass

In [2]:
# Load the recipe keyword table; later cells read its 'Keywords' and 'Name' columns.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
keyword_df = pd.read_pickle('../data/keywords.pk')


In [5]:
# Flatten every recipe's keyword collection into one long list (duplicates kept).
keyword_list_orig = [kw for kwl in keyword_df.Keywords for kw in kwl]

# The same keywords with duplicates removed; going through set() makes the order arbitrary.
keyword_list_orig_nr = list(set(keyword_list_orig))

In [6]:
def region_check(word):
    """Heuristically decide whether a keyword looks like a region/cuisine name.

    A word is accepted when any of the following holds:
      * it ends in 'ian', 'ino', 'can' or 'lan';
      * it ends in 'ish' but not in 'fish';
      * it ends in 'ese' but not in 'eese' (which rules out e.g. 'cheese');
      * it contains the substring 'west'.

    Parameters
    ----------
    word : str
        Keyword to test.

    Returns
    -------
    bool
        True if the keyword matches one of the rules above, else False.
    """
    if word.endswith(('ian', 'ino', 'can', 'lan')):
        return True
    # endswith() is used instead of indexing word[-4], which raised IndexError
    # for three-character inputs such as 'ish' or 'ese'.
    if word.endswith('ish') and not word.endswith('fish'):
        return True
    if word.endswith('ese') and not word.endswith('eese'):
        return True
    return 'west' in word

# Keywords that pass the region_check heuristic.
regions = list(filter(region_check, keyword_list_orig_nr))

# Region keywords added by hand on top of the heuristic matches.
regions.extend([
    'thai', 'new zealand','iraqi', 'creole', 'welsh','szechuan', 'hunan','korean',
    'european', 'icelandic', 'pennsylvania dutch','pakistani', 'honduran', 'dutch',
    'ecuadorean','chilean','czech','swiss','cuban', 'tex mex','greek','caribbean',
    'cajun',
])

In [5]:
# Number of recipes whose name contains the substring 'english'.
sum(1 for r in keyword_df.index if 'english' in keyword_df['Name'][r])

174

Most of the region keywords indicate country of origin, while others only indicate continent. To make everything consistent, we will make sure that whenever a country appears as a keyword, the continent that contains it also appears as a keyword. 

Note that a few regions (English, British, American and French) don't appear as keywords in the original dataframe; we add each of them to a recipe's keywords whenever it appears in the recipe's name.

In [7]:
# Region names to pick up from the recipe name rather than from its keywords.
missing_regions = ['english','british','american','french']

# For each recipe: its original keywords followed by every missing region
# whose text occurs as a substring of the recipe name.
newkeywords = [
    keyword_df['Keywords'][rec] + [reg for reg in missing_regions if reg in keyword_df['Name'][rec]]
    for rec in keyword_df.index
]

keyword_df['Keywords_ext'] = newkeywords

In [8]:
# Make the name-derived regions part of the recognised region list (in-place append).
regions.extend(missing_regions)

In [9]:
# Collect, per recipe, the keywords that are recognised regions.
# The lookup uses Keywords_ext (original keywords plus the regions detected in
# the recipe name) rather than the raw Keywords column; otherwise the
# name-derived regions added to `regions` could never show up in RegionList.
region_set = set(regions)  # built once instead of once per row
keyword_df['RegionList'] = [list(region_set.intersection(kws)) for kws in keyword_df.Keywords_ext]

In [10]:
def region_vec(rkwl:list)->list:
    """Encode a list of region keywords as a 0/1 indicator vector.

    The vector has one entry per element of the module-level ``regions``
    list, in the same order; an entry is 1 when that region occurs in
    ``rkwl`` and 0 otherwise.
    """
    return [1 if name in rkwl else 0 for name in regions]


In [11]:
# Substrings of a (lower-case) recipe name that hint at a cuisine.
# 'marsala' (as in chicken marsala) is listed here; it used to be filed under
# the Indian list, where 'masala' was evidently intended.
italian_words= ['italy','lasagna','spaghetti','pasta','pizza','pesto','bruschetta','caprese','penne','linguini','alfredo','bolognese',
                     'risotto','gnocchi','tuscan','lombardi','ziti','tiramisu','tetrazzini','osso bucco','milanese','cacciatore',
                     'sicilian','biscotti','focaccia','tortellini','stromboli','mozzarella','calzone','cavatelli','marinara','orecchiette',
                     'limoncello','piccata','manicotti','pomodoro','crostini','ravioli', 'mostaccioli','saltimbocca','fettuccini','linguine',
                     'minestrone','parmigian','scarpariello','marsala']
chinese_words = ['chang']
english_words = ['bubble and squeak','shepherd','yorkshire','devonshire','faggots','trifle','meat pie']
irish_words = ['irish','dublin','guinness','blarney','st. patrick','st patty','st. patty']
german_words = ['german','schnitzel','wurst','suppe','barvarian','bavarian','spaetzle','schwein','sauer','rahmapfelkuchen',
                'tropfkrapfen','kartoffelpfannkuchen','rindergulasch','gedunstet',' mit ']
french_words = ['creme brulee','boulangere','bourguignonne','escargot','hollandaise','bouillabaisse','burgundy','parisienne','ratatouille',
                'crepe','gratin','croute','lyonnaise','de chou','quiche','en bleu']
indian_words = ['masala','tandoori','chutney']
jamaican_words = ['jamaica']
american_words = ['n. y. c.','u.s.','new york','new england','amish']
russian_words = ['russia','stroganoff']
ukrainian_words = ['ukrainian']
austrian_words = ['austrian','wolfgang puck']
caribbean_words = ['caribbean']

# Region label -> trigger substrings. One table replaces the thirteen
# copy-pasted loops; note that american_words maps to the label 'usa'.
_REGION_WORDS = {
    'italian': italian_words,
    'chinese': chinese_words,
    'english': english_words,
    'irish': irish_words,
    'german': german_words,
    'french': french_words,
    'indian': indian_words,
    'jamaican': jamaican_words,
    'usa': american_words,
    'ukrainian': ukrainian_words,
    'russian': russian_words,
    'austrian': austrian_words,
    'caribbean': caribbean_words,
}

def classify_reg(name):
    """Guess the regions a recipe belongs to from its name.

    Parameters
    ----------
    name : str
        Recipe name. Matching is plain, case-sensitive substring search, so
        the name should already be lower-case like the trigger words.

    Returns
    -------
    list of str
        Every region label for which at least one trigger substring occurs
        in ``name``. Each label appears at most once, in the fixed order of
        ``_REGION_WORDS``; the list is empty when nothing matches.
    """
    return [label for label, words in _REGION_WORDS.items()
            if any(w in name for w in words)]

In [13]:
# Merge the keyword-derived regions with the regions guessed from the recipe
# name, de-duplicating per recipe (set() makes the order within a row arbitrary).
keywords_ext = [
    list(set(keyword_df['RegionList'][rec] + classify_reg(keyword_df['Name'][rec])))
    for rec in keyword_df.index
]

keyword_df['RegionKeywords'] = keywords_ext

# Keep only the recipes that ended up with at least one region keyword.
has_region = keyword_df['RegionKeywords'].apply(len) > 0
region_df = keyword_df.loc[has_region, ['Name','RegionKeywords']].copy()

In [14]:
# Show the recipes that received at least one region keyword.
region_df

Unnamed: 0_level_0,Name,RegionKeywords
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1
49,chicken breasts lombardi,"[italian, european]"
55,betty crocker's southwestern guacamole dip,"[mexican, southwestern u.s.]"
58,low-fat burgundy beef & vegetable stew,[french]
59,lou's fabulous bruschetta,"[italian, european]"
62,"black bean, corn, and tomato salad",[polish]
...,...,...
539115,dutch fried potatoes (gebakken aardappelen),"[dutch, european]"
539152,stir fried noodles (mie goreng),"[asian, indonesian, dutch, european]"
539182,cinnamony sweet moroccan orange salad,"[moroccan, african]"
539184,belgian chocolate-fudge sauce,"[belgian, european]"


In [15]:
# Continent -> the country/cuisine keywords assigned to it.
# 'turkish' is deliberately listed under both 'european' and 'asian'.
by_continent = {
    'north american': ['mexican','canadian', 'guatemalan', 'southwestern u.s.','creole', 'pennsylvania dutch','honduran',
                       'tex mex', 'native american','costa rican','cajun','usa','american'],
    'south american': ['brazilian','ecuadorean','chilean','peruvian','colombian','venezuelan'],
    'european': ['finnish','swedish','scandinavian','russian','spanish','portuguese', 'polish', 'georgian','danish','norwegian',
                 'belgian','welsh','icelandic','dutch','czech','swiss','hungarian','scottish','turkish','austrian','italian','greek',
                 'french','english','british','irish','german','ukrainian'],
    'asian': ['cambodian','malaysian','indonesian','southwest asia (middle east)','filipino','chinese','indian','vietnamese','lebanese',
              'japanese','cantonese','thai','iraqi','szechuan','hunan', 'korean','pakistani','turkish','nepalese','mongolian','palestinian'],
    'african': ['nigerian','somalian','south african','moroccan','egyptian','sudanese','ethiopian'],
    'oceania/islands': ['polynesian','hawaiian','new zealand','australian','caribbean','cuban','puerto rican','jamaican'],
}

# Continent names, in the order they are defined above.
continents = list(by_continent)

# Every country keyword exactly once (order is arbitrary because of the set).
countries = list({country for members in by_continent.values() for country in members})

# Reverse lookup: country keyword -> list of the continents that contain it.
country_to_continents = {}
for continent, members in by_continent.items():
    for country in members:
        country_to_continents.setdefault(country, []).append(continent)

def get_continents(rkws:list)->list:
    """Return the continents implied by a list of region keywords.

    A continent is included when it appears in ``rkws`` itself, or when
    ``rkws`` contains a country that ``country_to_continents`` maps to it.
    Each continent is returned once; the order of the result is arbitrary.
    """
    keywords = set(rkws)
    found = set(continents) & keywords
    for country in keywords & set(countries):
        found.update(country_to_continents[country])
    return list(found)


In [16]:
# region_df again; the continent lookups defined in the previous cell do not modify it.
region_df

Unnamed: 0_level_0,Name,RegionKeywords
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1
49,chicken breasts lombardi,"[italian, european]"
55,betty crocker's southwestern guacamole dip,"[mexican, southwestern u.s.]"
58,low-fat burgundy beef & vegetable stew,[french]
59,lou's fabulous bruschetta,"[italian, european]"
62,"black bean, corn, and tomato salad",[polish]
...,...,...
539115,dutch fried potatoes (gebakken aardappelen),"[dutch, european]"
539152,stir fried noodles (mie goreng),"[asian, indonesian, dutch, european]"
539182,cinnamony sweet moroccan orange salad,"[moroccan, african]"
539184,belgian chocolate-fudge sauce,"[belgian, european]"


In [17]:
# Per recipe, keep the region keywords that are not continent names.
region_df['Regions'] = [
    [kw for kw in kws if kw not in continents]
    for kws in region_df['RegionKeywords']
]

In [18]:
# Persist the first-pass result (Name, RegionKeywords, Regions) to disk.
region_df.to_pickle('../data/clean_columns/regiondatav0.pk')

## Second Pass
We now add regions: a level of geographic grouping that sits between countries and continents.

In [4]:
# Reload the first-pass output from the path the first pass writes it to
# ('../data/clean_columns/'); the previous path pointed one directory up,
# where this notebook never writes regiondatav0.pk.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
regions_df = pd.read_pickle('../data/clean_columns/regiondatav0.pk')


In [40]:
# Every distinct keyword found in the per-recipe 'Regions' lists (arbitrary order).
country_list = list({country for cl in region_df.Regions for country in cl})

In [38]:
# Region -> the country/cuisine keywords grouped under it. A keyword may sit in
# more than one region (e.g. 'moroccan' is both 'arab' and 'mediterranean').
# NOTE(review): the key 'sotheast asia' is misspelled; it is kept as-is because
# the label is written into the saved data.
by_region = {
    'us/canada': ['canadian','pennsylvania dutch','native american', 'tex mex', 'southwestern u.s.','creole','cajun','american','usa'],
    'central america': ['honduran','guatemalan','mexican','costa rican'],
    'caribbean': ['cuban','puerto rican','jamaican'],
    'scandinavian': ['finnish','swedish','danish','icelandic','norwegian'],
    'british': ['scottish','welsh','english','irish'],
    'eastern europe': ['russian','ukrainian','georgian','polish'],
    'central europe': ['german','dutch','austrian','swiss','belgian','czech','hungarian'],
    'arab': ['southwest asia (middle east)','lebanese','palestinian','egyptian','moroccan','iraqi'],
    'mediterranean': ['italian','greek','turkish','moroccan','egyptian','spanish','french','portuguese'],
    'east asia': ['japanese','cantonese','chinese','hunan', 'nepalese','korean','szechuan','vietnamese','thai','filipino','mongolian', 'cambodian'],
    'sotheast asia': ['malaysian','indonesian','indian','pakistani'],
    'subsaharan africa': ['somalian','south african','nigerian','ethiopian','sudanese'],
    'south america': ['colombian', 'chilean', 'peruvian', 'ecuadorean', 'venezuelan', 'brazilian'],
    'pacific island': ['australian','polynesian','hawaiian','new zealand','japanese','thai','filipino'],
}

# Lookup from a keyword to its regions. Each region name maps to itself first,
# so a keyword that already is a region (e.g. 'caribbean') resolves too.
country_to_regions = {name: [name] for name in by_region}
for name, members in by_region.items():
    for member in members:
        country_to_regions.setdefault(member, []).append(name)

In [41]:
# Country-level keywords per recipe: the region keywords that are not continents.
# Derived from RegionKeywords instead of copying the 'Regions' column, because
# 'Regions' is overwritten with region names further down; copying it made this
# cell give a different result when re-run after that point.
region_df['Countries'] = [
    [kw for kw in kws if kw not in continents]
    for kws in region_df['RegionKeywords']
]

In [44]:
# Map each recipe's country keywords to the regions that contain them.
# The result is stored under its own name so that the module-level `regions`
# list (used by region_vec and by the RegionList cell) is not rebound to a
# list of lists.
recipe_regions = [
    # De-duplicate per recipe; set() makes the order within a row arbitrary.
    # An unmapped keyword raises KeyError, as before.
    list({region for country in countrylist for region in country_to_regions[country]})
    for countrylist in region_df.Countries
]
region_df['Regions'] = recipe_regions

In [35]:
# Save the second-pass result. This must be taken from region_df, the frame
# that actually received the 'Regions' and 'Countries' columns; regions_df is
# only the reloaded first-pass file. 'Continents' is computed here with
# get_continents because no earlier cell creates that column. assign() returns
# a new frame, so region_df itself is left unchanged.
region_out = region_df.assign(Continents=region_df['RegionKeywords'].apply(get_continents))
region_out[['Name','Continents','Regions','Countries']].to_pickle('../data/regiondatav1.pk')

In [45]:
# Show the table with the per-recipe 'Regions' and 'Countries' columns added.
region_df

Unnamed: 0_level_0,Name,RegionKeywords,Regions,Countries
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49,chicken breasts lombardi,"[italian, european]",[mediterranean],[italian]
55,betty crocker's southwestern guacamole dip,"[southwestern u.s., mexican]","[us/canada, central america]","[southwestern u.s., mexican]"
58,low-fat burgundy beef & vegetable stew,[french],[mediterranean],[french]
59,lou's fabulous bruschetta,"[italian, european]",[mediterranean],[italian]
62,"black bean, corn, and tomato salad",[polish],[eastern europe],[polish]
...,...,...,...,...
539115,dutch fried potatoes (gebakken aardappelen),"[dutch, european]",[central europe],[dutch]
539152,stir fried noodles (mie goreng),"[dutch, asian, european, indonesian]","[sotheast asia, central europe]","[dutch, indonesian]"
539182,cinnamony sweet moroccan orange salad,"[african, moroccan]","[mediterranean, arab]",[moroccan]
539184,belgian chocolate-fudge sauce,"[european, belgian]",[central europe],[belgian]
