### Imagine
you are an instant noodle lover who works in a food company.

Your boss has tasked you to analyze the existing instant noodles out there and suggest a good flavour to sell.

Realizing that you can put your Python skills to good use, you decide to use the Ramen Rater's dataset and train a machine learning model to predict the best noodle flavour.

# Part 3: Engineering Features and Preparing Data

In [1]:
# Import Libraries
import pandas as pd

In [2]:
# Read the CSV produced in Part 2; the file name is relative to the
# notebook's working directory.
df = pd.read_csv("thebiglist_clean_extended.csv")

In [3]:
# Display the loaded dataframe to check its columns before engineering features
df

Unnamed: 0,Brand,Variety,Style,Country,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork
0,Higashimaru,Seafood Sara Udon,Pack,Japan,5.00,0,0,0,1,0,0
1,Single Grain,Chongqing Spicy & Sour Rice Noodles,Cup,China,3.50,1,0,0,0,0,0
2,Sau Tao,Seafood Flavour Sichuan Spicy Noodle,Pack,Hong Kong,5.00,1,0,0,1,0,0
3,Sau Tao,Jiangnan Style Noodle - Original Flavour,Pack,Hong Kong,4.50,0,0,0,0,0,0
4,Sapporo Ichiban,CupStar Shio Ramen,Cup,Japan,3.50,0,0,0,0,0,0
5,Sichuan Baijia,Big Boss Broad Noodle Chili Oil Flavor (Sour &...,Cup,China,4.50,1,0,0,0,0,0
6,Nissin,Top Ramen Masala Noodles,Pack,India,4.00,0,0,0,0,0,0
7,Maruchan,Miyashi Chuka Cold Noodle,Pack,Japan,5.00,0,0,0,0,0,0
8,Yamamoto Seifun,Tanukioyaji Super Spicy Mazemen,Bowl,Japan,3.50,1,0,0,0,0,0
9,Kenko Foods,Michio Kawamura Nature Ramen Shio,Pack,Japan,3.75,0,0,0,0,0,0


In [10]:
# Build a shorter brand feature: keep only the 30 most frequent brands.
# value_counts() orders brands from most to least frequent, so the first 30
# index labels are the names of the top-30 brands.
# NOTE(review): the result lists both 'Mama' and 'MAMA' - presumably the same
# brand with inconsistent casing; confirm and normalise in the cleaning step.
brand_30 = df["Brand"].value_counts().head(30).index
brand_30

Index(['Nissin', 'Maruchan', 'Nongshim', 'Myojo', 'Samyang Foods', 'Paldo',
       'Mama', 'Sapporo Ichiban', 'Indomie', 'Ottogi', 'Sau Tao', 'Acecook',
       'KOKA', 'Maggi', 'Vifon', 'MyKuali', 'Vina Acecook', 'Lucky Me!',
       'Mamee', 'MAMA', 'Ve Wong', 'Master Kong', 'Vedan', 'JML', 'Wei Lih',
       'Wai Wai', 'A-Sha Dry Noodle', 'Wu-Mu', 'Yum Yum', 'Itsuki'],
      dtype='object')

In [12]:
# Create a new column called newBrand.
# Brands in the top 30 keep their own name; any other brand is placed under
# "Other" so the number of distinct brands stays small.
# Vectorised instead of a row-by-row loop: isin() flags the rows whose brand
# is in brand_30, and where() keeps those values and substitutes "Other" for
# the rest (missing brands are not in brand_30, so they also become "Other").
newBrand = df["Brand"].where(df["Brand"].isin(brand_30), "Other")

df["newBrand"] = newBrand
df

Unnamed: 0,Brand,Variety,Style,Country,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork,newBrand
0,Higashimaru,Seafood Sara Udon,Pack,Japan,5.00,0,0,0,1,0,0,Other
1,Single Grain,Chongqing Spicy & Sour Rice Noodles,Cup,China,3.50,1,0,0,0,0,0,Other
2,Sau Tao,Seafood Flavour Sichuan Spicy Noodle,Pack,Hong Kong,5.00,1,0,0,1,0,0,Sau Tao
3,Sau Tao,Jiangnan Style Noodle - Original Flavour,Pack,Hong Kong,4.50,0,0,0,0,0,0,Sau Tao
4,Sapporo Ichiban,CupStar Shio Ramen,Cup,Japan,3.50,0,0,0,0,0,0,Sapporo Ichiban
5,Sichuan Baijia,Big Boss Broad Noodle Chili Oil Flavor (Sour &...,Cup,China,4.50,1,0,0,0,0,0,Other
6,Nissin,Top Ramen Masala Noodles,Pack,India,4.00,0,0,0,0,0,0,Nissin
7,Maruchan,Miyashi Chuka Cold Noodle,Pack,Japan,5.00,0,0,0,0,0,0,Maruchan
8,Yamamoto Seifun,Tanukioyaji Super Spicy Mazemen,Bowl,Japan,3.50,1,0,0,0,0,0,Other
9,Kenko Foods,Michio Kawamura Nature Ramen Shio,Pack,Japan,3.75,0,0,0,0,0,0,Other


In [16]:
# Check how many rows ended up under each grouped brand, including "Other"
df["newBrand"].value_counts()

Other               1781
Nissin               476
Maruchan             131
Nongshim             119
Myojo                111
Samyang Foods        102
Paldo                 84
Mama                  71
Sapporo Ichiban       69
Indomie               56
Ottogi                50
Acecook               48
Sau Tao               48
KOKA                  39
Maggi                 38
Vifon                 36
MyKuali               35
Vina Acecook          34
Lucky Me!             34
Mamee                 34
MAMA                  33
Ve Wong               32
Master Kong           29
Vedan                 28
JML                   28
Wei Lih               28
Wai Wai               26
A-Sha Dry Noodle      26
Wu-Mu                 23
Yum Yum               23
Itsuki                20
Name: newBrand, dtype: int64

In [17]:
# Now check whether Style needs the same kind of grouping as Brand
df["Style"].value_counts()

Pack          2085
Bowl           722
Cup            659
Tray           167
Box             54
Restaurant       3
Can              1
Bar              1
Name: Style, dtype: int64

The last four styles (Box, Restaurant, Can and Bar) are rare enough that we can group them together as "Others".

In [24]:
# Keep the four most frequent styles and group every other style as "Others".
style_4 = df["Style"].value_counts()[:4].index

# Vectorised instead of a row-by-row loop: rows whose style is in style_4
# keep their value, all remaining rows are replaced with "Others".
newStyle = df["Style"].where(df["Style"].isin(style_4), "Others")

df["newStyle"] = newStyle
df

Unnamed: 0,Brand,Variety,Style,Country,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork,newBrand,newStyle
0,Higashimaru,Seafood Sara Udon,Pack,Japan,5.00,0,0,0,1,0,0,Other,Pack
1,Single Grain,Chongqing Spicy & Sour Rice Noodles,Cup,China,3.50,1,0,0,0,0,0,Other,Cup
2,Sau Tao,Seafood Flavour Sichuan Spicy Noodle,Pack,Hong Kong,5.00,1,0,0,1,0,0,Sau Tao,Pack
3,Sau Tao,Jiangnan Style Noodle - Original Flavour,Pack,Hong Kong,4.50,0,0,0,0,0,0,Sau Tao,Pack
4,Sapporo Ichiban,CupStar Shio Ramen,Cup,Japan,3.50,0,0,0,0,0,0,Sapporo Ichiban,Cup
5,Sichuan Baijia,Big Boss Broad Noodle Chili Oil Flavor (Sour &...,Cup,China,4.50,1,0,0,0,0,0,Other,Cup
6,Nissin,Top Ramen Masala Noodles,Pack,India,4.00,0,0,0,0,0,0,Nissin,Pack
7,Maruchan,Miyashi Chuka Cold Noodle,Pack,Japan,5.00,0,0,0,0,0,0,Maruchan,Pack
8,Yamamoto Seifun,Tanukioyaji Super Spicy Mazemen,Bowl,Japan,3.50,1,0,0,0,0,0,Other,Bowl
9,Kenko Foods,Michio Kawamura Nature Ramen Shio,Pack,Japan,3.75,0,0,0,0,0,0,Other,Pack


In [25]:
# Confirm that the rare styles were merged into "Others"
df["newStyle"].value_counts()

Pack      2085
Bowl       722
Cup        659
Tray       167
Others      59
Name: newStyle, dtype: int64

In [30]:
# Once more, this time checking how the rows are spread across Country
df["Country"].value_counts()

Japan                 681
United States         458
South Korea           411
Taiwan                372
China                 245
Thailand              212
Malaysia              207
Hong Kong             191
Indonesia             161
Singapore             140
Vietnam               124
UK                     75
Canada                 56
Philippines            51
India                  45
Mexico                 32
Germany                28
Australia              25
Brazil                 24
Netherlands            16
Nepal                  14
Myanmar                14
Bangladesh             12
Pakistan                9
Hungary                 9
France                  6
Poland                  6
Colombia                6
Sarawak                 5
Russia                  5
Cambodia                5
Holland                 4
Peru                    4
Fiji                    4
Italy                   4
Ukraine                 3
Sweden                  3
Spain                   3
Israel      

We can probably group everything from UK downwards into "Others" and keep only the 11 most common countries (Japan through Vietnam).

In [31]:
# Keep the most frequent countries and group the rest as "Others".
# NOTE: despite its name, country_10 holds the top 11 countries ([:11]),
# i.e. everything ranked above UK in the value counts.
country_10 = df["Country"].value_counts()[:11].index

# Vectorised instead of a row-by-row loop: rows whose country is in
# country_10 keep their value, all remaining rows are replaced with "Others".
newCountry = df["Country"].where(df["Country"].isin(country_10), "Others")

df["newCountry"] = newCountry
df

Unnamed: 0,Brand,Variety,Style,Country,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork,newBrand,newStyle,newCountry
0,Higashimaru,Seafood Sara Udon,Pack,Japan,5.00,0,0,0,1,0,0,Other,Pack,Japan
1,Single Grain,Chongqing Spicy & Sour Rice Noodles,Cup,China,3.50,1,0,0,0,0,0,Other,Cup,China
2,Sau Tao,Seafood Flavour Sichuan Spicy Noodle,Pack,Hong Kong,5.00,1,0,0,1,0,0,Sau Tao,Pack,Hong Kong
3,Sau Tao,Jiangnan Style Noodle - Original Flavour,Pack,Hong Kong,4.50,0,0,0,0,0,0,Sau Tao,Pack,Hong Kong
4,Sapporo Ichiban,CupStar Shio Ramen,Cup,Japan,3.50,0,0,0,0,0,0,Sapporo Ichiban,Cup,Japan
5,Sichuan Baijia,Big Boss Broad Noodle Chili Oil Flavor (Sour &...,Cup,China,4.50,1,0,0,0,0,0,Other,Cup,China
6,Nissin,Top Ramen Masala Noodles,Pack,India,4.00,0,0,0,0,0,0,Nissin,Pack,Others
7,Maruchan,Miyashi Chuka Cold Noodle,Pack,Japan,5.00,0,0,0,0,0,0,Maruchan,Pack,Japan
8,Yamamoto Seifun,Tanukioyaji Super Spicy Mazemen,Bowl,Japan,3.50,1,0,0,0,0,0,Other,Bowl,Japan
9,Kenko Foods,Michio Kawamura Nature Ramen Shio,Pack,Japan,3.75,0,0,0,0,0,0,Other,Pack,Japan


In [33]:
# Confirm the grouped country counts, including the new "Others" bucket
df["newCountry"].value_counts()

Japan            681
Others           490
United States    458
South Korea      411
Taiwan           372
China            245
Thailand         212
Malaysia         207
Hong Kong        191
Indonesia        161
Singapore        140
Vietnam          124
Name: newCountry, dtype: int64

### Creating Dummies and Final Dataframe

Creating dummies (one-hot encoding) turns each categorical feature into multiple 0/1 columns, which we can then combine into a final dataframe.

In [37]:
# One-hot encode the grouped country into "from_<country>" indicator columns.
# drop_first=True omits the first category's column, so that category is
# represented by all of the remaining country columns being 0.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be written to the CSV as True/False instead.
countryDummy = pd.get_dummies(df["newCountry"], drop_first=True,
                              prefix="from", dtype=int)

In [38]:
# One-hot encode the grouped brand into "from_<brand>" indicator columns.
# drop_first=True omits the first category's column, so that category is
# represented by all of the remaining brand columns being 0.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be written to the CSV as True/False instead.
# NOTE(review): this shares the "from" prefix with the country dummies; the
# column names stay distinct only because brands use the label "Other" while
# countries use "Others". The prefix is kept so the output columns are unchanged.
brandDummy = pd.get_dummies(df["newBrand"], drop_first=True,
                            prefix="from", dtype=int)

In [44]:
# One-hot encode the grouped style into "is_<style>" indicator columns.
# drop_first=True omits the first category's column, so that category is
# represented by all of the remaining style columns being 0.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be written to the CSV as True/False instead.
styleDummy = pd.get_dummies(df["newStyle"], drop_first=True,
                            prefix="is", dtype=int)

In [41]:
# Strip the raw text columns and the grouped categorical columns from df,
# leaving the star rating and the existing 0/1 flag columns. The result is a
# new frame; df itself is not modified.
df_temp = df.drop(
    columns=[
        "Brand", "Country", "Style", "Variety",
        "newBrand", "newCountry", "newStyle",
    ]
)

In [42]:
# Inspect the columns that remain after dropping the text/categorical ones
df_temp

Unnamed: 0,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork
0,5.00,0,0,0,1,0,0
1,3.50,1,0,0,0,0,0
2,5.00,1,0,0,1,0,0
3,4.50,0,0,0,0,0,0
4,3.50,0,0,0,0,0,0
5,4.50,1,0,0,0,0,0
6,4.00,0,0,0,0,0,0
7,5.00,0,0,0,0,0,0
8,3.50,1,0,0,0,0,0
9,3.75,0,0,0,0,0,0


In [48]:
# Place the remaining columns and the brand, style and country dummy frames
# side by side (column-wise) to form the final feature table.
df_final = pd.concat(
    [df_temp, brandDummy, styleDummy, countryDummy],
    axis="columns",
)

In [49]:
# Inspect the combined feature table before saving it
df_final

Unnamed: 0,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork,from_Acecook,from_Indomie,from_Itsuki,...,from_Indonesia,from_Japan,from_Malaysia,from_Others,from_Singapore,from_South Korea,from_Taiwan,from_Thailand,from_United States,from_Vietnam
0,5.00,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.00,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.50,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,4.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,4.00,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,5.00,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,3.50,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,3.75,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [52]:
# Save the final feature table without the row index.
# index=False is the documented way to omit it; the previous index=None only
# worked because None happens to be falsy.
df_final.to_csv("final_biglist.csv", index=False)

In [53]:
# Read the saved file back in to check that the export looks as expected
pd.read_csv("final_biglist.csv")

Unnamed: 0,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork,from_Acecook,from_Indomie,from_Itsuki,...,from_Indonesia,from_Japan,from_Malaysia,from_Others,from_Singapore,from_South Korea,from_Taiwan,from_Thailand,from_United States,from_Vietnam
0,5.00,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.00,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.50,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,4.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,4.00,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,5.00,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,3.50,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,3.75,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
