### Imagine
you are an instant noodle lover who works in a food company.

Your boss has tasked you to analyze the existing instant noodles out there and sugget a good flavour to sell.

Realizing that you can put your Python skills to good use, you decide to use the Rame Rater's dataset and train a machine learning model to predict the best noodle flavour.

# Part 4: Training a Machine Learning Model

In [20]:
# Import Libraries
import pandas as pd

# Import the machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import Matrix
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv("final_biglist.csv")

In [3]:
df

Unnamed: 0,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork,from_Acecook,from_Indomie,from_Itsuki,...,from_Indonesia,from_Japan,from_Malaysia,from_Others,from_Singapore,from_South Korea,from_Taiwan,from_Thailand,from_United States,from_Vietnam
0,5.00,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.00,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.50,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,4.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,4.00,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,5.00,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,3.50,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,3.75,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
# Let us explore our stars
df["Stars"].value_counts()

# As you can see- there is a need to group our stars together

5.000    667
3.500    543
3.750    535
4.000    439
4.500    243
3.250    222
4.250    217
3.000    172
2.750    110
2.000     95
2.500     95
4.750     93
1.500     50
0.000     40
1.000     38
2.250     29
1.750     28
0.500     20
0.250     17
1.250     14
1.100      2
3.100      2
2.900      2
0.750      2
2.800      2
4.125      2
2.850      1
2.300      1
3.600      1
3.700      1
3.650      1
0.100      1
3.400      1
3.125      1
1.800      1
3.200      1
2.100      1
2.125      1
0.900      1
Name: Stars, dtype: int64

In [6]:
df["Stars"].median()  # Thus halfway point is 3.75

3.75

Therefore, we can make it such that noodles are not tasty <= 3.75 and noodles are tasty > 3.75. We could do this using a for loop, but we can also do this by using a pandas function.

In [7]:
pd.qcut(df["Stars"], q = 2, labels = [0,1])  
# q asks for how many groups we wish to create
# and lables is what we wish each group to be called

0       1
1       0
2       1
3       1
4       0
5       1
6       1
7       1
8       0
9       0
10      0
11      1
12      1
13      1
14      1
15      1
16      0
17      1
18      1
19      1
20      1
21      1
22      0
23      0
24      1
25      1
26      0
27      1
28      1
29      1
       ..
3662    0
3663    0
3664    1
3665    1
3666    1
3667    0
3668    0
3669    1
3670    0
3671    1
3672    0
3673    1
3674    0
3675    0
3676    0
3677    0
3678    0
3679    1
3680    0
3681    0
3682    0
3683    0
3684    0
3685    0
3686    0
3687    0
3688    0
3689    0
3690    0
3691    0
Name: Stars, Length: 3692, dtype: category
Categories (2, int64): [0 < 1]

The function automatically finds the median and gives a 0 for stars <= 3.75 and a 1 for stars> 3.75

In [40]:
# Thus, let us define a new column
df["binStars"] = pd.qcut(df["Stars"], q = 2, labels = [0,1])  

In [41]:
df

Unnamed: 0,Stars,is_spicy,has_chicken,has_beef,has_seafood,is_veg,has_pork,from_Acecook,from_Indomie,from_Itsuki,...,from_Japan,from_Malaysia,from_Others,from_Singapore,from_South Korea,from_Taiwan,from_Thailand,from_United States,from_Vietnam,binStars
0,5.00,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,3.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.00,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4.50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3.50,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,4.50,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,4.00,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
7,5.00,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
8,3.50,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,3.75,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [42]:
df["binStars"].value_counts()

0    2031
1    1661
Name: binStars, dtype: int64

In [43]:
# Prepare independent and dependent variables
X = df.drop(["Stars","binStars"], axis = 1)
y = df["binStars"]

In [44]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   test_size = 0.2,   # 20%
                                                   stratify = y)      # need to ensure equal ration of 0 and 1 in both test and train

## Dummy Classifier

In [45]:
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

In [46]:
confusion_matrix(y_test,dummy_pred)

array([[243, 164],
       [171, 161]], dtype=int64)

In [47]:
 print(classification_report(y_test,dummy_pred))

              precision    recall  f1-score   support

           0       0.59      0.60      0.59       407
           1       0.50      0.48      0.49       332

   micro avg       0.55      0.55      0.55       739
   macro avg       0.54      0.54      0.54       739
weighted avg       0.55      0.55      0.55       739



## Logistic Regression

In [63]:
logr = LogisticRegression()
logr.fit(X_train, y_train)
logr_pred = logr.predict(X_test)



In [64]:
confusion_matrix(y_test,logr_pred)

array([[278, 129],
       [143, 189]], dtype=int64)

In [50]:
 print(classification_report(y_test,logr_pred))

              precision    recall  f1-score   support

           0       0.55      0.50      0.52       407
           1       0.45      0.50      0.47       332

   micro avg       0.50      0.50      0.50       739
   macro avg       0.50      0.50      0.50       739
weighted avg       0.50      0.50      0.50       739



## Decision Tree Classifier

In [51]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

In [52]:
confusion_matrix(y_test, tree_pred)

array([[280, 127],
       [159, 173]], dtype=int64)

In [53]:
 print(classification_report(y_test,tree_pred))

              precision    recall  f1-score   support

           0       0.64      0.69      0.66       407
           1       0.58      0.52      0.55       332

   micro avg       0.61      0.61      0.61       739
   macro avg       0.61      0.60      0.60       739
weighted avg       0.61      0.61      0.61       739



## Random Forest Classifier

In [54]:
forest = DecisionTreeClassifier()
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [55]:
confusion_matrix(y_test, forest_pred)

array([[279, 128],
       [158, 174]], dtype=int64)

In [56]:
 print(classification_report(y_test,forest_pred))

              precision    recall  f1-score   support

           0       0.64      0.69      0.66       407
           1       0.58      0.52      0.55       332

   micro avg       0.61      0.61      0.61       739
   macro avg       0.61      0.60      0.61       739
weighted avg       0.61      0.61      0.61       739



## Best?
Logistic Regression performed the best so we will proceed on with that model

In [65]:
logr.coef_   #forest.feature_importances_  is the forest version

array([[ 0.43105213, -0.53578635, -0.27264783, -0.19102763, -0.69346263,
        -0.2396344 , -0.91952661,  0.25177426,  0.28897877,  0.27492241,
        -0.31521454,  0.00932053,  0.7821303 , -0.22553831,  0.72089239,
        -0.4932887 , -0.2423543 ,  0.39891887,  2.03878719, -0.05490845,
         0.14166907,  0.62622212, -0.37499843, -1.00672746,  0.34410131,
         0.47823845, -0.17378206,  0.00925971, -1.47647622, -0.30824184,
        -0.61711835,  0.12429517, -0.52142775, -0.70086947, -0.08088334,
         0.3802256 , -0.42164532,  1.07631002,  0.15383294,  0.02937759,
         0.36769726,  1.24209835,  0.64097741,  0.52580419, -0.7778799 ,
         0.52273357,  0.11005851,  0.68473864, -1.10015608, -0.07946502,
        -1.35893679]])

In [70]:
logr_feature = pd.DataFrame({"feature": X.columns,
                            "importance": logr.coef_[0]})

logr_feature.sort_values(by = "importance", ascending = False)

Unnamed: 0,feature,importance
18,from_MyKuali,2.038787
41,from_Indonesia,1.242098
37,is_Others,1.07631
12,from_MAMA,0.78213
14,from_Mama,0.720892
47,from_Taiwan,0.684739
42,from_Japan,0.640977
21,from_Nongshim,0.626222
43,from_Malaysia,0.525804
45,from_Singapore,0.522734
