In [1]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


In [2]:
df = pd.read_csv("data/ramen-ratings.csv")
df.columns

Index(['Review #', 'Brand', 'Variety', 'Style', 'Country', 'Stars', 'Top Ten',
       'Like'],
      dtype='object')

In [3]:
df.head()

Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten,Like
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,,0
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,,1
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,,0
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,,0
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,,1


In [4]:
print("\nBrand Column Description:")
print(df.Brand.describe())
print("\nVariety Column Description:")
print(df.Variety.describe())
print("\nStyle Column Description:")
print(df.Style.describe())
print("\nCountry Column Description:")
print(df.Country.describe())
print("\nStars Column Description:")
print(df.Stars.describe())


Brand Column Description:
count       2580
unique       355
top       Nissin
freq         381
Name: Brand, dtype: object

Variety Column Description:
count     2580
unique    2413
top       Beef
freq         7
Name: Variety, dtype: object

Style Column Description:
count     2578
unique       7
top       Pack
freq      1531
Name: Style, dtype: object

Country Column Description:
count      2580
unique       38
top       Japan
freq        352
Name: Country, dtype: object

Stars Column Description:
count     2580
unique      43
top          4
freq       393
Name: Stars, dtype: object


Feature Selection - divide columns into dependent and independent variables

In [5]:
# Display rows with missing values
print(df[df.isnull().any(axis=1)])

# Drop rows with any missing values
df = df.dropna()

print("\nShape of the cleaned dataset:", df.shape)

      Review #           Brand   
0         2580       New Touch  \
1         2579        Just Way   
2         2578          Nissin   
3         2577         Wei Lih   
4         2576  Ching's Secret   
...        ...             ...   
2575         5           Vifon   
2576         4         Wai Wai   
2577         3         Wai Wai   
2578         2         Wai Wai   
2579         1        Westbrae   

                                                Variety Style   Country Stars   
0                             T's Restaurant Tantanmen    Cup     Japan  3.75  \
1     Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...  Pack    Taiwan     1   
2                         Cup Noodles Chicken Vegetable   Cup       USA  2.25   
3                         GGE Ramen Snack Tomato Flavor  Pack    Taiwan  2.75   
4                                       Singapore Curry  Pack     India  3.75   
...                                                 ...   ...       ...   ...   
2575  Hu Tiu Nam Vang ["

In [6]:
from sklearn.preprocessing import LabelEncoder

# Clean data
# Convert 'Stars' to numeric
df['Stars'] = pd.to_numeric(df['Stars'], errors='coerce')

# Drop 'Top Ten' column
df.drop(columns=['Top Ten'], inplace=True)

# One-Hot Encoding for 'Style'
df = pd.get_dummies(df, columns=['Style'])

# Label Encoding for 'Country'
label_encoder = LabelEncoder()
df['Country'] = label_encoder.fit_transform(df['Country'])

df.head()

Unnamed: 0,Review #,Brand,Variety,Country,Stars,Like,Style_Bowl,Style_Cup,Style_Pack,Style_Tray
616,1964,MAMA,Instant Noodles Coconut Milk Flavour,5,5.0,1,False,False,True,False
633,1947,Prima Taste,Singapore Laksa Wholegrain La Mian,6,5.0,0,False,False,True,False
655,1925,Prima,Juzz's Mee Creamy Chicken Flavour,6,5.0,0,False,False,True,False
673,1907,Prima Taste,Singapore Curry Wholegrain La Mian,6,5.0,0,False,False,True,False
752,1828,Tseng Noodles,Scallion With Sichuan Pepper Flavor,8,5.0,0,False,False,True,False


In [7]:
# Define features and target variable
feature_cols = ['Stars'] + [col for col in df.columns if 'Style_' in col] + ['Country']
X = df[feature_cols]  # Features
y = df['Brand']       # Target variable

In [8]:
X.head()

Unnamed: 0,Stars,Style_Bowl,Style_Cup,Style_Pack,Style_Tray,Country
616,5.0,False,False,True,False,5
633,5.0,False,False,True,False,6
655,5.0,False,False,True,False,6
673,5.0,False,False,True,False,6
752,5.0,False,False,True,False,8


In [9]:
from sklearn.metrics import accuracy_score

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree classifier object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifier
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5384615384615384


Visualising Decision Trees

In [10]:
!pip install graphviz
!pip install pydotplus



In [11]:
# https://www.datacamp.com/tutorial/decision-tree-classification-python?utm_source=google&utm_medium=paid_search&utm_campaignid=19589720824&utm_adgroupid=157156376311&utm_device=c&utm_keyword=&utm_matchtype=&utm_network=g&utm_adpostion=&utm_creative=683184495095&utm_targetid=dsa-2218886984100&utm_loc_interest_ms=&utm_loc_physical_ms=1012728&utm_content=&utm_campaign=230119_1-sea~dsa~tofu_2-b2c_3-row-p2_4-prc_5-na_6-na_7-le_8-pdsh-go_9-na_10-na_11-na-dec23&gad_source=1&gclid=Cj0KCQiAm4WsBhCiARIsAEJIEzVpeaHgk2DjcNmV69lOqKe-j79duYNeY1UTtDF1hqu0RH6UpmncnB8aAg1iEALw_wcB

from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image
import io

# Assuming clf is your trained Decision Tree Classifier
dot_data = io.StringIO()

# Exporting the decision tree to a dot format
export_graphviz(clf, out_file=dot_data, 
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols, class_names=y.unique())

# Converting the dot data to a graph
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# Displaying the graph
Image(graph.create_png())



InvocationException: GraphViz's executables not found