Merge pull request #205 from cchrewrite/dev

Update food recommendation model
nusdbsystem · Nov 10, 2022 · aa15621 · aa15621
2 parents d7c68a7 + 54b54de
commit aa15621
Show file tree

Hide file tree

Showing 3 changed files with 485 additions and 12 deletions.
diff --git a/examples/models/food_analysis/MLPFoodRecommendationModel.py b/examples/models/food_analysis/MLPFoodRecommendationModel.py
@@ -23,10 +23,9 @@
 import argparse
 import os
 import random
-from sklearn.ensemble import RandomForestClassifier
 from sklearn.neural_network import MLPClassifier
 
-from singa_auto.model import BaseModel, utils
+from singa_auto.model import BaseModel, IntegerKnob, utils
 from singa_auto.constants import ModelDependency
 from singa_auto.model.dev import test_model_class
 from singa_auto.datasets.image_classification_dataset import ImageDataset4Clf
@@ -42,7 +41,10 @@ class MLPFoodRecommendationModel(BaseModel):
 
     @staticmethod
     def get_knob_config():
-        return {}
+        return {
+            'num_hid_layers': IntegerKnob(3, 6),
+            'num_hid_units': IntegerKnob(64, 512)
+        }
 
     def __init__(self, **knobs):
 
@@ -169,7 +171,7 @@ def knowledge_base_weight_normalisation(self):
                 for t in self.KB[h][p]:
                     self.KB[h][p][t] = self.KB[h][p][t] / imp_sum
         return 0
-    
+
     def sample_triples_from_knowledge_base(self, entity, n):
         result = []
         h = entity
@@ -256,9 +258,7 @@ def read_dataset(self, fpath):
                 for j in range(encodings.shape[1]):
                     feat.append(encodings[i][j])
                     tgt.append(int(y[1]))
-                    #input(y[0])
-                    #input(feat[-1])
-                    #input(tgt[-1])
+
         #feat = np.array(feat)
         #tgt = np.array(tgt) 
         return feat, tgt
@@ -307,10 +307,13 @@ def train(self, dataset_path, work_dir,  **kwargs):
 
         self.clf = dict()
         for tag in self.tag_list:
-            self.clf[tag] = MLPClassifier(random_state = 1, max_iter = 100, solver = "lbfgs", hidden_layer_sizes = (128, 128, 128, 128))
-            # can use random forests if MLP is too slow.
-            #self.clf[tag] = RandomForestClassifier(n_estimators = 10, random_state=0)
+            num_hid_layers = self._knobs.get("num_hid_layers")
+            num_hid_units = self._knobs.get("num_hid_units")
+            hidden_layer_sizes = [int(num_hid_units)] * int(num_hid_layers)
+
+            self.clf[tag] = MLPClassifier(random_state = 1, max_iter=1000, solver = "lbfgs", hidden_layer_sizes = hidden_layer_sizes)
 
+            #self.clf[tag] = MLPClassifier(random_state = 1, max_iter = 100, solver = "lbfgs", hidden_layer_sizes = (128, 128, 128, 128))
 
         print("Reading knowledge base...")
         kb_path = "%s/training_data/food_knowledge_base.tri"%work_dir
@@ -410,7 +413,7 @@ def print_knowledge_graph(self):
 
     (args, _) = parser.parse_known_args()
 
-    queries = [str(["海菜", "puerpera_tag"]), str(["鱼肉", "pregnant_tag"]), str(["Mars", "pregnant_tag"])]
+    queries = [str(["海菜", "puerpera_tag"]), str(["鱼肉", "pregnant_tag"]), str(["Milk", "pregnant_tag"])]
 
     test_model_class(model_file_path=__file__,
                      model_class='MLPFoodRecommendationModel',
@@ -419,5 +422,6 @@ def print_knowledge_graph(self):
                      train_dataset_path=args.train_path,
                      val_dataset_path=args.val_path,
                      #test_dataset_path=args.test_path,
+                     budget={'MODEL_TRIAL_COUNT': 10, 'TIME_HOURS': 1.0},
                      queries=queries)
 
diff --git a/examples/models/food_analysis/README.md b/examples/models/food_analysis/README.md
@@ -1 +1,48 @@
-This folder includes food analysis models.
+# Singa-Auto Demo - Food Recommendation.
+
+This folder contains a number of models for food recommendation with knowledge graphs.
+
+## Dataset Preparation
+
+The training data and the evaluation data should each be compressed into a single .tar file. The two tar files contain a "training_data" folder and an "evaluation_data" folder, respectively.
+
+The "training_data" folder has the following files:
+
+(1) food_knowledge_base.tri: It is a knowledge base containing triples of the form "\<subject\> \<predicate\> \<object\>". For example:
+
+milk contain protein  
+milk contain vitamin_a  
+protein is_good_for brain  
+...  
+prawn contain protein  
+
+(2) tag_list.txt: It contains N prediction tags. For example:
+
+pregnant_tag  
+puerpera_tag  
+lactation_tag  
+baby_tag  
+
+(3) N files named "\<tag_name\>_training.txt". Each file contains training data of the form "\<entity\> \<class\>", which indicates the class of the entity with respect to the tag. For example, if Class 0 denotes food suitable for a baby and Class 1 denotes food not suitable for a baby, we have a file named "baby_tag_training.txt":
+
+milk 0  
+prawn 1  
+...  
+orange 0  
+
+The data is used to train classifiers that predict the probability that a given entity belongs to each class. The trained classifiers are evaluated using the "evaluation_data" folder, which has N files named "\<tag_name\>_evaluation.txt". The format of the evaluation data is the same as that of the training data.
+
+## Prediction/Inference
+
+A query should be a Python list of strings. Each string is the string form of a two-element list "[\<entity\>, \<tag\>]", as produced by calling `str()` on that list. For example:
+
+[str(["milk", "baby_tag"]), str(["prawn", "baby_tag"]), str(["orange", "baby_tag"])]
+
+The model will return the probability that a given entity belongs to each class with respect to a given tag.
+
+## Model Description
+
+There are two models:
+1. RFFoodRecommendationModel.py, which is a random forest. It performs auto parameter tuning on the number of estimators.
+2. MLPFoodRecommendationModel.py, which is a feedforward neural network. It performs auto parameter tuning on the number of hidden layers and hidden units.
+