In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project modules imported in the
# next cell take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [103]:
from collections import defaultdict
from fuzzywuzzy import fuzz
from tqdm import tqdm, tqdm_notebook
import random 
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression

from knowledge_graph import KG
from task_utils import *

In [23]:
# Build the knowledge graph from the food "usage" dataset directory.
FOOD_USAGE_DIR = "data/food/usage"
kg = KG(FOOD_USAGE_DIR)

Loading data from ../data/food/usage...
Loaded 987899 triples.


In [24]:
# Load the prediction tasks and their train/valid/test splits. Later cells read
# them as kg.tasks[<task name>][<split>]['X' or 'Y'] (presumably populated by
# this call).
kg.load_tasks()

Task: category_prediction
category_prediction (train): 13132 samples
category_prediction (valid): 1641 samples
category_prediction (test): 1642 samples

Task: energy_prediction
energy_prediction (train): 13600 samples
energy_prediction (valid): 1700 samples
energy_prediction (test): 1701 samples

Task: grade_prediction
grade_prediction (train): 13600 samples
grade_prediction (valid): 1700 samples
grade_prediction (test): 1701 samples



## Task 1: Category Prediction

In [39]:
# Pull the category-prediction task: X holds the entities to classify and Y
# the corresponding targets, for the train and test splits.
category_task = kg.tasks['category_prediction']
xtr, ytr = category_task['train']['X'], category_task['train']['Y']
xts, yts = category_task['test']['X'], category_task['test']['Y']

In [37]:
# Featurize the train and test entities identically. Every triple headed by
# one of the entities contributes a "$feature $value" string, except triples
# whose tail_type is "entity", which are skipped.
train_dict = defaultdict(list)  # entity -> list of "$feature $value"
test_dict = defaultdict(list)   # entity -> list of "$feature $value"

for split_entities, split_features in ((xtr, train_dict), (xts, test_dict)):
    triples = kg.filter_triples(head_filter=split_entities)
    for head, arc, tail, tail_type, _ in tqdm(triples):
        if tail_type != "entity":
            split_features[head].append("%s %s" % (arc, tail))

# One space-joined document per entity, kept in xtr / xts order so that row i
# of each feature matrix corresponds to xtr[i] / xts[i]. Entities without any
# collected feature yield an empty document.
train_sent = [' '.join(train_dict[entity]) for entity in xtr]
test_sent = [' '.join(test_dict[entity]) for entity in xts]

# Unigram and bigram counts; the vocabulary is learned from the training
# documents only and reused unchanged for the test documents.
vectorizer = CountVectorizer(ngram_range=(1, 2))
Xtr = vectorizer.fit_transform(train_sent)
Xts = vectorizer.transform(test_sent)


100%|██████████| 712317/712317 [00:00<00:00, 1298007.87it/s]
100%|██████████| 88757/88757 [00:00<00:00, 822255.79it/s]


In [45]:
# transform labels: encode each sample's collection of labels as a row of a
# binary indicator matrix. The label set is learned from the training labels
# only; the fitted binarizer is then applied to the test labels.
mlb = MultiLabelBinarizer()
Ytr = mlb.fit_transform(ytr)
Yts = mlb.transform(yts)

In [48]:
# Train the multilabel classifier: one-vs-rest over the label columns, with a
# linear-kernel SVM as the per-label estimator.
linear_svm = SVC(kernel='linear')
classif = OneVsRestClassifier(linear_svm)
classif.fit(Xtr, Ytr)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [64]:
# Predict the label-indicator matrix for the test feature matrix.
y_preds = classif.predict(Xts)

In [75]:
# Score the predicted label matrix against the binarized test labels and
# report precision / recall / F1.
scores = evaluate_multilabel_classification(Yts, y_preds)
precision, recall, f1 = scores
print("Precision: %f. Recall: %f. F1: %f" % (precision, recall, f1))

Precision: 0.855257. Recall: 0.612395. F1: 0.713732


## Task 2: Energy Prediction

In [81]:
# Pull the energy-prediction task: X holds the entities and Y the regression
# targets, which are converted to floats.
energy_task = kg.tasks['energy_prediction']
xtr, xts = energy_task['train']['X'], energy_task['test']['X']
ytr = list(map(float, energy_task['train']['Y']))
yts = list(map(float, energy_task['test']['Y']))

In [77]:
# Featurize the train and test entities identically. Every triple headed by
# one of the entities contributes a "$feature $value" string, except triples
# whose tail_type is "entity", which are skipped.
train_dict = defaultdict(list)  # entity -> list of "$feature $value"
test_dict = defaultdict(list)   # entity -> list of "$feature $value"

for split_entities, split_features in ((xtr, train_dict), (xts, test_dict)):
    triples = kg.filter_triples(head_filter=split_entities)
    for head, arc, tail, tail_type, _ in tqdm(triples):
        if tail_type != "entity":
            split_features[head].append("%s %s" % (arc, tail))

# One space-joined document per entity, kept in xtr / xts order so that row i
# of each feature matrix corresponds to xtr[i] / xts[i]. Entities without any
# collected feature yield an empty document.
train_sent = [' '.join(train_dict[entity]) for entity in xtr]
test_sent = [' '.join(test_dict[entity]) for entity in xts]

# Unigram and bigram counts; the vocabulary is learned from the training
# documents only and reused unchanged for the test documents.
vectorizer = CountVectorizer(ngram_range=(1, 2))
Xtr = vectorizer.fit_transform(train_sent)
Xts = vectorizer.transform(test_sent)


100%|██████████| 736734/736734 [00:00<00:00, 1246211.61it/s]
100%|██████████| 92577/92577 [00:00<00:00, 866647.21it/s]


In [84]:
# Fit a linear regression on the n-gram count features, then predict the
# energy value for every test entity.
model = LinearRegression()
model.fit(Xtr, ytr)
predictions = model.predict(Xts)

In [86]:
# Compare the predicted energy values with the float test targets and report
# the resulting error (evaluate_regression is not defined in this notebook;
# presumably it comes from the task_utils star import).
mse = evaluate_regression(yts, predictions)
print("Mean-square error:", mse)

Mean-square error: 118114.93527223503


## Task 3: Grade Prediction


In [88]:
# Pull the grade-prediction task: X holds the entities to classify and Y the
# corresponding grade targets, for the train and test splits.
grade_task = kg.tasks['grade_prediction']
xtr, ytr = grade_task['train']['X'], grade_task['train']['Y']
xts, yts = grade_task['test']['X'], grade_task['test']['Y']

In [91]:
# Featurize the train and test entities identically. Every triple headed by
# one of the entities contributes a "$feature $value" string, except triples
# whose tail_type is "entity", which are skipped.
train_dict = defaultdict(list)  # entity -> list of "$feature $value"
test_dict = defaultdict(list)   # entity -> list of "$feature $value"

for split_entities, split_features in ((xtr, train_dict), (xts, test_dict)):
    triples = kg.filter_triples(head_filter=split_entities)
    for head, arc, tail, tail_type, _ in tqdm(triples):
        if tail_type != "entity":
            split_features[head].append("%s %s" % (arc, tail))

# One space-joined document per entity, kept in xtr / xts order so that row i
# of each feature matrix corresponds to xtr[i] / xts[i]. Entities without any
# collected feature yield an empty document.
train_sent = [' '.join(train_dict[entity]) for entity in xtr]
test_sent = [' '.join(test_dict[entity]) for entity in xts]

# vectorize features: unigram and bigram counts, with the vocabulary learned
# from the training documents only
vectorizer = CountVectorizer(ngram_range=(1, 2))
Xtr = vectorizer.fit_transform(train_sent)
Xts = vectorizer.transform(test_sent)

# convert grades to integer class labels; the encoder is fitted on the
# training grades and then applied to the test grades
le = LabelEncoder()
Ytr = le.fit_transform(ytr)
Yts = le.transform(yts)

100%|██████████| 738457/738457 [00:00<00:00, 1254897.60it/s]
100%|██████████| 92533/92533 [00:00<00:00, 845979.12it/s]


In [102]:
# Fit a logistic-regression classifier (library defaults) on the n-gram count
# features, then predict the encoded grade for every test entity.
model = LogisticRegression()
model.fit(Xtr, Ytr)
predictions = model.predict(Xts)

In [104]:
# Compare the predicted grade labels with the encoded test labels and report
# accuracy (evaluate_multi_classification is not defined in this notebook;
# presumably it comes from the task_utils star import).
accuracy = evaluate_multi_classification(Yts, predictions)
print("Accuracy: %f" % accuracy)

Accuracy: 0.865961
