In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from fuzzywuzzy import fuzz
from tqdm import tqdm, tqdm_notebook
import random 
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression

from knowledge_graph import KG
from task_utils import *

In [3]:
# Build the knowledge graph from the "usage" dataset directory.
# NOTE(review): the recorded output reports loading from "../data/uniform/usage",
# so KG presumably resolves this path against its own base directory — confirm
# before moving the notebook or the data.
kg = KG("data/uniform/usage")

Loading data from ../data/uniform/usage...
Loaded 369667 triples.


In [5]:
# Load the prediction tasks into kg.tasks; the recorded output lists "genre" and
# "type", each with train/valid/test splits.
kg.load_tasks()

Task: genre
genre (train): 1054 samples
genre (valid): 132 samples
genre (test): 132 samples

Task: type
type (train): 13265 samples
type (valid): 1658 samples
type (test): 1659 samples



## Task 1: Genre Prediction


In [6]:
# Pull the genre task's entities (X) and gold labels (Y) for the train and test splits.
genre_splits = kg.tasks['genre']
xtr, ytr = genre_splits['train']['X'], genre_splits['train']['Y']
xts, yts = genre_splits['test']['X'], genre_splits['test']['Y']

In [7]:
def featurize_entities(kg, entities):
    """Turn each entity's non-entity-valued triples into one text "sentence".

    Parameters
    ----------
    kg : object
        Knowledge graph exposing ``filter_triples(head_filter=...)``, which is
        iterated as 5-tuples ``(head, arc, tail, tail_type, _)``.
    entities : sequence
        Head entities to featurize. It is traversed more than once, so it must
        be re-iterable (a list, not a generator).

    Returns
    -------
    features : defaultdict(list)
        entity -> list of "$feature $value" strings.
    sentences : list of str
        One space-joined string per item of ``entities``, in the same order.
        An entity with no qualifying triples yields an empty string.
    """
    features = defaultdict(list)
    for head, arc, tail, tail_type, _ in tqdm(kg.filter_triples(head_filter=entities)):
        # Triples whose tail is typed "entity" are skipped; every other tail
        # is kept as an attribute value.
        if tail_type == "entity":
            continue
        features[head].append("%s %s" % (arc, tail))
    sentences = [' '.join(features[entity]) for entity in entities]
    return features, sentences


# featurize training and test data with the exact same procedure
train_dict, train_sent = featurize_entities(kg, xtr)
test_dict, test_sent = featurize_entities(kg, xts)

# vectorize features: unigram + bigram counts, vocabulary fit on the training
# sentences only and then reused for the test sentences
vectorizer = CountVectorizer(ngram_range=(1, 2))
Xtr = vectorizer.fit_transform(train_sent)
Xts = vectorizer.transform(test_sent)

# encode genre labels as class ids; the test labels go through the mapping
# learned from the training labels
le = LabelEncoder()
Ytr = le.fit_transform(ytr)
Yts = le.transform(yts)

100%|██████████| 38718/38718 [00:00<00:00, 1283146.83it/s]
100%|██████████| 8188/8188 [00:00<00:00, 1275599.34it/s]


In [8]:
# Fit a default-configured logistic regression on the genre training matrix,
# then predict a label for every test row.
model = LogisticRegression()
model.fit(Xtr, Ytr)
predictions = model.predict(Xts)

In [9]:
# Score the genre predictions against the encoded gold test labels.
# NOTE(review): evaluate_multi_classification is not defined in this notebook;
# presumably it comes from the wildcard task_utils import — confirm.
accuracy = evaluate_multi_classification(Yts, predictions)
print("Accuracy: %f" % accuracy)

Accuracy: 0.287879


## Task 2: Type Prediction


In [14]:
# Pull the type task's entities (X) and gold labels (Y) for the train and test splits.
type_splits = kg.tasks['type']
xtr, ytr = type_splits['train']['X'], type_splits['train']['Y']
xts, yts = type_splits['test']['X'], type_splits['test']['Y']

In [15]:
# Gather the non-entity-valued attributes of every training entity.
train_dict = defaultdict(list)  # entity -> list of "$feature $value"
triples = kg.filter_triples(head_filter=xtr)
for subject, relation, value, value_kind, _ in tqdm(triples):
    if value_kind != "entity":
        train_dict[subject].append("%s %s" % (relation, value))

# One space-joined "sentence" per training entity, in xtr order.
train_sent = [' '.join(train_dict[entity]) for entity in xtr]

# Same procedure for the test entities.
test_dict = defaultdict(list)  # entity -> list of "$feature $value"
triples = kg.filter_triples(head_filter=xts)
for subject, relation, value, value_kind, _ in tqdm(triples):
    if value_kind != "entity":
        test_dict[subject].append("%s %s" % (relation, value))

test_sent = [' '.join(test_dict[entity]) for entity in xts]

# Unigram + bigram counts; the vocabulary is fit on the training sentences
# only and then applied unchanged to the test sentences.
vectorizer = CountVectorizer(ngram_range=(1, 2))
Xtr = vectorizer.fit_transform(train_sent)
Xts = vectorizer.transform(test_sent)

# Encode the type labels; both splits go through the mapping learned from
# the training labels.
le = LabelEncoder().fit(ytr)
Ytr = le.transform(ytr)
Yts = le.transform(yts)

100%|██████████| 204659/204659 [00:00<00:00, 1393564.14it/s]
100%|██████████| 40843/40843 [00:00<00:00, 956002.27it/s]


In [16]:
# Fit a default-configured logistic regression on the type training matrix,
# then predict a label for every test row.
model = LogisticRegression()
model.fit(Xtr, Ytr)
predictions = model.predict(Xts)

In [17]:
# Score the type predictions against the encoded gold test labels.
# NOTE(review): evaluate_multi_classification is not defined in this notebook;
# presumably it comes from the wildcard task_utils import — confirm.
accuracy = evaluate_multi_classification(Yts, predictions)
print("Accuracy: %f" % accuracy)

Accuracy: 0.690175
