In [60]:
# Import the DMOZ domain category dataset
# (downloaded from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OMV93V)
import csv
with open('../dmoz-data/dmoz_domain_category.csv', 'r') as f:
  next(f) # Skip the header line
  reader = csv.reader(f)
  raw_categories = list(reader)

In [3]:
# Take a look at one record
raw_categories[1]

['www.232analyzer.com', 'Top/Computers/Hardware/Test_Equipment/Analyzers']

In [4]:
# Make a dictionary of short domains (without www.) to top-level category label, as per this page:
# http://dmoztools.net
labels={}
prefix="www."
for row in raw_categories:
    fulldomain = row[0]
    shortdomain = fulldomain[len(prefix):] if fulldomain.startswith(prefix) else fulldomain
    label = row[1].split("/")[1].split("|")[0]
    labels[shortdomain]=label  
    #print(shortdomain + " " + label)

In [5]:
# Take a look at the category for one domain from our dictionary
labels['232analyzer.com']

'Computers'

In [6]:
# Import JSON embeddings dictionary from Tom E, filtering for extra quotes using:
# cat edges-1000-lookup.json | sed -e 's/\\"//g' > edges-1000-lookup2.json 
import json 
js=open('../cc-embeddings/edges-1000-lookup2.json') 
ids=json.load(js) 
ids['cmo.com']

720

In [61]:
# Find the domains that are present in both DMOZ and Tom's Embeddings
overlap=labels.keys() & ids.keys()
len(overlap)

382

In [28]:
# Create another dictionary with keys and values swapped
domains = dict((v,k) for k,v in ids.items())

In [11]:
# Summarize categories in the DMOZ data
from collections import Counter
Counter(labels.values())

Counter({'Arts': 66715,
         'Business': 148142,
         'Computers': 45193,
         'Games': 10246,
         'Health': 24218,
         'Home': 6951,
         'News': 3710,
         'Recreation': 46095,
         'Reference': 21663,
         'Regional': 642158,
         'Science': 28135,
         'Shopping': 54062,
         'Society': 82072,
         'Sports': 34885,
         'World': 1273938})

In [12]:
# Plot histogram of top-level DMOZ categories - not working
#import matplotlib.pyplot as plt
#plt.hist(labels.values())
#plt.show()

In [22]:
# Load Tom E's embedding vector IDs
import numpy as np
embs=open("../cc-embeddings/edges-1000-snap.emb")
next(embs) # skip header line
vec_ids = np.loadtxt(embs, delimiter=' ', usecols=0, dtype=int, unpack=False)
embs.close()
vec_ids[3]

311

In [62]:
# Load Tom E's 100-dimensional embedding vectors
import numpy as np
embs=open("../cc-embeddings/edges-1000-snap.emb")
next(embs)
vec_embs = np.loadtxt(embs, delimiter=' ', usecols=range(1,100), dtype=float, unpack=False)
embs.close()
vec_embs[3]

array([  1.38675000e-01,  -3.08216000e-01,   9.77799000e-01,
        -5.14710000e-01,  -2.53251000e-01,  -1.03932000e-01,
         6.19299000e-01,  -8.66894000e-01,  -1.98394000e-01,
        -6.51082000e-01,  -1.49361000e-01,  -3.43352000e-01,
        -5.41385000e-01,   1.03663000e+00,   8.17124000e-01,
         1.39814000e-02,  -2.90030000e-01,   1.24302000e+00,
         5.29870000e-01,   5.79753000e-01,   6.57062000e-02,
        -1.23863000e+00,   1.35813000e+00,  -3.60972000e-01,
        -2.42486000e-01,   1.67638000e-01,   9.91996000e-01,
         1.41447000e-01,  -4.87832000e-01,   2.14026000e-01,
         1.44295000e+00,   4.30515000e-01,  -6.37256000e-01,
         2.72207000e-01,   3.14449000e-01,  -1.54546000e-01,
        -1.01998000e+00,   3.43388000e-01,  -6.45086000e-02,
        -1.02615000e+00,  -5.11630000e-01,   5.87549000e-01,
        -8.73898000e-04,   5.80964000e-02,   7.47816000e-02,
         5.77048000e-02,   3.30381000e-01,  -1.17853000e-02,
         1.76377000e-01,

In [36]:
# Filter embeddings for only those vectors that have entries in the DMOZ dictionary (i.e. ground truth labels)
new_vec_ids=[]
new_vec_embs=[]
ground_truth=[]

# Iterate over all the domain IDs for which we have a vector embedding
for index,id in enumerate(vec_ids):
    
    if domains[id] in labels.keys(): # this domain is present in DMOZ so add to new list
    
        new_vec_ids.append(domains[id])
        new_vec_embs.append(vec_embs[index])
        ground_truth.append(labels[domains[id]])

# Verify lengths of each list
print(str(len(new_vec_ids)) + " " + str(len(new_vec_embs)) + " " + str(len(ground_truth)))

382 382 382


In [63]:
# Split into training and test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_vec_embs, ground_truth, test_size=0.5, random_state=42)

In [57]:
# Summarize labels in our test data
Counter(y_test)

Counter({'Arts': 7,
         'Business': 12,
         'Computers': 78,
         'Home': 3,
         'News': 13,
         'Recreation': 1,
         'Reference': 1,
         'Regional': 19,
         'Science': 2,
         'Shopping': 7,
         'Society': 9,
         'World': 78})

In [64]:
# Fit classifiers to the training data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

neigh = KNeighborsClassifier(n_neighbors=3, metric='cosine', algorithm='brute')
neigh.fit(X_train, y_train) 

rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [65]:
# Attempt to classify all test points using nearest neighbours
from sklearn.metrics import classification_report
print(classification_report(y_test, neigh.predict(X_test)))
print(classification_report(y_test, rf.predict(X_test)))

             precision    recall  f1-score   support

       Arts       0.00      0.00      0.00         6
   Business       0.00      0.00      0.00         9
  Computers       0.36      0.46      0.41        67
       Home       0.00      0.00      0.00         3
       News       0.17      0.09      0.12        11
 Recreation       0.00      0.00      0.00         1
  Reference       0.00      0.00      0.00         0
   Regional       0.27      0.21      0.24        14
    Science       0.00      0.00      0.00         2
   Shopping       0.00      0.00      0.00         6
    Society       0.00      0.00      0.00         6
      World       0.45      0.42      0.44        66

avg / total       0.31      0.33      0.32       191

             precision    recall  f1-score   support

       Arts       0.00      0.00      0.00         6
   Business       0.00      0.00      0.00         9
  Computers       0.39      0.67      0.49        67
       Home       0.00      0.00      0.00

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
