In [1]:
import pandas as pd
from sklearn import svm
import math
import numpy
from library import terms_to_graph, compute_node_centrality, print_top10
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [2]:
path_train = "../datasets/webkb-train-stemmed.txt"
path_test = "../datasets/webkb-test-stemmed.txt"

train = pd.read_csv(path_train, header=None, delimiter='\t')
test = pd.read_csv(path_test, header=None, delimiter='\t')

In [3]:
print "Shape of Train: ",train.shape
print "Head of Train: "
print train.ix[:5,:]

Shape of Train:  (2803, 2)
Head of Train: 
         0                                                  1
0  student  brian comput scienc depart univers wisconsin d...
1  student  denni swanson web page mail pop uki offic hour...
2  faculty  russel impagliazzo depart comput scienc engin ...
3  student  dave phd student depart comput scienc univers ...
4  project  center lifelong learn design univers colorado ...
5  faculty  steve liu associ professor depart comput scien...


In [4]:
print "Shape of Test: ",test.shape
print "Head of Test: "
print test.ix[:5,:]

Shape of Test:  (1396, 2)
Head of Test: 
         0                                                  1
0  student  eric homepag eric wei tsinghua physic fudan genet
1   course  comput system perform evalu model new sept ass...
2  student  home page comput scienc grad student ucsd work...
3  student  toni web page toni face thing call toni studen...
4   course  ec advanc comput architectur credit parallel a...
5  faculty  faculti member ci depart research interest par...


Clean train set

In [5]:
# get indexes to remove in train
index_remove = [ind for ind in range(len(train.ix[:,1])) if (train.ix[ind,1]!=train.ix[ind,1]) or ((train.ix[ind,1]==train.ix[ind,1])and(len(train.ix[ind,1].split(" "))<4))]

#remove
train = train.drop(train.index[index_remove])
print "Shape of Train: ",train.shape

Shape of Train:  (2774, 2)


Clean test set

In [6]:
# get indexes to remove in test and remove
index_remove = [ind for ind in range(len(test.ix[:,1])) if (test.ix[ind,1]!=test.ix[ind,1]) or ((test.ix[ind,1]==test.ix[ind,1]) and (len(test.ix[ind,1].split(" "))<4))]
test = test.drop(test.index[index_remove])
print "Shape of Test: ",test.shape

Shape of Test:  (1376, 2)


In [7]:
labels = train.ix[:,0]
unique_labels = list(set(labels))

truth = test.ix[:,0]
unique_truth = list(set(truth))

In [8]:
for label in unique_labels:
    print label,": ",len([lab for lab in labels if lab==label])

project :  335
course :  620
student :  1075
faculty :  744


Storint terms

In [9]:
# storing terms of train documents
terms_by_docs = [document.split(" ") for document in train.ix[:,1]]
n_terms_per_doc = [len(terms) for terms in terms_by_docs]

# storing terms of test documents
terms_by_docs_test = [document.split(" ") for document in test.ix[:,1]]
n_terms_per_doc_test = [len(terms) for terms in terms_by_docs_test]

In [10]:
print "Min terms of train doc: ",min(n_terms_per_doc)
print "Max terms of train doc:",max(n_terms_per_doc)
print "Average terms of train doc: ",sum(n_terms_per_doc)/len(n_terms_per_doc)

Min terms of train doc:  4
Max terms of train doc: 20628
Average terms of train doc:  134


In [11]:
# store all terms
all_terms = [term for sublist in terms_by_docs for term in sublist]

# compute average number of terms
avg_len = sum(n_terms_per_doc)/len(n_terms_per_doc)

# unique terms
all_unique_terms = list(set(all_terms))

Inverse Document Frequency Number

In [12]:
n_doc = len(labels)

idf = dict(zip(all_unique_terms,[0]*len(all_unique_terms)))
counter = 0

for element in idf.keys():
    # number of document for each term appear
    df = sum([element in terms for terms in terms_by_docs])
    # idf
    idf[element] = math.log10(float(n_doc+1)/df)

    counter += 1
    if counter % 200 == 0:
        print counter," terms have been processed"

200  terms have been processed
400  terms have been processed
600  terms have been processed
800  terms have been processed
1000  terms have been processed
1200  terms have been processed
1400  terms have been processed
1600  terms have been processed
1800  terms have been processed
2000  terms have been processed
2200  terms have been processed
2400  terms have been processed
2600  terms have been processed
2800  terms have been processed
3000  terms have been processed
3200  terms have been processed
3400  terms have been processed
3600  terms have been processed
3800  terms have been processed
4000  terms have been processed
4200  terms have been processed
4400  terms have been processed
4600  terms have been processed
4800  terms have been processed
5000  terms have been processed
5200  terms have been processed
5400  terms have been processed
5600  terms have been processed
5800  terms have been processed
6000  terms have been processed
6200  terms have been processed
6400  terms 

<h3>Train & Test 

train part

In [14]:
all_graphs = []

for terms in terms_by_docs:
    g = terms_to_graph(terms,w=3)
    all_graphs.append(g)
print len(terms_by_docs) == len(all_graphs)
print len(set(terms_by_docs[0])) == len(all_graphs[0].vs)

True
True


In [16]:
# Computing vectors representations of each train document
b = 0.003
features_degree = []
features_w_degree = []
features_closeness = []
features_w_closeness = []
features_tfidf = []

len_all = len(all_unique_terms)
counter = 0
idf_keys = idf.keys()

In [17]:
for i in xrange(len(all_graphs)):
    graph = all_graphs[i]
    terms_in_doc = terms_by_docs[i]
    doc_len = len(terms_in_doc)

    metrics = compute_node_centrality(graph)

    # returns node (1) name, (2) degree, (3) weighted degree, (4) closeness, (5) weighted closeness
    feature_row_degree = [0]*len_all
    feature_row_w_degree = [0]*len_all
    feature_row_closeness = [0]*len_all
    feature_row_w_closeness = [0]*len_all
    feature_row_tfidf = [0]*len_all

    for term in list(set(terms_in_doc)):
        index = all_unique_terms.index(term)
        idf_term = idf[term]
        denominator = (1-b+(b*(float(doc_len)/avg_len)))
        metrics_term = [tuple[1:5] for tuple in metrics if tuple[0]==term][0]

        # store TW-IDF value
        feature_row_degree[index] = (float(metrics_term[0])/denominator) * idf_term
        feature_row_w_degree[index] = (float(metrics_term[1])/denominator) * idf_term
        feature_row_closeness[index] = (float(metrics_term[2])/denominator) * idf_term
        feature_row_w_closeness[index] = (float(metrics_term[3])/denominator) * idf_term

        # number of occurences of word in document
        tf = terms_in_doc.count(term)

        # store TF-IDF value
        feature_row_tfidf[index] = ((1+math.log1p(1+math.log1p(tf)))/(1-b+(b*(float(doc_len)/avg_len)))) * idf_term

    features_degree.append(feature_row_degree)
    features_w_degree.append(feature_row_w_degree)
    features_closeness.append(feature_row_closeness)
    features_w_closeness.append(feature_row_w_closeness)
    features_tfidf.append(feature_row_tfidf)

    counter += 1
    if counter % 100 == 0:
        print counter,"documents have been processed"

100 documents have been processed
200 documents have been processed
300 documents have been processed
400 documents have been processed
500 documents have been processed
600 documents have been processed
700 documents have been processed
800 documents have been processed
900 documents have been processed
1000 documents have been processed
1100 documents have been processed
1200 documents have been processed
1300 documents have been processed
1400 documents have been processed
1500 documents have been processed
1600 documents have been processed
1700 documents have been processed
1800 documents have been processed
1900 documents have been processed
2000 documents have been processed
2100 documents have been processed
2200 documents have been processed
2300 documents have been processed
2400 documents have been processed
2500 documents have been processed
2600 documents have been processed
2700 documents have been processed


In [18]:
# convert list of lists into array
# documents as rows, terms as colunms ==>> document-terms metrix
training_set_tfidf = numpy.array(features_tfidf)
training_set_degree = numpy.array(features_degree)
training_set_w_degree = numpy.array(features_w_degree)
training_set_closeness = numpy.array(features_closeness)
training_set_w_closeness = numpy.array(features_w_closeness)

In [19]:
# convert labels into integers then into column array
labels = list(labels)

labels_int = [0]*len(labels)
for j in xrange(len(unique_labels)):
    index_temp = [i for i in range(len(labels)) if labels[i]==unique_labels[j]]
    for element in index_temp:
        labels_int[element]=j

#check that coding smoothly
print zip(labels_int,labels)[:20]

[(2, 'student'), (2, 'student'), (3, 'faculty'), (2, 'student'), (0, 'project'), (3, 'faculty'), (3, 'faculty'), (3, 'faculty'), (2, 'student'), (1, 'course'), (2, 'student'), (3, 'faculty'), (2, 'student'), (2, 'student'), (3, 'faculty'), (3, 'faculty'), (3, 'faculty'), (2, 'student'), (3, 'faculty'), (2, 'student')]


In [20]:
labels_array = numpy.array(labels_int)

Test part

In [25]:
all_graphs_test = []

for terms in terms_by_docs_test:
    g = terms_to_graph(terms,w=3)
    all_graphs_test.append(g)

# sanity checks
print len(terms_by_docs_test)==len(all_graphs_test)
print len(set(terms_by_docs_test[0]))==len(all_graphs_test[0].vs)

True
True


In [26]:
print "computing vector representations of each testing document"
# each testing document is represented in the training space only

features_degree_test = []
features_w_degree_test = []
features_closeness_test = []
features_w_closeness_test = []
features_tfidf_test = []

counter = 0

for i in xrange(len(all_graphs_test)):

    graph = all_graphs_test[i]
	# retain only the terms originally present in the training set
    terms_in_doc = [term for term in terms_by_docs_test[i] if term in all_unique_terms]
    doc_len = len(terms_in_doc)

    # returns node (1) name, (2) degree, (3) weighted degree, (4) closeness, (5) weighted closeness
    metrics = compute_node_centrality(graph)

    feature_row_degree_test = [0]*len_all
    feature_row_w_degree_test = [0]*len_all
    feature_row_closeness_test = [0]*len_all
    feature_row_w_closeness_test = [0]*len_all
    feature_row_tfidf_test = [0]*len_all

    for term in list(set(terms_in_doc)):
        index = all_unique_terms.index(term)
        idf_term = idf[term]
        denominator = (1-b+(b*(float(doc_len)/avg_len)))
        metrics_term = [tuple[1:5] for tuple in metrics if tuple[0]==term][0]

        # store TW-IDF values
        feature_row_degree_test[index] = (float(metrics_term[0])/denominator) * idf_term
        feature_row_w_degree_test[index] = (float(metrics_term[1])/denominator) * idf_term
        feature_row_closeness_test[index] = (float(metrics_term[2])/denominator) * idf_term
        feature_row_w_closeness_test[index] = (float(metrics_term[3])/denominator) * idf_term

        # number of occurences of word in document
        tf = terms_in_doc.count(term)

        # store TF-IDF value
        feature_row_tfidf_test[index] = ((1+math.log1p(1+math.log1p(tf)))/(1-0.2+(0.2*(float(doc_len)/avg_len)))) * idf_term

    features_degree_test.append(feature_row_degree_test)
    features_w_degree_test.append(feature_row_w_degree_test)
    features_closeness_test.append(feature_row_closeness_test)
    features_w_closeness_test.append(feature_row_w_closeness_test)
    features_tfidf_test.append(feature_row_tfidf_test)

    counter += 1
    if counter % 100 == 0:
        print counter, "documents have been processed"

computing vector representations of each testing document
100 documents have been processed
200 documents have been processed
300 documents have been processed
400 documents have been processed
500 documents have been processed
600 documents have been processed
700 documents have been processed
800 documents have been processed
900 documents have been processed
1000 documents have been processed
1100 documents have been processed
1200 documents have been processed
1300 documents have been processed


In [27]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., document-term matrix)

testing_set_degree = numpy.array(features_degree_test)
testing_set_w_degree = numpy.array(features_w_degree_test)
testing_set_closeness = numpy.array(features_closeness_test)
testing_set_w_closeness = numpy.array(features_w_closeness_test)
testing_set_tfidf = numpy.array(features_tfidf_test)

In [28]:
# convert truth into integers then into column array
truth = list(truth)

truth_int = [0] * len(truth)
for j in range(len(unique_truth)):
    index_temp = [i for i in range(len(truth)) if truth[i]==unique_truth[j]]
    for element in index_temp:
        truth_int[element] = j

# check that coding went smoothly
print zip(truth_int,truth)[:20]

truth_array = numpy.array(truth_int)

[(2, 'student'), (1, 'course'), (2, 'student'), (2, 'student'), (1, 'course'), (3, 'faculty'), (1, 'course'), (2, 'student'), (3, 'faculty'), (0, 'project'), (2, 'student'), (3, 'faculty'), (2, 'student'), (1, 'course'), (3, 'faculty'), (1, 'course'), (1, 'course'), (2, 'student'), (2, 'student'), (0, 'project')]


Naive Bayes Clf

In [30]:
nb_tfidf = MultinomialNB()
nb_degree = MultinomialNB()
nb_w_degree = MultinomialNB()
nb_closeness = MultinomialNB()
nb_w_closeness = MultinomialNB()

nb_tfidf.fit(training_set_tfidf,labels_array)
nb_degree.fit(training_set_degree,labels_array)
nb_w_degree.fit(training_set_w_degree,labels_array)
nb_closeness.fit(training_set_closeness, labels_array)
nb_w_closeness.fit(training_set_w_closeness, labels_array)

predictions_tfidf = nb_tfidf.predict(testing_set_tfidf)
predictions_degree = nb_degree.predict(testing_set_degree)
predictions_w_degree = nb_degree.predict(testing_set_w_degree)
predictions_closeness = nb_degree.predict(testing_set_closeness)
predictions_w_closeness = nb_w_closeness.predict(testing_set_w_closeness)

print "accuracy TF-IDF:", accuracy_score(truth_array,predictions_tfidf)
print "accuracy degree:", accuracy_score(truth_array,predictions_degree)
print "accuracy w_degree:", accuracy_score(truth_array,predictions_w_degree)
print "accuracy closeness:", accuracy_score(truth_array,predictions_closeness)
print "accuracy w_closeness:", accuracy_score(truth_array,predictions_w_closeness)

print "TFIDF:"
print_top10(all_terms,nb_tfidf,unique_labels)
print
print "degree:"
print_top10(all_terms,nb_degree,unique_labels)
print
print "w_degree:"
print_top10(all_terms,nb_w_degree,unique_labels)
print
print "closeness:"
print_top10(all_terms,nb_closeness,unique_labels)
print
print "w_closeness:"
print_top10(all_terms,nb_w_closeness,unique_labels)

accuracy TF-IDF: 0.83648255814
accuracy degree: 0.797238372093
accuracy w_degree: 0.834302325581
accuracy closeness: 0.835755813953
accuracy w_closeness: 0.855377906977
TFIDF:
project: elsevi project utc model architectur focuss network simplifi juli program
course: fax decemb system friend wisconsin annual subscrib apach tuth object
student: abil accept mail current text engin detail larg septemb web
faculty: scienc fateman java creat philadelphia current group lab solut strand

degree:
project: simplifi utc cartoon fantasi philadelphia comput model arizona program juli
course: lab friend subscrib tuth wisconsin apach system decemb annual object
student: current text carl web detail engin septemb larg princeton theori
faculty: mail human creat current instal lab arizona solut philadelphia strand

w_degree:
project: network philadelphia elsevi instal man student comput program juli arizona
course: final lab friend lectur wisconsin august system decemb annual object
student: web arizona

Linear SVC Clf

In [31]:
svm_tfidf = svm.LinearSVC()
svm_degree = svm.LinearSVC()
svm_w_degree = svm.LinearSVC()
svm_closeness = svm.LinearSVC()
svm_w_closeness = svm.LinearSVC()

svm_tfidf.fit(training_set_tfidf,labels_array)
svm_degree.fit(training_set_degree,labels_array)
svm_w_degree.fit(training_set_w_degree,labels_array)
svm_closeness.fit(training_set_closeness, labels_array)
svm_w_closeness.fit(training_set_w_closeness, labels_array)

predictions_tfidf = svm_tfidf.predict(testing_set_tfidf)
predictions_degree = svm_degree.predict(testing_set_degree)
predictions_w_degree = svm_degree.predict(testing_set_w_degree)
predictions_closeness = svm_degree.predict(testing_set_closeness)
predictions_w_closeness = svm_w_closeness.predict(testing_set_w_closeness)

print "accuracy TF-IDF:", accuracy_score(truth_array,predictions_tfidf)
print "accuracy degree:", accuracy_score(truth_array,predictions_degree)
print "accuracy w_degree:", accuracy_score(truth_array,predictions_w_degree)
print "accuracy closeness:", accuracy_score(truth_array,predictions_closeness)
print "accuracy w_closeness:", accuracy_score(truth_array,predictions_w_closeness)

print "TFIDF:"
print_top10(all_terms,svm_tfidf,unique_labels)
print
print "degree:"
print_top10(all_terms,svm_degree,unique_labels)
print
print "w_degree:"
print_top10(all_terms,svm_w_degree,unique_labels)
print
print "closeness:"
print_top10(all_terms,svm_closeness,unique_labels)
print
print "w_closeness:"
print_top10(all_terms,svm_w_closeness,unique_labels)

accuracy TF-IDF: 0.893895348837
accuracy degree: 0.904796511628
accuracy w_degree: 0.856831395349
accuracy closeness: 0.880813953488
accuracy w_closeness: 0.909156976744
TFIDF:
project: juli optic utc read cartoon elsevi page project parallel program
course: california fax proof apach object tuth design overlap resourc berkelei
student: univers theori lectur text accept degre septemb larg intern detail
faculty: peter html polici graduat scienc symposium java lab solut strand

degree:
project: high confer group model optic utc architectur focuss juli program
course: apach fax evalu annual friend amherst system phone object tuth
student: document austin current wagner septemb proceed web text detail accept
faculty: spoken lab phone html return scienc java lab solut strand

w_degree:
project: len easi parallel page reserv program read cartoon project elsevi
course: exponenti grupen apach friend tuth electron design berkelei overlap resourc
student: secur text ask accept html princeton lar

Logistic Regression Clf

In [32]:
lgr_tfidf = LogisticRegression()
lgr_degree = LogisticRegression()
lgr_w_degree = LogisticRegression()
lgr_closeness = LogisticRegression()
lgr_w_closeness = LogisticRegression()

lgr_tfidf.fit(training_set_tfidf,labels_array)
lgr_degree.fit(training_set_degree,labels_array)
lgr_w_degree.fit(training_set_w_degree,labels_array)
lgr_closeness.fit(training_set_closeness, labels_array)
lgr_w_closeness.fit(training_set_w_closeness, labels_array)

predictions_tfidf = lgr_tfidf.predict(testing_set_tfidf)
predictions_degree = lgr_degree.predict(testing_set_degree)
predictions_w_degree = lgr_degree.predict(testing_set_w_degree)
predictions_closeness = lgr_degree.predict(testing_set_closeness)
predictions_w_closeness = lgr_w_closeness.predict(testing_set_w_closeness)

print "accuracy TF-IDF:", accuracy_score(truth_array,predictions_tfidf)
print "accuracy degree:", accuracy_score(truth_array,predictions_degree)
print "accuracy w_degree:", accuracy_score(truth_array,predictions_w_degree)
print "accuracy closeness:", accuracy_score(truth_array,predictions_closeness)
print "accuracy w_closeness:", accuracy_score(truth_array,predictions_w_closeness)

print "TFIDF:"
print_top10(all_terms,lgr_tfidf,unique_labels)
print
print "degree:"
print_top10(all_terms,lgr_degree,unique_labels)
print
print "w_degree:"
print_top10(all_terms,lgr_w_degree,unique_labels)
print
print "closeness:"
print_top10(all_terms,lgr_closeness,unique_labels)
print
print "w_closeness:"
print_top10(all_terms,lgr_w_closeness,unique_labels)

accuracy TF-IDF: 0.917151162791
accuracy degree: 0.87863372093
accuracy w_degree: 0.849563953488
accuracy closeness: 0.877906976744
accuracy w_closeness: 0.918604651163
TFIDF:
project: brad model architectur project juli optic parallel page utc program
course: overlap phone california friend proof fax apach berkelei object tuth
student: secur proceed wagner theori larg detail web accept text septemb
faculty: focuss game graduat symposium html scienc java lab solut strand

degree:
project: organ network reserv comput architectur utc focuss model juli program
course: subscrib phone wisconsin decemb friend apach system annual tuth object
student: ask spencer proceed wagner carl septemb web detail accept text
faculty: phone html spoken philadelphia return scienc java solut lab strand

w_degree:
project: architectur cartoon optic juli utc reserv elsevi project page program
course: place proof fax resourc friend apach berkelei object overlap tuth
student: intern ask librari web larg septemb 