### Library.py

In [1]:
import itertools
import igraph
import numpy as np

In [2]:
def terms_to_graph(lists_of_terms, window_size, overspanning):
    '''This function returns a directed, weighted igraph from lists of list of terms (the tokens from the pre-processed text)
    e.g., [['quick','brown','fox'], ['develop', 'remot', 'control'], etc]
    Edges are weighted based on term co-occurence within a sliding window of fixed size 'w' '''

    if overspanning:
        terms = [item for sublist in lists_of_terms for item in sublist]
    else:
        idx = 0
        terms = lists_of_terms[idx]

    from_to = {}

    while True:
        w = min(window_size, len(terms))
        # create initial complete graph (first w terms)
        terms_temp = terms[0:w]
        indexes = list(itertools.combinations(range(w), r=2))

        new_edges = []

        for my_tuple in indexes:
            new_edges.append(tuple([terms_temp[i] for i in my_tuple]))
        for new_edge in new_edges:
            if new_edge in from_to:
                from_to[new_edge] += 1
            else:
                from_to[new_edge] = 1

        # then iterate over the remaining terms
        for i in range(w, len(terms)):
            # term to consider
            considered_term = terms[i]
            # all terms within sliding window
            terms_temp = terms[(i - w + 1):(i + 1)]

            # edges to try
            candidate_edges = []
            for p in range(w - 1):
                candidate_edges.append((terms_temp[p], considered_term))

            for try_edge in candidate_edges:

                # if not self-edge
                if try_edge[1] != try_edge[0]:

                    # if edge has already been seen, update its weight
                    if try_edge in from_to:
                        from_to[try_edge] += 1

                    # if edge has never been seen, create it and assign it a unit weight
                    else:
                        from_to[try_edge] = 1

        if overspanning:
            break
        else:
            idx += 1
            if idx == len(lists_of_terms):
                break
            terms = lists_of_terms[idx]

    # create empty graph
    g = igraph.Graph(directed=True)

    # add vertices
    if overspanning:
        g.add_vertices(sorted(set(terms)))
    else:
        g.add_vertices(sorted(set([item for sublist in lists_of_terms for item in sublist])))

    # add edges, direction is preserved since the graph is directed
    g.add_edges(list(from_to.keys()))

    # set edge and vertice weights
    g.es['weight'] = list(from_to.values())  # based on co-occurence within sliding window
    g.vs['weight'] = g.strength(weights=list(from_to.values()))  # weighted degree

    return (g)
    

In [3]:
def compute_node_centrality(graph):
    # degree
    degrees = graph.degree()
    degrees = [round(float(degree)/(len(graph.vs)-1),5) for degree in degrees]

    # weighted degree
    ### fill the gap ### hint: use the .strength() method with 'weights' argument
    w_degrees = graph.strength(weights = graph.es["weight"])
    w_degrees = [round(float(degree)/(len(graph.vs)-1),5) for degree in w_degrees]

    # closeness
    ### fill the gap ### hint: use the .closeness() method with 'normalized' argument set to True
    closeness = graph.closeness(normalized = True)
    closeness = [round(value,5) for value in closeness]

    # weighted closeness
    ### fill the gap ### hint: same as above, but with 'weights' argument
    w_closeness = graph.closeness(normalized = True , weights = graph.es["weight"])
    w_closeness = [round(value,5) for value in w_closeness]

    return(list(zip(graph.vs["name"],degrees,w_degrees,closeness,w_closeness)))

In [4]:
def print_top10(feature_names, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    # coef stores the weights of each feature (in unique term), for each class
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label," ".join(feature_names[j] for j in top10)))

In [5]:
def print_bot10(feature_names, clf, class_labels):
    """Prints features with the lowest coefficient values, per class"""
    for i, class_label in enumerate(class_labels):
        bot10 = np.argsort(clf.coef_[i])[0:9]
        print("%s: %s" % (class_label," ".join(feature_names[j] for j in bot10)))

### Document classification :

In [6]:
import math
import numpy
import pandas as pd
# from library import terms_to_graph, compute_node_centrality, print_top10, print_bot10
from sklearn import svm, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [7]:
ls

[0m[01;34mcode[0m/  [01;34mdata[0m/  lab2_handout.pdf  MohamedRostomGHARBI_TP2.ipynb


In [8]:
##################################
# data loading and preprocessing #
##################################

path_to_data = "data"

train = pd.read_csv(path_to_data + "/webkb-train-stemmed.txt", header=None, delimiter="\t")
print(train.shape)

test = pd.read_csv(path_to_data + "/webkb-test-stemmed.txt", header=None, delimiter="\t")
print(test.shape)

# inspect head of data frames
print("first five rows of training data:")
print(train.iloc[:5,:])

print("first five rows of testing data:")
print(test.iloc[:5,:])

# get index of empty (nan) and less than four words documents (for which a graph cannot be built)
index_remove = [i for i in range(len(train.iloc[:,1])) if (train.iloc[i,1]!=train.iloc[i,1]) or ((train.iloc[i,1]==train.iloc[i,1])and(len(train.iloc[i,1].split(" "))<4))]

# remove those documents
print("removing", len(index_remove), "documents from training set")
train = train.drop(train.index[index_remove])
print(train.shape)

# repeat above steps for test set
index_remove = [i for i in range(len(test.iloc[:,1])) if (test.iloc[i,1]!=test.iloc[i,1]) or ((test.iloc[i,1]==test.iloc[i,1])and(len(test.iloc[i,1].split(" "))<4))]
print("removing", len(index_remove), "documents from test set")
test = test.drop(test.index[index_remove])
print(test.shape)

labels = train.iloc[:,0]
unique_labels = list(set(labels))

truth = test.iloc[:,0]
unique_truth = list(set(truth))

print("number of observations per class:")
for label in unique_labels:
    print(label, ":", len([temp for temp in labels if temp==label]))

print("storing terms from training documents as list of lists")
terms_by_doc = np.array([document.split(" ") for document in train.iloc[:,1]])
n_terms_per_doc = [len(terms) for terms in terms_by_doc]

print("storing terms from test documents as list of lists")
terms_by_doc_test = np.array([document.split(" ") for document in test.iloc[:,1]])

print("min, max and average number of terms per document:", min(n_terms_per_doc), max(n_terms_per_doc), sum(n_terms_per_doc)/len(n_terms_per_doc))

(2803, 2)
(1396, 2)
first five rows of training data:
         0                                                  1
0  student  brian comput scienc depart univers wisconsin d...
1  student  denni swanson web page mail pop uki offic hour...
2  faculty  russel impagliazzo depart comput scienc engin ...
3  student  dave phd student depart comput scienc univers ...
4  project  center lifelong learn design univers colorado ...
first five rows of testing data:
         0                                                  1
0  student  eric homepag eric wei tsinghua physic fudan genet
1   course  comput system perform evalu model new sept ass...
2  student  home page comput scienc grad student ucsd work...
3  student  toni web page toni face thing call toni studen...
4   course  ec advanc comput architectur credit parallel a...
removing 29 documents from training set
(2774, 2)
removing 20 documents from test set
(1376, 2)
number of observations per class:
project : 335
student : 1075
course : 6

In [9]:
# store all terms in list
all_terms = [terms_by_doc[i][j] for i in range(len(terms_by_doc)) for j in range(len(terms_by_doc[i]))]

### fill the gap ### hint: flatten 'terms_by_doc' (you may use a list comprehension)

# compute average number of terms
avg_len = sum(n_terms_per_doc)/len(n_terms_per_doc)

# unique terms
all_unique_terms = list(set(all_terms))
# store IDF values in dictionary
terms_by_doc_sets = [set(elt) for elt in terms_by_doc]
n_doc = len(labels)
idf = dict(zip(all_unique_terms,[0]*len(all_unique_terms)))

In [10]:
idf

{'tba': 0,
 'autom': 0,
 'bernardino': 0,
 'racquetbal': 0,
 'church': 0,
 'destin': 0,
 'tau': 0,
 'ow': 0,
 'bernard': 0,
 'cow': 0,
 'manner': 0,
 'semest': 0,
 'webcrawl': 0,
 'crap': 0,
 'keeper': 0,
 'psychologist': 0,
 'adopt': 0,
 'size': 0,
 'moment': 0,
 'biswa': 0,
 'smithsonian': 0,
 'teer': 0,
 'andrea': 0,
 'tell': 0,
 'microsystem': 0,
 'ignit': 0,
 'allevi': 0,
 'hodg': 0,
 'gouda': 0,
 'bitmap': 0,
 'identif': 0,
 'plug': 0,
 'dozen': 0,
 'horror': 0,
 'harsh': 0,
 'march': 0,
 'ic': 0,
 'drawer': 0,
 'girl': 0,
 'multidimension': 0,
 'enforc': 0,
 'sigir': 0,
 'jolla': 0,
 'etzioni': 0,
 'suif': 0,
 'brief': 0,
 'beam': 0,
 'stop': 0,
 'slide': 0,
 'openstep': 0,
 'market': 0,
 'concret': 0,
 'automobil': 0,
 'umd': 0,
 'reactor': 0,
 'auto': 0,
 'haa': 0,
 'written': 0,
 'fanci': 0,
 'poll': 0,
 'mo': 0,
 'tick': 0,
 'focus': 0,
 'hamilton': 0,
 'hypothesi': 0,
 'isl': 0,
 'chanc': 0,
 'routin': 0,
 'softbot': 0,
 'overlap': 0,
 'manuel': 0,
 'smallest': 0,
 'porter'

In [11]:
for counter,unique_term in enumerate(list(idf.keys())):
    # compute number of documents in which 'unique_term' appears
    df =np.sum(np.array([unique_term in terms for terms in terms_by_doc_sets ])) ### fill the gap # 
    idf[unique_term] = math.log10((len(all_unique_terms)+1)/df)
    if counter % 1e3 == 0:
        print(counter, "terms processed")

0 terms processed
1000 terms processed
2000 terms processed
3000 terms processed
4000 terms processed
5000 terms processed
6000 terms processed
7000 terms processed


In [12]:
###########################################
# computing features for the training set #
###########################################

w = 3 # sliding window size

print("creating a graph-of-words for the collection")

c_g = terms_to_graph(terms_by_doc , w , overspanning=False)### fill the gap ### hint: use the terms_to_graph function with the proper arguments

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")

all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt],w,overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc)==len(all_graphs))
print(len(set(terms_by_doc[0]))==len(all_graphs[0].vs))

print("computing vector representations of each training document")

b = 0.003

features_degree = []
features_w_degree = []
features_closeness = []
features_w_closeness = []
features_twicw = [] # we try it only with unweighted degree
features_tfidf = []

len_all = len(all_unique_terms)
collection_degrees = collection_degrees = dict(zip(c_g.vs['name'] , c_g.strength())) ### fill the gap ### hint: build a dict where the keys are the names of the nodes in the collection graph and the values are their unweighted degrees


maxcol = max(list(collection_degrees.values()))

for i, graph in enumerate(all_graphs):
    
    terms_in_doc = terms_by_doc[i]
    doc_len = len(terms_in_doc)
    
    # returns node (0) name, (1) degree, (2) weighted degree, (3) closeness, (4) weighted closeness
    my_metrics = compute_node_centrality(graph)
    
    feature_row_degree = [0]*len_all
    feature_row_w_degree = [0]*len_all
    feature_row_closeness = [0]*len_all
    feature_row_w_closeness = [0]*len_all
    feature_row_twicw = [0]*len_all
    feature_row_tfidf = [0]*len_all
    
    # iterate over the unique terms contained by the doc (for all the other columns, the values will remain at zero)
    for term in list(set(terms_in_doc)):
        
        index = all_unique_terms.index(term)
        idf_term = idf[term]
        denominator = (1-b+(b*(float(doc_len)/avg_len))) ### fill the gap ### hint: refer to the TF equation in the handout
        metrics_term = [tuple[1:] for tuple in my_metrics if tuple[0]==term][0]
        
        # store TW-IDF values
        feature_row_degree[index] = (metrics_term[0]/denominator) * idf_term
        feature_row_w_degree[index] = (metrics_term[1]/denominator) * idf_term
        feature_row_closeness[index] = (metrics_term[2]/denominator) * idf_term
        feature_row_w_closeness[index] = (metrics_term[3]/denominator) * idf_term
        
        # store TW-ICW values
        feature_row_twicw[index] = (metrics_term[0]/denominator) * math.log10((maxcol+1)/collection_degrees[term]) 
        
        # number of occurences of word in document
        tf = terms_in_doc.count(term)        
        # store TF-IDF value
        feature_row_tfidf[index] = ((1+math.log1p(1+math.log1p(tf)))/(1-0.2+(0.2*(float(doc_len)/avg_len)))) * idf_term
    
    features_degree.append(feature_row_degree)
    features_w_degree.append(feature_row_w_degree)
    features_closeness.append(feature_row_closeness)
    features_w_closeness.append(feature_row_w_closeness)
    features_twicw.append(feature_row_twicw)
    features_tfidf.append(feature_row_tfidf)

    if i % 1000 == 0:
        print (i, "documents processed")

# convert list of lists into array
# documents as rows, unique words (features) as columns
training_set_degree = numpy.array(features_degree)
training_set_w_degree = numpy.array(features_w_degree)
training_set_closeness = numpy.array(features_closeness)
training_set_w_closeness = numpy.array(features_w_closeness)
training_set_tw_icw = numpy.array(features_twicw)
training_set_tfidf = numpy.array(features_tfidf)

creating a graph-of-words for the collection
True
creating a graph-of-words for each training document
True
True
computing vector representations of each training document
0 documents processed
1000 documents processed
2000 documents processed


In [13]:
#######################################
# computing features for the test set #
#######################################

print("creating a graph-of-words for each test document")

all_graphs_test = []
for elt in terms_by_doc_test:
    all_graphs_test.append(terms_to_graph([elt],w,overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc_test)==len(all_graphs_test))
print(len(set(terms_by_doc_test[0]))==len(all_graphs_test[0].vs))

print("computing vector representations of each test document")
# ! each test document is represented in the training space only

features_degree_test = []
features_w_degree_test = []
features_closeness_test = []
features_w_closeness_test = []
features_twicw_test = []
features_tfidf_test = []

for i, graph in enumerate(all_graphs_test):
    
    # filter out the terms that are not in the training set
    terms_in_doc = [term for term in terms_by_doc_test[i] if term in all_unique_terms]
    doc_len = len(terms_in_doc)
    
    my_metrics = compute_node_centrality(graph)
    
    feature_row_degree_test = [0]*len_all
    feature_row_w_degree_test = [0]*len_all
    feature_row_closeness_test = [0]*len_all
    feature_row_w_closeness_test = [0]*len_all
    feature_row_twicw_test = [0]*len_all
    feature_row_tfidf_test = [0]*len_all

    for term in list(set(terms_in_doc)):
        index = all_unique_terms.index(term)
        idf_term = idf[term]
        denominator = (1-b+(b*(float(doc_len)/avg_len)))
        metrics_term = [tuple[1:] for tuple in my_metrics if tuple[0]==term][0]
        
        # store TW-IDF values      
        feature_row_degree_test[index] = (metrics_term[0]/denominator) * idf_term
        feature_row_w_degree_test[index] = (metrics_term[1]/denominator) * idf_term
        feature_row_closeness_test[index] = (metrics_term[2]/denominator) * idf_term
        feature_row_w_closeness_test[index] = (metrics_term[3]/denominator) * idf_term
        
        # store TW-ICW values
        feature_row_twicw_test[index] = (metrics_term[0]/denominator) * (math.log10((maxcol+1)/collection_degrees[term]))

        # number of occurences of word in document
        tf = terms_in_doc.count(term)
        # store TF-IDF value
        feature_row_tfidf_test[index] = ((1+math.log1p(1+math.log1p(tf)))/(1-0.2+(0.2*(float(doc_len)/avg_len)))) * idf_term

    features_degree_test.append(feature_row_degree_test)
    features_w_degree_test.append(feature_row_w_degree_test)
    features_closeness_test.append(feature_row_closeness_test)
    features_w_closeness_test.append(feature_row_w_closeness_test)
    features_twicw_test.append(feature_row_twicw_test)
    features_tfidf_test.append(feature_row_tfidf_test)
    
    if i % 500 == 0:
        print (i, "documents processed")

# convert list of lists into array
# documents as rows, unique words as columns (i.e., document-term matrix)
testing_set_degree = numpy.array(features_degree_test)
testing_set_w_degree = numpy.array(features_w_degree_test)
testing_set_closeness = numpy.array(features_closeness_test)
testing_set_w_closeness = numpy.array(features_w_closeness_test)
testing_set_twicw = numpy.array(features_twicw_test)
testing_set_tfidf = numpy.array(features_tfidf_test)

creating a graph-of-words for each test document
True
True
computing vector representations of each test document
0 documents processed
500 documents processed
1000 documents processed


In [14]:
##########
# labels #
##########

# convert labels into integers then into column array
labels = list(labels)
labels_int = [0] * len(labels)
for j in range(len(unique_labels)):
    index_temp = [i for i in range(len(labels)) if labels[i]==unique_labels[j]]
    for element in index_temp:
        labels_int[element] = j
        
# convert truth into integers then into column array
truth = list(truth)
truth_int = [0] * len(truth)
for j in range(len(unique_truth)):
    index_temp = [i for i in range(len(truth)) if truth[i]==unique_truth[j]]
    for element in index_temp:
        truth_int[element] = j

# check that coding went smoothly
print(list(zip(truth_int,truth))[:20])

truth_array = numpy.array(truth_int)

# check that coding went smoothly
print(list(zip(labels_int,labels))[:20])
labels_array = numpy.array(labels_int)

for clf in ["LinearSVC","LogisticRegression","MultinomialNB"]:
    
    if clf=="LinearSVC":
        classifier_degree = svm.LinearSVC()
        classifier_w_degree = svm.LinearSVC()
        classifier_closeness = svm.LinearSVC()
        classifier_w_closeness = svm.LinearSVC()
        classifier_twicw = svm.LinearSVC()
        classifier_tfidf = svm.LinearSVC()
    elif clf=="LogisticRegression":
        classifier_degree = LogisticRegression(multi_class='ovr',solver='liblinear') # we specify multi_class and solver arguments just to avoid getting a warning
        classifier_w_degree = LogisticRegression(multi_class='ovr',solver='liblinear')
        classifier_closeness = LogisticRegression(multi_class='ovr',solver='liblinear')
        classifier_w_closeness = LogisticRegression(multi_class='ovr',solver='liblinear')
        classifier_twicw = LogisticRegression(multi_class='ovr',solver='liblinear')
        classifier_tfidf = LogisticRegression(multi_class='ovr',solver='liblinear')
    elif clf=="MultinomialNB":
        classifier_degree = MultinomialNB()
        classifier_w_degree = MultinomialNB()
        classifier_closeness = MultinomialNB()
        classifier_w_closeness = MultinomialNB()
        classifier_twicw = MultinomialNB()
        classifier_tfidf = MultinomialNB()

[(1, 'student'), (2, 'course'), (1, 'student'), (1, 'student'), (2, 'course'), (3, 'faculty'), (2, 'course'), (1, 'student'), (3, 'faculty'), (0, 'project'), (1, 'student'), (3, 'faculty'), (1, 'student'), (2, 'course'), (3, 'faculty'), (2, 'course'), (2, 'course'), (1, 'student'), (1, 'student'), (0, 'project')]
[(1, 'student'), (1, 'student'), (3, 'faculty'), (1, 'student'), (0, 'project'), (3, 'faculty'), (3, 'faculty'), (3, 'faculty'), (1, 'student'), (2, 'course'), (1, 'student'), (3, 'faculty'), (1, 'student'), (1, 'student'), (3, 'faculty'), (3, 'faculty'), (3, 'faculty'), (1, 'student'), (3, 'faculty'), (1, 'student')]


In [15]:
    
    ############
    # training #
    ############
    
    print("training", clf, "classifiers")
    classifier_degree.fit(training_set_degree, labels_array)
    classifier_w_degree.fit(training_set_w_degree, labels_array)
    classifier_closeness.fit(training_set_closeness, labels_array)
    classifier_w_closeness.fit(training_set_w_closeness, labels_array)
    classifier_twicw.fit(training_set_tw_icw, labels_array)
    classifier_tfidf.fit(training_set_tfidf, labels_array)
    

training MultinomialNB classifiers


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
    ###########
    # testing #
    ###########
    
    # issue predictions
    predictions_degree = classifier_degree.predict(testing_set_degree)
    predictions_w_degree = classifier_w_degree.predict(testing_set_w_degree)
    predictions_closeness = classifier_closeness.predict(testing_set_closeness)
    predictions_w_closeness = classifier_w_closeness.predict(testing_set_w_closeness)
    predictions_twicw = classifier_twicw.predict(testing_set_twicw)
    predictions_tfidf = classifier_tfidf.predict(testing_set_tfidf)
    
    print('========== accuracy for', clf ,'classifier ==========')
    print("accuracy TW-IDF degree:", round(metrics.accuracy_score(truth_array,predictions_degree)*100,3))
    print("accuracy TW-IDF weighted degree:", round(metrics.accuracy_score(truth_array,predictions_w_degree)*100,3))
    print("accuracy TW-IDF closeness:", round(metrics.accuracy_score(truth_array,predictions_closeness)*100,3))
    print("accuracy TW-IDF weighted closeness:", round(metrics.accuracy_score(truth_array,predictions_w_closeness)*100,3))
    print("accuracy TW-ICW degree:", round(metrics.accuracy_score(truth_array,predictions_twicw)*100,3))
    print("accuracy TF-IDF:", round(metrics.accuracy_score(truth_array,predictions_tfidf)*100,3))
    
# show the most and less important features for each class


### fill the gaps ### hint: pick a classifier (e.g., 'classifier_tfidf'), and pass it to the 'print_top10' and 'print_bot10' functions along with 'unique_labels' and 'all_unique_terms'
print_top10(all_unique_terms , classifier_tfidf , unique_labels)

accuracy TW-IDF degree: 81.25
accuracy TW-IDF weighted degree: 81.395
accuracy TW-IDF closeness: 85.974
accuracy TW-IDF weighted closeness: 86.337
accuracy TW-ICW degree: 78.125
accuracy TF-IDF: 85.174
project: laboratori support faculti inform comput develop system group research project
student: work graduat interest home depart page student comput scienc univers
course: note lectur grade exam homework hour class syllabu instructor assign
faculty: public depart associ interest scienc univers comput fax research professor
