## <span style="color:navy">Alternative 1 : Classifying articles using community detection citation graph <span>


### **Loading the packages we are going to use.**


In [1]:
import networkx as nx
import numpy as np
import csv
from networkx.algorithms.community.label_propagation import label_propagation_communities
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

### **Community detection using label propagation --> cluster creation**


In [2]:
def community_detection(G):
    n_clusters = 0
    cluster_assignments = list()
    communities = label_propagation_communities(G)
    for com in communities:
        n_clusters +=1
        cluster_assignments.append(list(com))
    cluster_assignments = dict(enumerate(cluster_assignments))
    return n_clusters, cluster_assignments

### **Creating the citation graph.**

In [3]:
# Create a directed graph
G = nx.read_edgelist('Cit-HepTh.txt', delimiter='\t', create_using=nx.DiGraph())

In [4]:
print("Nodes: ", G.number_of_nodes())
print("Edges: ", G.number_of_edges())

Nodes:  27770
Edges:  352807


### **Community detection and cluster creation .**

In [5]:
# Perform community detection
n_clusters1, cluster_assignments = community_detection(G.to_undirected())

### **Reading data (document ids and the corresponding journal they were published in).**

### ** We have 28 journals **

In [6]:
# Read training data
all_ids = list()
y_all = list()
with open('train.csv', 'r') as f:
    next(f)
    for line in f:
        t = line.split(',')
        all_ids.append(t[0])
        y_all.append(t[1][:-1])
n_all = len(all_ids)
unique = np.unique(y_all)
print("\nNumber of classes: ", unique.size)


Number of classes:  28


### **Splitting in train and validation set in order to run my own evaluation before uploading the results on kaggle**

In [7]:
train_ids,valid_ids, y_train, y_val = train_test_split(all_ids, y_all, test_size=0.2, random_state=1)

In [8]:
n_train = len(train_ids)
n_valid = len(valid_ids)
print("Articles used for training :",len(train_ids))
print("Articles used for validation :",len(valid_ids))

Articles used for training : 12272
Articles used for validation : 3069


### **Creating cluster dictionary in order to use article ids as keys**

In [9]:
reverse_clusters = {tuple(v) : k for k, v in cluster_assignments.items()}

### **Create the training, validation and test matrices. Each row corresponds to an article.**

Use the following 4 features for each article:
 - out-degree of node
 - in-degree of node
 - average degree of neighborhood of node
 - a vector that serves as an indicator of the **Community Detection **cluster each node is assigned to

In [10]:
train_avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_ids)
valid_avg_neig_deg =nx.average_neighbor_degree(G, nodes=valid_ids)
X_train = np.zeros((n_train, 3+n_clusters1))
X_valid = np.zeros((n_valid, 3+n_clusters1))
for i in range(n_train):
    X_train[i,0] = G.out_degree(train_ids[i])
    X_train[i,1] = G.in_degree(train_ids[i])
    X_train[i,2] = train_avg_neig_deg[train_ids[i]]
    for list_ids in reverse_clusters.keys():
        if(train_ids[i] in list_ids):
            X_train[i,reverse_clusters[list_ids]+3] = 1
            break

for i in range(n_valid):
    X_valid[i,0] = G.out_degree(valid_ids[i])
    X_valid[i,1] = G.in_degree(valid_ids[i])
    X_valid[i,2] = valid_avg_neig_deg[valid_ids[i]]
    for list_ids in reverse_clusters.keys():
        if(valid_ids[i] in list_ids):
            X_valid[i,reverse_clusters[list_ids]+3] = 1
            break

In [11]:
# Read test data
test_ids = list()
with open('test.csv', 'r') as f:
    next(f)
    for line in f:
        test_ids.append(line[:-2])

In [12]:
# Create the test matrix
n_test = len(test_ids)
avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_ids)
X_test = np.zeros((n_test, 3+n_clusters1))
for i in range(n_test):
    X_test[i,0] = G.out_degree(test_ids[i])
    X_test[i,1] = G.in_degree(test_ids[i])
    X_test[i,2] = avg_neig_deg[test_ids[i]]
    for list_ids in reverse_clusters.keys():
        if(test_ids[i] in list_ids):
            X_test[i,reverse_clusters[list_ids]+3] = 1
            break

In [13]:
print("Train matrix dimensionality: ", X_train.shape)
print("Valid matrix dimensionality: ", X_valid.shape)
print("Test matrix dimensionality: ", X_test.shape)

Train matrix dimensionality:  (12272, 1138)
Valid matrix dimensionality:  (3069, 1138)
Test matrix dimensionality:  (3836, 1138)


### **Logistic regression classifier to classify the articles of the validation set **

In [14]:
# Use logistic regression to classify the articles of the test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_valid)
loss = log_loss(y_val,y_pred)
print(loss)

2.30051261713


## <span style="color:navy"> Alternative 2 : Classifying articles using Community Louvain <span>

### ** Importing corresponding package and generating the clusters out of the best partition of the Graph**

In [15]:
import community_louvain
partition = community_louvain.best_partition(G.to_undirected())

In [16]:
n_clusters2 = len(np.unique(list(partition.values())))

### **Create the training, validation and test matrices. Each row corresponds to an article.**

Use the following 4 features for each article:
 - out-degree of node
 - in-degree of node
 - average degree of neighborhood of node
 - a vector that serves as an indicator of the **Community-Louvain** cluster each node is assigned to

In [17]:
train_avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_ids)
X_train = np.zeros((n_train, 3+n_clusters2))
for i in range(n_train):
    X_train[i,0] = G.out_degree(train_ids[i])
    X_train[i,1] = G.in_degree(train_ids[i])
    X_train[i,2] = train_avg_neig_deg[train_ids[i]]
    X_train[i,partition[train_ids[i]]+ 3] = 1

In [18]:
valid_avg_neig_deg = nx.average_neighbor_degree(G, nodes=valid_ids)
X_valid = np.zeros((n_valid, 3+n_clusters2))
for i in range(n_valid):
    X_valid[i,0] = G.out_degree(valid_ids[i])
    X_valid[i,1] = G.in_degree(valid_ids[i])
    X_valid[i,2] = valid_avg_neig_deg[valid_ids[i]]
    X_valid[i,partition[valid_ids[i]]+ 3] = 1

In [19]:
# Create the test matrix
test_avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_ids)
X_test = np.zeros((n_test, 3+n_clusters2))
for i in range(n_test):
    X_test[i,0] = G.out_degree(test_ids[i])
    X_test[i,1] = G.in_degree(test_ids[i])
    X_test[i,2] = test_avg_neig_deg[test_ids[i]]
    X_test[i,partition[test_ids[i]]+ 3] = 1

In [20]:
print("Train matrix dimensionality: ", X_train.shape)
print("Valid matrix dimensionality: ", X_valid.shape)
print("Test matrix dimensionality: ", X_test.shape)

Train matrix dimensionality:  (12272, 180)
Valid matrix dimensionality:  (3069, 180)
Test matrix dimensionality:  (3836, 180)


### **Logistic regression classifier to classify the articles of the validation set **

In [21]:
# Use logistic regression to classify the articles of the test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_valid)
loss = log_loss(y_val,y_pred)
print(loss)

2.32631069591


## <span style="color:navy"> Alternative 3 : Classifying articles using both Community Detection and Community Louvain <span>

### **Create the training, validation and test matrices. Each row corresponds to an article.**

Use the following 5 features for each article:
 - out-degree of node
 - in-degree of node
 - average degree of neighborhood of node
 - a vector that serves as an indicator of the **Community-Detection** cluster each node is assigned to
 - a vector that serves as an indicator of the **Community-Louvain** cluster each node is assigned to

In [22]:
train_avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_ids)
X_train = np.zeros((n_train, 3+n_clusters1+n_clusters2))
for i in range(n_train):
    X_train[i,0] = G.out_degree(train_ids[i])
    X_train[i,1] = G.in_degree(train_ids[i])
    X_train[i,2] = train_avg_neig_deg[train_ids[i]]
    X_train[i,partition[train_ids[i]]+ 3] = 1
    for list_ids in reverse_clusters.keys():
        if(train_ids[i] in list_ids):
            X_train[i,reverse_clusters[list_ids]+3+n_clusters2] = 1
            break

In [23]:
valid_avg_neig_deg = nx.average_neighbor_degree(G, nodes=valid_ids)
X_valid = np.zeros((n_valid, 3+n_clusters2+n_clusters1))
for i in range(n_valid):
    X_valid[i,0] = G.out_degree(valid_ids[i])
    X_valid[i,1] = G.in_degree(valid_ids[i])
    X_valid[i,2] = valid_avg_neig_deg[valid_ids[i]]
    X_valid[i,partition[valid_ids[i]]+ 3] = 1
    for list_ids in reverse_clusters.keys():
        if(valid_ids[i] in list_ids):
            X_valid[i,reverse_clusters[list_ids]+3+n_clusters2] = 1
            break

In [24]:
# Create the test matrix
test_avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_ids)
X_test = np.zeros((n_test,  3+n_clusters2+n_clusters1))
for i in range(n_test):
    X_test[i,0] = G.out_degree(test_ids[i])
    X_test[i,1] = G.in_degree(test_ids[i])
    X_test[i,2] = test_avg_neig_deg[test_ids[i]]
    X_test[i,partition[test_ids[i]]+ 3] = 1
    for list_ids in reverse_clusters.keys():
        if(test_ids[i] in list_ids):
            X_test[i,reverse_clusters[list_ids]+3+n_clusters2] = 1
            break

In [25]:
print("Train matrix dimensionality: ", X_train.shape)
print("Valid matrix dimensionality: ", X_valid.shape)
print("Test matrix dimensionality: ", X_test.shape)

Train matrix dimensionality:  (12272, 1315)
Valid matrix dimensionality:  (3069, 1315)
Test matrix dimensionality:  (3836, 1315)


### **Logistic regression classifier to classify the articles of the validation set **


In [26]:
# Use logistic regression to classify the articles of the test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_valid)
loss = log_loss(y_val,y_pred)
print(loss)

2.2732457279


 | **Community Detection**| **Community Louvain**  | **Both**  |
 | :-------------:| :-----: :---------: |
 |  2.30051261713 |  2.32631069591 | **2.2732457279 **|

In [27]:
y_pred = clf.predict_proba(X_test)
# Write predictions to a file
with open('graph_community_detection_louvain.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Article")
    writer.writerow(lst)
    for i,test_id in enumerate(test_ids):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_id)
        writer.writerow(lst)

### <span style="color:navy"> Kaggle evaluation : 2.20234<span>

## <span style="color:navy"> Alternative 4 : Classifying articles using Community Detection/Louvain/Status<span>

In [28]:
import community_status

### **Instantiating and initializing community Status **


In [30]:
status = community_status.Status()

In [38]:
status.init(G,1)

### **Create the training, validation and test matrices. Each row corresponds to an article.**

Use the following 6 features for each article:
 - out-degree of node
 - in-degree of node
 - average degree of neighborhood of node
 - community status degree
 - a vector that serves as an indicator of the **Community-Detection** cluster each node is assigned to
 - a vector that serves as an indicator of the **Community-Louvain** cluster each node is assigned to

In [64]:
train_avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_ids)
X_train = np.zeros((n_train, 4+n_clusters1+n_clusters2))
for i in range(n_train):
    X_train[i,0] = G.out_degree(train_ids[i])
    X_train[i,1] = G.in_degree(train_ids[i])
    X_train[i,2] = train_avg_neig_deg[train_ids[i]]
    X_train[i,3] = status.gdegrees[train_ids[i]]
    for list_ids in reverse_clusters.keys():
        if(train_ids[i] in list_ids):
            X_train[i,reverse_clusters[list_ids]+3+n_clusters2] = 1
            break
            
alid_avg_neig_deg = nx.average_neighbor_degree(G, nodes=valid_ids)
X_valid = np.zeros((n_valid, 4+n_clusters1+n_clusters2))
for i in range(n_valid):
    X_valid[i,0] = G.out_degree(valid_ids[i])
    X_valid[i,1] = G.in_degree(valid_ids[i])
    X_valid[i,2] = valid_avg_neig_deg[valid_ids[i]]
    X_valid[i,3] = status.gdegrees[valid_ids[i]]
    for list_ids in reverse_clusters.keys():
        if(valid_ids[i] in list_ids):
            X_valid[i,reverse_clusters[list_ids]+3+n_clusters2] = 1
            break
            
test_avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_ids)
X_test = np.zeros((n_test, 4+n_clusters1+n_clusters2))
for i in range(n_test):
    X_test[i,0] = G.out_degree(test_ids[i])
    X_test[i,1] = G.in_degree(test_ids[i])
    X_test[i,2] = test_avg_neig_deg[test_ids[i]]
    X_test[i,3] = status.gdegrees[test_ids[i]]
    for list_ids in reverse_clusters.keys():
        if(test_ids[i] in list_ids):
            X_test[i,reverse_clusters[list_ids]+3+n_clusters2] = 1
            break

In [65]:
print("Train matrix dimensionality: ", X_train.shape)
print("Valid matrix dimensionality: ", X_valid.shape)
print("Test matrix dimensionality: ", X_test.shape)

Train matrix dimensionality:  (12272, 1316)
Valid matrix dimensionality:  (3069, 1316)
Test matrix dimensionality:  (3836, 1316)


In [66]:
# Use logistic regression to classify the articles of the test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_valid)
loss = log_loss(y_val,y_pred)
print(loss)

2.30057382345


 ### Final Results
 | **Community Detection**| **Community Louvain**  | **Both**  |   ** Plus Community Status**
 | :-------------:| :-----: :---------: |
 |  2.30051261713 |  2.32631069591 | **2.2732457279 **|  2.30057382345 |