## <span style="color:navy">Alternative 1 : Classifying articles using in-degree, out-degree and average neighbor degree</span>

### **Loading the packages we are going to use.**

In [59]:
import networkx as nx
import numpy as np
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

### **Creating the citation graph.**

In [60]:
# Build the directed citation graph from the tab-separated edge list file.
G = nx.read_edgelist(
    'Cit-HepTh.txt',
    create_using=nx.DiGraph(),
    delimiter='\t',
)

In [61]:
# Report the size of the citation graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print("Nodes: ", node_count)
print("Edges: ", edge_count)

Nodes:  27770
Edges:  352807


### **Reading data (document ids and the corresponding journal they were published in).**

### **We have 28 journals**

In [62]:
# Read training data: after the header, each line is "<article id>,<journal>".
all_ids = list()
y_all = list()
with open('train.csv', 'r') as f:
    next(f)  # skip the header row
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. an empty line at the end of the file)
        t = line.split(',')
        all_ids.append(t[0])
        # Remove only the line terminator rather than blindly dropping the last
        # character, so a final line without a trailing newline keeps its full label.
        y_all.append(t[1].rstrip('\n'))
n_all = len(all_ids)
unique = np.unique(y_all)
print("\nNumber of classes: ", unique.size)


Number of classes:  28


### **Splitting into training and validation sets in order to run my own evaluation before uploading the results to Kaggle**

In [63]:
# Hold out 20% of the labelled articles for validation; the fixed seed makes the split repeatable.
train_ids, valid_ids, y_train, y_val = train_test_split(
    all_ids,
    y_all,
    test_size=0.2,
    random_state=1,
)

In [64]:
# Sizes of the two partitions; they are reused below to allocate the feature matrices.
n_train, n_valid = len(train_ids), len(valid_ids)
print("Articles used for training :", n_train)
print("Articles used for validation :", n_valid)

Articles used for training : 12272
Articles used for validation : 3069


### **Create the training, validation and test matrices. Each row corresponds to an article.**

Use the following 3 features for each article:
 - out-degree of node
 - in-degree of node
 - average degree of neighborhood of node

In [65]:
# Average neighbour degree for the articles of each split, as a mapping keyed by article id.
train_avg_neig_deg, valid_avg_neig_deg = (
    nx.average_neighbor_degree(G, nodes=split_ids)
    for split_ids in (train_ids, valid_ids)
)

In [159]:
# Training feature matrix: one row per article, with columns
# [out-degree, in-degree, average neighbour degree].
X_train = np.zeros((n_train, 3))
for row, article in enumerate(train_ids):
    X_train[row] = (
        G.out_degree(article),
        G.in_degree(article),
        train_avg_neig_deg[article],
    )

In [160]:
# Validation feature matrix: one row per article, with columns
# [out-degree, in-degree, average neighbour degree].
X_valid = np.zeros((n_valid, 3))
for row, article in enumerate(valid_ids):
    X_valid[row] = (
        G.out_degree(article),
        G.in_degree(article),
        valid_avg_neig_deg[article],
    )

In [69]:
# Read test data: skip the header, then keep every line minus its last two characters.
# NOTE(review): the fixed two-character trim presumably removes a trailing
# separator plus the newline -- confirm against the actual layout of test.csv.
test_ids = list()
with open('test.csv', 'r') as f:
    next(f)
    test_ids.extend(line[:-2] for line in f)

In [161]:
# Test feature matrix, built from the same three features as the training and
# validation matrices: [out-degree, in-degree, average neighbour degree].
n_test = len(test_ids)
test_avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_ids)
X_test = np.zeros((n_test, 3))
for row, article in enumerate(test_ids):
    X_test[row] = (
        G.out_degree(article),
        G.in_degree(article),
        test_avg_neig_deg[article],
    )

In [71]:
# Sanity check: print the (rows, columns) shape of every feature matrix.
for caption, matrix in (
    ("Train matrix dimensionality: ", X_train),
    ("Validation matrix dimensionality: ", X_valid),
    ("Test matrix dimensionality: ", X_test),
):
    print(caption, matrix.shape)

Train matrix dimensionality:  (12272, 3)
Validation matrix dimensionality:  (3069, 3)
Test matrix dimensionality:  (3836, 3)


### **Logistic regression classifier to classify the articles of the validation set**

In [72]:
# Fit logistic regression on the training features and score it on the validation set
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_valid)
# predict_proba orders its columns by clf.classes_; pass that ordering explicitly so
# log_loss stays well-defined even if some journal never occurs in the validation labels.
loss = log_loss(y_val, y_pred, labels=clf.classes_)
print(loss)

2.4083371584


## <span style="color:navy">Alternative 2 : </span>
  ###  <span style="color:navy">Classifying articles using in-degree, out-degree, average neighbor degree and closeness centrality.</span>

### **Computing closeness centrality of every article that belongs to the train, validation and test sets**

In [82]:
# Closeness centrality of every training article, computed one node at a time.
train_cc = np.array(
    [nx.closeness_centrality(G, article) for article in train_ids],
    dtype=float,
)

In [84]:
# Closeness centrality of every validation article, computed one node at a time.
valid_cc = np.array(
    [nx.closeness_centrality(G, article) for article in valid_ids],
    dtype=float,
)

In [85]:
# Closeness centrality of every test article, computed one node at a time.
test_cc = np.array(
    [nx.closeness_centrality(G, article) for article in test_ids],
    dtype=float,
)

### **Computing betweenness centrality of every article that belongs to the train, validation and test sets**

### **Appending the new features to the train, validation and test matrices**

In [153]:
# Wrap each centrality value in its own one-element list, so every collection
# becomes a single column once it is converted to an array.
train_cc2 = [[value] for value in train_cc]
valid_cc2 = [[value] for value in valid_cc]
test_cc2 = [[value] for value in test_cc]

In [162]:
# Add closeness centrality as a fourth column next to the three degree features.
# Rebuilding from the first three columns keeps this cell idempotent: re-running
# it replaces the centrality column instead of appending another copy each time.
N_DEGREE_FEATURES = 3
X_train = np.column_stack((X_train[:, :N_DEGREE_FEATURES], train_cc))
X_valid = np.column_stack((X_valid[:, :N_DEGREE_FEATURES], valid_cc))
X_test = np.column_stack((X_test[:, :N_DEGREE_FEATURES], test_cc))

### **Logistic regression classifier to classify the articles of the validation set**

In [164]:
# Refit logistic regression on the extended feature matrix and score it on the validation set
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_valid)
# predict_proba orders its columns by clf.classes_; pass that ordering explicitly so
# log_loss stays well-defined even if some journal never occurs in the validation labels.
loss = log_loss(y_val, y_pred, labels=clf.classes_)
print(loss)

2.39597954685


 | **First alternative**| **Second Alternative**  |
 | :-------------:| :-----: |
 |  2.4083371584  | **2.39597954685**|

In [165]:
# Class-membership probabilities for the test articles (columns follow clf.classes_).
y_pred = clf.predict_proba(X=X_test)

In [166]:
# Write predictions to a file: a header row ("Article" followed by the class
# labels), then one row per test article holding its id and the predicted
# probability of every class.
# newline='' hands line-ending control to the csv writer; without it, output
# written on Windows contains an empty line after every row.
with open('graph_baseline_results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(["Article"] + clf.classes_.tolist())
    for i, test_id in enumerate(test_ids):
        writer.writerow([test_id] + y_pred[i, :].tolist())

### <span style="color:navy"> Kaggle evaluation : 2.33636</span>