## <span style="color:navy">Alternative 1: Classifying articles using Convolutional Neural Networks</span>

### **Loading the packages we are going to use.**

In [4]:
import numpy as np
import pandas as pd
import csv
from utils import load_embeddings, preprocessing, get_vocab, add_unknown_words, create_train_test_loaders
from model import CNN
import torch 
import torch.nn as nn
import torch.utils.data as utils
from torch.autograd import Variable



### **Function that returns the number of words in the longest abstract.**

In [5]:
def compute_max_length(abstracts):
    """Return the length of the longest document in ``abstracts``.

    Parameters
    ----------
    abstracts : iterable
        Documents to measure; each element must support ``len()``
        (e.g. a list of tokens).

    Returns
    -------
    int
        The largest ``len(document)`` found, or 0 when ``abstracts`` is
        empty (a bare ``max()`` would raise ``ValueError`` in that case).
    """
    return max((len(document) for document in abstracts), default=0)

### **Function that generates the Train and Test matrices**

In [6]:
def _encode_documents(documents, vocab, max_length):
    """Encode tokenised documents as one zero-padded row of vocabulary indices each."""
    # dtype is left at numpy's default (float64) so downstream consumers see
    # exactly the same array type as before.
    matrix = np.zeros((len(documents), max_length))
    for row, tokens in enumerate(documents):
        # Slice so that a document longer than max_length is truncated
        # instead of writing past the end of the row.
        for col, token in enumerate(tokens[:max_length]):
            matrix[row, col] = vocab[token]
    return matrix


def create_train_test_matrices(train_abstracts_processed, test_abstracts_processed, vocab, max_length):
    """Build the index matrices for the training and test documents.

    Element ``[i][j]`` holds ``vocab[token]`` for the j-th token of document i.
    Positions past the end of a document stay 0.

    Parameters
    ----------
    train_abstracts_processed, test_abstracts_processed : sequence of sequences
        Tokenised documents; every token must be a key of ``vocab``.
    vocab : mapping
        Token -> integer index lookup.
    max_length : int
        Number of columns of both matrices. Documents with more tokens are
        truncated to this length.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X_train`` of shape ``(len(train), max_length)`` and ``X_test`` of
        shape ``(len(test), max_length)``.

    Raises
    ------
    KeyError
        If a token is missing from a dict-like ``vocab``.
    """
    X_train = _encode_documents(train_abstracts_processed, vocab, max_length)
    X_test = _encode_documents(test_abstracts_processed, vocab, max_length)
    return X_train, X_test

### **Loading data about each article in a dataframe (id/year/title/authors/abstract)**

In [7]:
# Load the per-article metadata into a dataframe. The preview below shows the
# columns id, year, title, authors and abstract.
df = pd.read_csv("node_information.csv")
# Print the first rows as a quick sanity check of the parsed columns.
print(df.head())

     id  year                                              title  \
0  1001  2000              compactification geometry and duality   
1  1002  2000  domain walls and massive gauged supergravity p...   
2  1003  2000     comment on metric fluctuations in brane worlds   
3  1004  2000         moving mirrors and thermodynamic paradoxes   
4  1005  2000  bundles of chiral blocks and boundary conditio...   

                       authors  \
0            Paul S. Aspinwall   
1  M. Cvetic, H. Lu, C.N. Pope   
2     Y.S. Myung, Gungwon Kang   
3               Adam D. Helfer   
4      J. Fuchs, C. Schweigert   

                                            abstract  
0  these are notes based on lectures given at tas...  
1  we point out that massive gauged supergravity ...  
2  recently ivanov and volovich hep-th 9912242 cl...  
3  quantum fields responding to moving mirrors ha...  
4  proceedings of lie iii clausthal july 1999 var...  


### **Reading data (document ids and the corresponding journal they were published in).**


### **We have 28 journals**

In [8]:
# Read training data: each row of train.csv is "<article id>,<journal label>".
train_ids = list()
class_labels = list()
with open('train.csv', 'r') as f:
    # Skip the header row; the default keeps an empty file from raising.
    next(f, None)
    for line in f:
        # Strip only the line terminator. Slicing off a fixed last character
        # would eat part of the label when the final line has no newline.
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        t = line.split(',')
        train_ids.append(t[0])
        class_labels.append(t[1])

In [9]:
# Number of training articles (one id was appended per row of train.csv).
n_train = len(train_ids)
# np.unique returns the distinct labels in sorted order; that order defines
# the class indices assigned in the next cell.
unique = np.unique(class_labels)
print("\nNumber of classes: ", unique.size)


Number of classes:  28


### **Indexing the unique classes in order to use them as targets (Y) for the classifier**

In [10]:
# Give every journal label its position in the sorted `unique` array, so each
# class has a stable integer index.
class_label_to_idx = {label: position for position, label in enumerate(unique)}

In [11]:
# Display the label -> index mapping to check the class ordering.
class_label_to_idx

{'Acta': 0,
 'Adv.Theor.Math.Phys.': 1,
 'Annals': 2,
 'Class.Quant.Grav.': 3,
 'Commun.Math.Phys.': 4,
 'Eur.Phys.J.': 5,
 'Fortsch.Phys.': 6,
 'Int.': 7,
 'Int.J.Mod.Phys.': 8,
 'Int.J.Theor.Phys.': 9,
 'J.Geom.Phys.': 10,
 'J.Math.Phys.': 11,
 'J.Phys.': 12,
 'JHEP': 13,
 'Lett.Math.Phys.': 14,
 'Mod.': 15,
 'Mod.Phys.Lett.': 16,
 'Nucl.': 17,
 'Nucl.Phys.': 18,
 'Nucl.Phys.Proc.Suppl.': 19,
 'Nuovo': 20,
 'Phys.': 21,
 'Phys.Lett.': 22,
 'Phys.Rev.': 23,
 'Phys.Rev.Lett.': 24,
 'Prog.Theor.Phys.': 25,
 'Theor.Math.Phys.': 26,
 'Z.Phys.': 27}

### **Building the y_train matrix by one-hot encoding the class of each article.**

In [12]:
# One-hot encode the training labels: row r gets a single 1 in the column that
# `class_label_to_idx` assigns to the journal of article r.
y_train = np.zeros((n_train, unique.size), dtype=np.int64)
label_columns = np.array(
    [class_label_to_idx[class_labels[row]] for row in range(n_train)],
    dtype=np.intp,
)
y_train[np.arange(n_train), label_columns] = 1

### **Extracting the text (abstract, title and authors) of the training ids**

In [13]:
# Extract the text of each training article from the dataframe:
# abstract + title + authors, separated by single spaces.
# Index the dataframe by article id once, so each lookup is a direct index
# access instead of three full boolean scans of `df` per article.
# drop_duplicates keeps the first row per id, matching a first-match lookup.
train_articles_by_id = df.drop_duplicates(subset='id').set_index('id')
train_abstracts = list()
for i in train_ids:
    row = train_articles_by_id.loc[int(i)]
    # title/authors go through str() so a missing value does not break the
    # concatenation.
    train_abstracts.append(row['abstract'] + " " + str(row['title']) + " " + str(row['authors']))

### **Reading test ids**

In [14]:
# Read test data: the article id is the first field of every row of test.csv.
test_ids = list()
with open('test.csv', 'r') as f:
    # Skip the header row; the default keeps an empty file from raising.
    next(f, None)
    for line in f:
        # Take the first comma-separated field and strip surrounding whitespace
        # rather than slicing off a fixed number of trailing characters, so the
        # id is intact whatever the line ending (or a missing final newline).
        # NOTE(review): assumes any trailing separator in test.csv is a comma
        # or whitespace — verify against the file.
        test_id = line.split(',')[0].strip()
        if test_id:
            test_ids.append(test_id)

### **Extracting the abstracts that correspond to the test ids**

In [15]:
n_test = len(test_ids)
# Build the text of each test article: abstract + title + authors, separated
# by single spaces.
# Index the dataframe by article id once, so each lookup is a direct index
# access instead of three full boolean scans of `df` per article.
# drop_duplicates keeps the first row per id, matching a first-match lookup.
test_articles_by_id = df.drop_duplicates(subset='id').set_index('id')
test_abstracts = list()
for i in test_ids:
    row = test_articles_by_id.loc[int(i)]
    # title/authors go through str() so a missing value does not break the
    # concatenation.
    test_abstracts.append(row['abstract'] + " " + str(row['title']) + " " + str(row['authors']))

### **Cleaning the data**

In [16]:
# Clean both corpora with the project's `preprocessing` helper (imported from utils).
# Later cells index every processed document token by token and use the tokens
# as vocabulary keys, so the result is presumably a list of token lists —
# TODO confirm against utils.preprocessing.
train_abstracts_processed = preprocessing(train_abstracts)
test_abstracts_processed = preprocessing(test_abstracts)

### **Combining the abstracts of both train and test ids in a single list in order to extract the vocabulary and the maximum length.**

In [17]:
# Concatenate the processed documents, training ones first, so the vocabulary
# and the maximum length are computed over the whole corpus.
abstracts = [*train_abstracts_processed, *test_abstracts_processed]

### **Collecting the unique words of the abstracts.**

In [18]:
# Extract vocabulary
# `vocab` is used further down as a token -> integer index lookup
# (see create_train_test_matrices).
vocab = get_vocab(abstracts)
print('Vocab size:', len(vocab))

Vocab size: 28946


### **Creating word embeddings for every word in the vocabulary, whether or not it exists in the pretrained vectors**

In [19]:
# Look up the vocabulary in the pretrained GoogleNews word2vec file.
# NOTE(review): `unknown_words` presumably holds the vocabulary words with no
# pretrained vector (inferred from the name and the "Existing vecs" output) —
# confirm in utils.load_embeddings.
embeddings, unknown_words = load_embeddings('GoogleNews-vectors-negative300.bin.gz', vocab)

Existing vecs: 18101


In [20]:
# Add embeddings for the words that were not found in the pretrained file.
# NOTE(review): the return value is discarded, so this appears to modify
# `embeddings` in place — confirm in utils.add_unknown_words.
add_unknown_words(embeddings, vocab, unknown_words)

In [21]:
# Longest document over train + test; it is used as the number of columns of
# X_train / X_test, with shorter documents left zero-padded.
max_length = compute_max_length(abstracts)

### **Creating the training and test matrices.**
- Each row corresponds to an article and each column to a word position within that article (rows are zero-padded up to the length of the longest article).
- An element corresponds to the index of a vocabulary word that is present in the article. For example element [i][j] is the index that corresponds to the jth word of article i's abstract.

In [22]:
# Encode every document as a fixed-width row of vocabulary indices
# (entries past the end of a document stay 0).
X_train, X_test = create_train_test_matrices(train_abstracts_processed, test_abstracts_processed, vocab, max_length)

### **Creating CNN model and feeding it with the training matrix.**

In [23]:
# Hyperparameters
num_epochs = 20
batch_size = 100
learning_rate = 0.001

train_loader, test_loader = create_train_test_loaders(X_train, X_test, y_train, batch_size)

# Positional arguments, as passed here: sequence length, len(vocab)+1,
# number of classes, [2,3], [100,100] and the embedding matrix.
# NOTE(review): the +1 presumably reserves index 0 for padding, and the two
# lists look like filter sizes / filters per size — confirm in model.CNN.
cnn = CNN(max_length, len(vocab)+1, y_train.shape[1], [2,3], [100,100], embeddings)

# NLLLoss expects log-probabilities, so the first output of `cnn` is assumed to
# be log-softmax scores — TODO confirm in model.CNN.
criterion = nn.NLLLoss()

# Only optimise parameters that require gradients (frozen ones are filtered out).
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, cnn.parameters()), lr=learning_rate)

# Train the Model
for epoch in range(num_epochs):
    # The batch variables get their own names: reusing `abstracts` here would
    # overwrite the corpus list built earlier and break re-running the cells
    # that depend on it.
    for batch_abstracts, batch_labels in train_loader:
        batch_abstracts = Variable(batch_abstracts)
        batch_labels = Variable(batch_labels)

        optimizer.zero_grad()
        outputs, _ = cnn(batch_abstracts)
        # Labels are one-hot rows; NLLLoss needs class indices, taken here as
        # the position of the maximum along dimension 1.
        loss = criterion(outputs, torch.max(batch_labels, 1)[1])
        loss.backward()
        optimizer.step()

### **Predicting the classes of test article ids**

In [24]:
# Make predictions
y_pred = np.zeros((n_test, y_train.shape[1]))
# Switch the model to evaluation mode.
cnn.eval()
# Rows are filled by running offset so any test batch size works, not only 1.
# NOTE(review): assumes test_loader yields the articles in `test_ids` order
# (no shuffling) — confirm in utils.create_train_test_loaders.
row_start = 0
for batch_abstracts, _ in test_loader:
    batch_len = batch_abstracts.size(0)
    # The second output of `cnn` is the one used for predictions
    # (presumably class probabilities — confirm in model.CNN).
    _, outputs = cnn(Variable(batch_abstracts))
    y_pred[row_start:row_start + batch_len, :] = outputs.data.numpy().reshape(batch_len, -1)
    row_start += batch_len

### **Writing results to csv file in order to evaluate the model in Kaggle**


In [25]:
# Write predictions to a file
# newline='' hands line-ending control to the csv module; without it every row
# is followed by a blank line on platforms that translate '\n'.
with open('text_cnn.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # Header: "Article" followed by the class names, in the column order of y_pred.
    writer.writerow(["Article"] + unique.tolist())
    # One row per test article: its id followed by one score per class.
    for test_id, scores in zip(test_ids, y_pred):
        writer.writerow([test_id] + scores.tolist())

### <span style="color:navy">Kaggle evaluation: 1.99424</span>