# Neural network

We'll try to improve our score by training a neural network on the following features, each computed for a pair of nodes:

- TF-IDF similarity between the two titles
- TF-IDF similarity between the two abstracts
- Absolute difference between the two years
- Number of common authors
- Whether both share the same journal (1) or not (0)

In [11]:
import numpy as np
import sklearn as skl
import csv
import nltk
import pandas as pd
import matplotlib.pyplot as plt

# Porter stemmer instance reused by the TF-IDF preprocessing to stem every token.
stemmer = nltk.stem.PorterStemmer()

In [3]:
# Load the node information table and normalise its missing values in place.
#
# Column 3 (used further down for the common-authors feature) is turned into a
# set of names split on ", "; column 4 (used for the common-journal feature)
# becomes '' when it is missing.
# NOTE(review): read_csv treats the first row as a header; if the file has no
# header row this silently drops one node (header=None would fix it) - confirm.

node_inf_raw = pd.read_csv("./node_information.csv")
node_inf = node_inf_raw.values
for row in node_inf:
    # `row` is a view on node_inf, so assigning into it updates the table.
    authors = row[3]
    # A float in a text column is how a missing (NaN) cell shows up here.
    row[3] = set() if type(authors) is float else set(authors.split(", "))

    if type(row[4]) is float:
        row[4] = ''

In [8]:
# Lookup table from node ID (column 0 of node_inf) to the node's row position,
# so that comparing two nodes does not require scanning the table.

reverse_index = {node_id: position for position, node_id in enumerate(node_inf[:, 0])}

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build one TF-IDF vector per node for column 2 (titles) and one for column 5
# (abstracts), after tokenising and stemming the text.

def _stemmed(text):
    """Tokenise `text`, stem every token and join the stems with single spaces."""
    return ' '.join(map(stemmer.stem, nltk.tokenize.word_tokenize(text)))

corpus_titles    = [_stemmed(element[2]) for element in node_inf]
corpus_abstracts = [_stemmed(element[5]) for element in node_inf]

vectorizer_titles   = TfidfVectorizer(stop_words="english")
vectorizer_abstract = TfidfVectorizer(stop_words="english")

# Row i of each matrix belongs to row i of node_inf.
features_TFIDF_titles    = vectorizer_titles.fit_transform(corpus_titles)
features_TFIDF_abstracts = vectorizer_abstract.fit_transform(corpus_abstracts)

In [36]:
def TFIDF_corr(ID1: int, ID2: int):
    """Return `[title_score, abstract_score]` for a pair of nodes.

    Each score is the dot product of the two nodes' rows in the corresponding
    TF-IDF matrix (`features_TFIDF_titles`, then `features_TFIDF_abstracts`).

    ID1, ID2: node IDs, resolved to matrix rows through `reverse_index`
    (an unknown ID raises KeyError).
    """
    row1, row2 = reverse_index[ID1], reverse_index[ID2]

    scores = []
    for features in (features_TFIDF_titles, features_TFIDF_abstracts):
        # (1 x V) . (V x 1) gives a 1 x 1 matrix; pull out its single entry.
        product = features[row1].dot(features[row2].T)
        scores.append(product[0, 0])
    return scores

In [24]:
# Load the train and test pair tables as plain numpy arrays (train first, then test).
# Later cells read columns 0 and 1 as the two node IDs and column 2 as the target.
train_raw, test_raw = (
    np.array(pd.read_csv(path).values)
    for path in ("./paul_my_train.csv", "./paul_my_test.csv")
)

In [27]:
# TF-IDF scores ([title, abstract]) for every pair; the node IDs are in columns 0 and 1.
train_TFIDF = np.array([TFIDF_corr(first, second) for first, second in train_raw[:, :2]])
test_TFIDF  = np.array([TFIDF_corr(first, second) for first, second in test_raw[:, :2]])

In [31]:
def _year_gaps(pairs):
    """Absolute difference between column 1 of node_inf for the two nodes of each pair."""
    gaps = []
    for e in pairs:
        first_year  = node_inf[reverse_index[e[0]], 1]
        second_year = node_inf[reverse_index[e[1]], 1]
        gaps.append(abs(first_year - second_year))
    return np.array(gaps)

train_year_diff = _year_gaps(train_raw)
test_year_diff  = _year_gaps(test_raw)

In [46]:
def _shared_author_counts(pairs):
    """Size of the intersection of the two nodes' author sets (column 3) for each pair."""
    counts = []
    for e in pairs:
        first_authors  = node_inf[reverse_index[e[0]], 3]
        second_authors = node_inf[reverse_index[e[1]], 3]
        counts.append(len(first_authors & second_authors))
    return np.array(counts)

train_common_authors = _shared_author_counts(train_raw)
test_common_authors  = _shared_author_counts(test_raw)

In [64]:
def _same_journal_flags(pairs):
    """1 when the first node has a non-empty journal (column 4) equal to the second node's, else 0."""
    flags = []
    for e in pairs:
        first_journal = node_inf[reverse_index[e[0]], 4]
        if first_journal != '':
            # Only compare when the first journal is known: two empty journals are not a match.
            flag = (first_journal == node_inf[reverse_index[e[1]], 4]) * 1
        else:
            flag = 0
        flags.append(flag)
    return np.array(flags)

train_common_journal = _same_journal_flags(train_raw)
test_common_journal  = _same_journal_flags(test_raw)

In [96]:
# Assemble the design matrices: the scalar features are reshaped into single
# columns and appended to the two TF-IDF columns, giving 5 columns per pair.
# The target is column 2 of the raw pair table.
n_train = len(train_raw)
train_year_diff, train_common_authors, train_common_journal = (
    np.reshape(feature, (n_train, 1))
    for feature in (train_year_diff, train_common_authors, train_common_journal)
)
X_train = np.concatenate(
    (train_TFIDF, train_year_diff, train_common_authors, train_common_journal),
    axis=1,
)
y_train = train_raw[:, 2]

n_test = len(test_raw)
test_year_diff, test_common_authors, test_common_journal = (
    np.reshape(feature, (n_test, 1))
    for feature in (test_year_diff, test_common_authors, test_common_journal)
)
X_test = np.concatenate(
    (test_TFIDF, test_year_diff, test_common_authors, test_common_journal),
    axis=1,
)
y_test = test_raw[:, 2]

In [101]:
from sklearn import preprocessing

In [110]:
# Standardise each feature column with statistics estimated on the training
# set only, then display the scaled matrix next to the raw one.
scaler = preprocessing.StandardScaler().fit(X_train)
transformed = scaler.transform(X_train)
print(transformed, "\n\n", X_train)
# Per-column scale the scaler divides by.
scaler.scale_

[[ 0.55936389  0.1959633  -1.14733099 -0.20162147  2.8339985 ]
 [ 0.72749016 -0.36635898 -0.73658914 -0.20162147 -0.35285834]
 [-0.52470042 -0.82658825 -0.3258473  -0.20162147 -0.35285834]
 ...
 [-0.52470042  0.04324563 -0.3258473  -0.20162147 -0.35285834]
 [-0.52470042  0.52816893  0.08489455 -0.20162147 -0.35285834]
 [-0.52470042 -0.11580755 -0.73658914 -0.20162147  2.8339985 ]] 

 [[0.15737467 0.12018774 0.         0.         1.        ]
 [0.18178172 0.06381189 1.         0.         0.        ]
 [0.         0.01767141 2.         0.         0.        ]
 ...
 [0.         0.10487697 2.         0.         0.        ]
 [0.         0.15349316 3.         0.         0.        ]
 [0.         0.08893103 1.         0.         1.        ]]


array([0.14517097, 0.10025542, 2.43461924, 0.35630459, 0.31378881])

In [89]:
import tensorflow as tf

In [99]:
# Feed-forward classifier over the 5 pair features (2 TF-IDF scores, year
# difference, common-author count, common-journal flag).
inputs = tf.keras.Input(shape=(5,))

# Two hidden layers of 64 sigmoid units each.
x = tf.keras.layers.Dense(64, activation='sigmoid')(inputs)
x = tf.keras.layers.Dense(64, activation='sigmoid')(x)

# Two-way softmax: one probability per class.
predictions = tf.keras.layers.Dense(2, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=predictions)
# The output layer already produces softmax probabilities and the model is
# trained on a single label column, so the matching loss is sparse categorical
# cross-entropy. The previous tf.losses.softmax_cross_entropy expects one-hot
# labels and raw logits, which applied softmax twice and mis-read the labels.
# Assumes y_train holds integer class ids (0/1) - TODO confirm.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [100]:
# Train on the standardised features: the scaler fitted on X_train was never
# applied before, so the network saw raw columns on very different scales.
# Any later evaluation or prediction must pass X_test through the same
# scaler.transform before calling the model.
model.fit(scaler.transform(X_train), y_train, batch_size=50, epochs=10)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 