# ADMAGD - NIPS dataset

## Importing libraries

In [74]:
import os
import pickle
import re
import sys
from collections import defaultdict
from pathlib import Path
from string import punctuation

import numpy as np
import pandas as pd
from gensim import corpora, models

In [72]:
# Append the parent of the current working directory to the import path
# (presumably so that the local `model` module can be imported).
# Add or remove `.parent` hops to match how deeply this notebook is nested.
working_dir = Path(os.getcwd()).resolve()
root_path = str(working_dir.parent)
sys.path.append(root_path)

In [73]:
import model

## Data fetching

In [2]:
# Load the three NIPS tables: the papers, the authors, and the
# paper <-> author link table.
papers, authors, paper_authors = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/papers.csv',
        'dataset/authors.csv',
        'dataset/paper_authors.csv',
    )
)

In [3]:
papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7241 non-null   int64 
 1   year        7241 non-null   int64 
 2   title       7241 non-null   object
 3   event_type  2422 non-null   object
 4   pdf_name    7241 non-null   object
 5   abstract    7241 non-null   object
 6   paper_text  7241 non-null   object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [4]:
# Drop every row that has a missing value in ANY column, modifying `papers`
# in place.
# NOTE(review): per the papers.info() output above, `event_type` is the only
# column with nulls (2422 of 7241 non-null), so this keeps only the papers
# that have an event type. If the goal was just to drop papers with missing
# text, pass `subset=[...]` instead — confirm the intent.
papers.dropna(axis=0, how='any', inplace=True)

In [5]:
papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2422 entries, 4261 to 6947
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          2422 non-null   int64 
 1   year        2422 non-null   int64 
 2   title       2422 non-null   object
 3   event_type  2422 non-null   object
 4   pdf_name    2422 non-null   object
 5   abstract    2422 non-null   object
 6   paper_text  2422 non-null   object
dtypes: int64(2), object(5)
memory usage: 151.4+ KB


In [6]:
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
4261,4857,2013,Scalable Influence Estimation in Continuous-Ti...,Oral,4857-scalable-influence-estimation-in-continuo...,If a piece of information is released from a m...,Scalable Influence Estimation in\nContinuous-T...
4262,4858,2013,Adaptive Anonymity via,Spotlight,4858-adaptive-anonymity-via-b-matching.pdf,The adaptive anonymity problem is formalized w...,Adaptive Anonymity via b-Matching\n\nKrzysztof...
4263,4859,2013,Exact and Stable Recovery of Pairwise Interact...,Spotlight,4859-exact-and-stable-recovery-of-pairwise-int...,Tensor completion from incomplete observations...,Exact and Stable Recovery of Pairwise Interact...
4265,4860,2013,Matrix factorization with binary components,Spotlight,4860-matrix-factorization-with-binary-componen...,Motivated by an application in computational b...,Matrix factorization with Binary Components\n\...
4266,4861,2013,On the Complexity and Approximation of Binary ...,Spotlight,4861-on-the-complexity-and-approximation-of-bi...,Lifted inference algorithms exploit symmetries...,On the Complexity and Approximation of\nBinary...


In [7]:
authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9784 entries, 0 to 9783
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      9784 non-null   int64 
 1   name    9783 non-null   object
dtypes: int64(1), object(1)
memory usage: 153.0+ KB


In [8]:
authors.head()

Unnamed: 0,id,name
0,1,Hisashi Suzuki
1,10,David Brady
2,100,Santosh S. Venkatesh
3,1000,Charles Fefferman
4,10000,Artur Speiser


In [9]:
paper_authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20838 entries, 0 to 20837
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   id         20838 non-null  int64
 1   paper_id   20838 non-null  int64
 2   author_id  20838 non-null  int64
dtypes: int64(3)
memory usage: 488.5 KB


In [10]:
paper_authors.head()

Unnamed: 0,id,paper_id,author_id
0,1,63,94
1,2,80,124
2,3,80,125
3,4,80,126
4,5,80,127


## Data Preparing

In [11]:
# Keep only the columns used downstream, ordered by paper id, and expose the
# id as `paper_id` so it matches the key used by the paper/author link table.
new_papers = (
    papers
    .sort_values(by='id')[['id', 'year', 'title', 'paper_text']]
    .rename(columns={'id': 'paper_id'})
)

In [12]:
new_papers

Unnamed: 0,paper_id,year,title,paper_text
4261,4857,2013,Scalable Influence Estimation in Continuous-Ti...,Scalable Influence Estimation in\nContinuous-T...
4262,4858,2013,Adaptive Anonymity via,Adaptive Anonymity via b-Matching\n\nKrzysztof...
4263,4859,2013,Exact and Stable Recovery of Pairwise Interact...,Exact and Stable Recovery of Pairwise Interact...
4265,4860,2013,Matrix factorization with binary components,Matrix factorization with Binary Components\n\...
4266,4861,2013,On the Complexity and Approximation of Binary ...,On the Complexity and Approximation of\nBinary...
...,...,...,...,...
6943,7280,2017,"On Separability of Loss Functions, and Revisit...","On Separability of Loss Functions, and Revisit..."
6944,7281,2017,Maxing and Ranking with Few Assumptions,Maxing and Ranking with Few Assumptions\nMoein...
6945,7282,2017,On clustering network-valued data,On clustering network-valued data\n\nSoumendu ...
6946,7283,2017,A General Framework for Robust Interactive Lea...,A General Framework for Robust Interactive\nLe...


In [13]:
# Rename the author columns positionally (id -> author_id, name -> author_name)
# so the table can be joined to the link table on `author_id`.
authors.columns = ['author_id', 'author_name']

# One row per (paper, author) pair, carrying the author's name.
merged_paper_author = paper_authors.merge(authors, on='author_id')
merged_paper_author = merged_paper_author[['paper_id', 'author_id', 'author_name']]

In [14]:
merged_paper_author

Unnamed: 0,paper_id,author_id,author_name
0,63,94,Yaser S. Abu-Mostafa
1,157,94,Yaser S. Abu-Mostafa
2,206,94,Yaser S. Abu-Mostafa
3,616,94,Yaser S. Abu-Mostafa
4,930,94,Yaser S. Abu-Mostafa
...,...,...,...
20833,7281,10425,Vaishakh Ravindrakumar
20834,7282,10427,Soumendu Sundar Mukherjee
20835,7282,10428,Lizhen Lin
20836,7283,10429,Ehsan Emamjomeh-Zadeh


In [15]:
# Collapse to one row per paper: `author_id` and `author_name` each become a
# list holding that paper's authors; reset_index() turns `paper_id` back into
# an ordinary column.
merged_paper_author = merged_paper_author.groupby('paper_id').agg(list).reset_index()

In [16]:
merged_paper_author

Unnamed: 0,paper_id,author_id,author_name
0,1,"[1, 2]","[Hisashi Suzuki, Suguru Arimoto]"
1,2,[3],[Philip A. Chou]
2,3,"[252, 7094]","[Eric B. Baum, Frank Wilczek]"
3,4,"[4, 5]","[John C. Platt, Alan H. Barr]"
4,5,[6],[Ralph Linsker]
...,...,...,...
7233,7280,"[6369, 7267, 10437]","[Pradeep K. Ravikumar, Adarsh Prasad, Alexandr..."
7234,7281,"[521, 8683, 10376, 10426, 10425]","[Alon Orlitsky, Moein Falahatgar, Venkatadheer..."
7235,7282,"[6868, 10427, 10428]","[Purnamrita Sarkar, Soumendu Sundar Mukherjee,..."
7236,7283,"[8662, 10429]","[David Kempe, Ehsan Emamjomeh-Zadeh]"


In [17]:
# Attach the per-paper author lists to the paper table. pd.merge defaults to an
# inner join, so papers without any author entry are dropped (2422 -> 2419
# rows according to the info() outputs).
merged_data = pd.merge(new_papers, merged_paper_author, on ='paper_id')

In [18]:
merged_data

Unnamed: 0,paper_id,year,title,paper_text,author_id,author_name
0,4857,2013,Scalable Influence Estimation in Continuous-Ti...,Scalable Influence Estimation in\nContinuous-T...,"[2173, 3161, 5356, 6268]","[Hongyuan Zha, Le Song, Nan Du, Manuel Gomez R..."
1,4858,2013,Adaptive Anonymity via,Adaptive Anonymity via b-Matching\n\nKrzysztof...,"[1766, 6269, 6270]","[Tony Jebara, Krzysztof M. Choromanski, Kui Tang]"
2,4859,2013,Exact and Stable Recovery of Pairwise Interact...,Exact and Stable Recovery of Pairwise Interact...,"[3777, 3779, 6271, 6272]","[Zenglin Xu, Irwin King, Shouyuan Chen, Michae..."
3,4860,2013,Matrix factorization with binary components,Matrix factorization with Binary Components\n\...,"[2732, 4931, 6273]","[Matthias Hein, Martin Slawski, Pavlo Lutsik]"
4,4861,2013,On the Complexity and Approximation of Binary ...,On the Complexity and Approximation of\nBinary...,"[2410, 6274]","[Adnan Darwiche, Guy Van den Broeck]"
...,...,...,...,...,...,...
2414,7280,2017,"On Separability of Loss Functions, and Revisit...","On Separability of Loss Functions, and Revisit...","[6369, 7267, 10437]","[Pradeep K. Ravikumar, Adarsh Prasad, Alexandr..."
2415,7281,2017,Maxing and Ranking with Few Assumptions,Maxing and Ranking with Few Assumptions\nMoein...,"[521, 8683, 10376, 10426, 10425]","[Alon Orlitsky, Moein Falahatgar, Venkatadheer..."
2416,7282,2017,On clustering network-valued data,On clustering network-valued data\n\nSoumendu ...,"[6868, 10427, 10428]","[Purnamrita Sarkar, Soumendu Sundar Mukherjee,..."
2417,7283,2017,A General Framework for Robust Interactive Lea...,A General Framework for Robust Interactive\nLe...,"[8662, 10429]","[David Kempe, Ehsan Emamjomeh-Zadeh]"


In [19]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2419 entries, 0 to 2418
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   paper_id     2419 non-null   int64 
 1   year         2419 non-null   int64 
 2   title        2419 non-null   object
 3   paper_text   2419 non-null   object
 4   author_id    2419 non-null   object
 5   author_name  2419 non-null   object
dtypes: int64(2), object(4)
memory usage: 113.5+ KB


## Data Cleaning

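This section uses the pandas `dropna()` method to remove rows with missing values. The explanation below is written for the earlier call `papers.dropna(axis=0, how='any', inplace=True)`; the cell in this section uses the same `axis` and `how` settings on `merged_data`, but without `inplace=True`, so it returns a new DataFrame (`cleaned_data`) and leaves `merged_data` unchanged. The parameters are: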

axis=0: This parameter specifies the axis along which the method is applied. axis=0 means the operation is performed along the rows. In other words, it looks for missing values in rows. Alternatively, axis=1 would mean the operation is performed along the columns, looking for missing values in columns.

how='any': This parameter determines how rows or columns are dropped when they have missing values (NaN). 'any' means that if any value in the row (or column, depending on the axis parameter) is NaN, that row (or column) will be dropped. The other option is 'all', which would only drop rows (or columns) where all values are NaN.

inplace=True: This parameter specifies whether to modify the original DataFrame (papers) directly. True means that the operation will modify the papers DataFrame in place, and no new DataFrame is created. If it were False (or omitted, as False is the default value), the operation would return a new DataFrame with the changes, leaving the original papers DataFrame unchanged.

So, to summarize, papers.dropna(axis=0, how='any', inplace=True) removes any rows from the papers DataFrame that contain at least one missing value (NaN). This operation modifies the papers DataFrame directly, rather than creating a new DataFrame with the changes.

In [20]:
# Build a new DataFrame without rows that contain a missing value in any
# column; `merged_data` itself is not modified (no inplace=True).
# NOTE(review): `author_id` / `author_name` cells are lists, so a NaN *inside*
# a list (authors.info() shows one author with a null name) would presumably
# not be caught by this call — confirm.
cleaned_data = merged_data.dropna(axis=0, how='any')

In [21]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2419 entries, 0 to 2418
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   paper_id     2419 non-null   int64 
 1   year         2419 non-null   int64 
 2   title        2419 non-null   object
 3   paper_text   2419 non-null   object
 4   author_id    2419 non-null   object
 5   author_name  2419 non-null   object
dtypes: int64(2), object(4)
memory usage: 113.5+ KB


## Author Extraction

In [44]:
cleaned_data

Unnamed: 0,paper_id,year,title,paper_text,author_id,author_name
0,4857,2013,Scalable Influence Estimation in Continuous-Ti...,Scalable Influence Estimation in\nContinuous-T...,"[2173, 3161, 5356, 6268]","[Hongyuan Zha, Le Song, Nan Du, Manuel Gomez R..."
1,4858,2013,Adaptive Anonymity via,Adaptive Anonymity via b-Matching\n\nKrzysztof...,"[1766, 6269, 6270]","[Tony Jebara, Krzysztof M. Choromanski, Kui Tang]"
2,4859,2013,Exact and Stable Recovery of Pairwise Interact...,Exact and Stable Recovery of Pairwise Interact...,"[3777, 3779, 6271, 6272]","[Zenglin Xu, Irwin King, Shouyuan Chen, Michae..."
3,4860,2013,Matrix factorization with binary components,Matrix factorization with Binary Components\n\...,"[2732, 4931, 6273]","[Matthias Hein, Martin Slawski, Pavlo Lutsik]"
4,4861,2013,On the Complexity and Approximation of Binary ...,On the Complexity and Approximation of\nBinary...,"[2410, 6274]","[Adnan Darwiche, Guy Van den Broeck]"
...,...,...,...,...,...,...
2414,7280,2017,"On Separability of Loss Functions, and Revisit...","On Separability of Loss Functions, and Revisit...","[6369, 7267, 10437]","[Pradeep K. Ravikumar, Adarsh Prasad, Alexandr..."
2415,7281,2017,Maxing and Ranking with Few Assumptions,Maxing and Ranking with Few Assumptions\nMoein...,"[521, 8683, 10376, 10426, 10425]","[Alon Orlitsky, Moein Falahatgar, Venkatadheer..."
2416,7282,2017,On clustering network-valued data,On clustering network-valued data\n\nSoumendu ...,"[6868, 10427, 10428]","[Purnamrita Sarkar, Soumendu Sundar Mukherjee,..."
2417,7283,2017,A General Framework for Robust Interactive Lea...,A General Framework for Robust Interactive\nLe...,"[8662, 10429]","[David Kempe, Ehsan Emamjomeh-Zadeh]"


In [45]:
# One list of author names per paper, in the row order of `cleaned_data`.
authors_list = cleaned_data['author_name'].values.tolist()

In [49]:
# Map every author name to the list of paper positions (indices into
# `authors_list`) that the author contributed to.
author2doc = {}

for paper_index, author_array in enumerate(authors_list):
    for author_name in author_array:
        # setdefault creates the empty list the first time an author is seen.
        author2doc.setdefault(author_name, []).append(paper_index)

## Pre-processing

#### STOP WORDS creation
The initial stop words are the union of NLTK's English stop-word list and scikit-learn's `ENGLISH_STOP_WORDS`.

In [26]:
import nltk
# Fetch the NLTK stop-word corpus (reported as already up-to-date when present).
nltk.download('stopwords')
from nltk.corpus import stopwords

# Import from the public `text` module; `_stop_words` is a private module whose
# location is not part of scikit-learn's public API.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS # Total 318 words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# De-duplicated NLTK list and the scikit-learn list, each as a plain list.
nltk_stop_words = list({*stopwords.words('english')})
sk_stop_words = [*ENGLISH_STOP_WORDS]

# Union of both sources with duplicates removed (list order is arbitrary).
STOP_WORDS = list({*nltk_stop_words, *sk_stop_words})

In [28]:
# STOP_WORDS += ["use", "think", "thanks", "know", "like", "make", "say", "time", "use", "need", "want", "come" ]

#### Lemmatizer
A lemmatizer reduces text ambiguity by converting different forms of a word to a common base form; for example, "bicycle" and "bicycles" both become "bicycle". This shrinks the vocabulary of the text and helps produce more accurate features for training. The cleaner the data, the more accurate the machine learning model will be. Lemmatizing with NLTK also saves memory and computational cost, because fewer distinct words have to be handled.

In [29]:
import nltk
nltk.download('punkt')
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger') # need for pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [30]:
# Shared lemmatizer instance used by convertWordIntoLemmatizeWord.
lemmatizer = WordNetLemmatizer()

# POS tag map: first letter of the tag returned by `pos_tag` -> WordNet POS
# constant. Any other first letter falls back to NOUN.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)


def convertWordIntoLemmatizeWord(words):
    """Lemmatize a list of tokens using their part-of-speech tags.

    Args:
        words: list of token strings.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    lemmas = []
    for word, tag in pos_tag(words):
        # Only the first letter of the tag selects the WordNet POS.
        wordnet_pos = tag_map[tag[0]]
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return lemmas

#### Creating a pre-process function
* Remove numbers. ✅
* Convert word into lowercase word. ✅
* Remove all stop words. ✅
* Remove all punctuations. ✅
* Some white spaces may be added to the list of words, due to the translate function & nature of our documents. Remove them as well. ✅
* Remove just-numeric strings. ✅
* Lemmatize. ✅
* Remove words with only 2 characters or less. [Low frequency] ✅
* Remove words with more than 20 characters. [High frequency] ✅

In [31]:
def preprocess(words):
    """Clean and lemmatize the tokens of a single document.

    Steps, in order: strip digits, lowercase, strip ASCII punctuation, drop
    empty tokens, drop digit-only tokens, drop stop words, lemmatize, drop
    stop words again, and finally keep only tokens of 3 to 20 characters.

    Args:
        words: list of token strings.

    Returns:
        A new list of cleaned tokens; the input list is not modified.
    """
    # STOP_WORDS is a list, so `word not in STOP_WORDS` would scan it for
    # every token. Build a set once per call for constant-time lookups.
    stop_words = frozenset(STOP_WORDS)
    # Translation table that deletes every character in string.punctuation.
    table = str.maketrans('', '', punctuation)

    # Remove numbers, normalize case, then remove punctuation.
    words = [re.sub(r"\d+", "", word).lower().translate(table) for word in words]
    # Stripping digits/punctuation can leave empty strings behind; drop them.
    words = [word for word in words if word]
    # Remove just-numeric strings (digit-like characters `\d` does not match).
    words = [word for word in words if not word.isdigit()]
    # Remove all stop words
    words = [word for word in words if word not in stop_words]
    # Lemmatize
    words = convertWordIntoLemmatizeWord(words)
    # Lemmatization can turn a token into a stop word, so filter once more.
    words = [word for word in words if word not in stop_words]
    # Remove words with less than 3 characters and more than 20 characters
    words = [word for word in words if 2 < len(word) <= 20]
    return words

In [32]:
def preprocess_documents(docs, verbose=False):
    """Tokenize and preprocess every document in `docs`.

    Args:
        docs: iterable of raw document strings.
        verbose: when true, print a 1-based "Item: N" line for each document
            before it is processed.

    Returns:
        A list with one entry per document, each a list of cleaned tokens.
    """
    preprocessed_docs = []
    for position, doc in enumerate(docs, start=1):
        if verbose:
            print(f"Item: {position}")
        tokens = word_tokenize(doc)
        preprocessed_docs.append(preprocess(tokens))
    return preprocessed_docs

In [33]:
# Raw full text of every paper, in the row order of `cleaned_data`.
paper_text_list = cleaned_data['paper_text'].values.tolist()

In [34]:
# Tokenize and clean every paper; the second argument (verbose=True) prints an
# "Item: N" progress line per document.
preprocessed_paper_text_list = preprocess_documents(paper_text_list, True)

Item: 1
Item: 2
Item: 3
Item: 4
Item: 5
Item: 6
Item: 7
Item: 8
Item: 9
Item: 10
Item: 11
Item: 12
Item: 13
Item: 14
Item: 15
Item: 16
Item: 17
Item: 18
Item: 19
Item: 20
Item: 21
Item: 22
Item: 23
Item: 24
Item: 25
Item: 26
Item: 27
Item: 28
Item: 29
Item: 30
Item: 31
Item: 32
Item: 33
Item: 34
Item: 35
Item: 36
Item: 37
Item: 38
Item: 39
Item: 40
Item: 41
Item: 42
Item: 43
Item: 44
Item: 45
Item: 46
Item: 47
Item: 48
Item: 49
Item: 50
Item: 51
Item: 52
Item: 53
Item: 54
Item: 55
Item: 56
Item: 57
Item: 58
Item: 59
Item: 60
Item: 61
Item: 62
Item: 63
Item: 64
Item: 65
Item: 66
Item: 67
Item: 68
Item: 69
Item: 70
Item: 71
Item: 72
Item: 73
Item: 74
Item: 75
Item: 76
Item: 77
Item: 78
Item: 79
Item: 80
Item: 81
Item: 82
Item: 83
Item: 84
Item: 85
Item: 86
Item: 87
Item: 88
Item: 89
Item: 90
Item: 91
Item: 92
Item: 93
Item: 94
Item: 95
Item: 96
Item: 97
Item: 98
Item: 99
Item: 100
Item: 101
Item: 102
Item: 103
Item: 104
Item: 105
Item: 106
Item: 107
Item: 108
Item: 109
Item: 110
Item: 11

In [35]:
preprocessed_text_path = f"preprocessed_text/{len(preprocessed_paper_text_list)}"
# preprocessed_text_path = f"preprocessed_text/7238"

In [36]:
# Serialize and save to a file
preprocessed_text_file = f"{preprocessed_text_path}.pkl"
# Create the target directory first so the result of the (slow) preprocessing
# run is not lost to a FileNotFoundError when the directory does not exist yet.
Path(preprocessed_text_file).parent.mkdir(parents=True, exist_ok=True)
with open(preprocessed_text_file, 'wb') as f:
    pickle.dump(preprocessed_paper_text_list, f)

In [37]:
# Load the array back
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickle files that were produced locally by this notebook.
with open(f"{preprocessed_text_path}.pkl", 'rb') as f:
    loaded_preprocessed_paper_text_list = pickle.load(f)

In [38]:
print(type(loaded_preprocessed_paper_text_list[0]))

<class 'list'>


In [109]:
dictionary = corpora.Dictionary(loaded_preprocessed_paper_text_list)

In [110]:
print(len(dictionary))

112233


In [111]:
# Prune the vocabulary in place: keep only tokens that occur in at least 50
# documents and in no more than 50% of all documents (112233 -> 4491 tokens
# according to the printed dictionary sizes).
dictionary.filter_extremes(no_below=50, no_above=0.5)

In [112]:
print(len(dictionary))

4491


In [113]:
bow_corpus = [dictionary.doc2bow(doc) for doc in loaded_preprocessed_paper_text_list]

## Model Training

In [114]:
tfidf = models.TfidfModel(bow_corpus)

In [115]:
corpus_tfidf = tfidf[bow_corpus]

In [119]:
# Initialize alpha, beta, a, and b if different from the defaults
alpha_init = 0.1
beta_init = 0.1
a_init = 0.1
b_init = 0.1

num_topics = 10

iterations = 100

In [120]:
# Instantiate the ADMAGD model from the TF-IDF corpus, the pruned dictionary
# and the author -> document-index mapping.
# NOTE(review): `corpus_tfidf` holds TF-IDF weights rather than raw term
# counts — confirm that ADMAGD's sampler expects weighted input.
train_model = model.ADMAGD(
    corpus=corpus_tfidf,
    num_topics=num_topics,
    id2word=dictionary,
    authors=author2doc,
    alpha_init=alpha_init,
    beta_init=beta_init,
    a_init=a_init,
    b_init=b_init,
)

In [121]:
# Run Gibbs sampling
train_model.gibbs_sampling(iterations=iterations)

iteration: 1


## Store Model

### Pickle

In [None]:
import pickle

In [None]:
# Name the artifact after the corpus size and the number of Gibbs iterations.
# Use the token list the dictionary and corpus were built from
# (`loaded_preprocessed_paper_text_list`): it is also defined when the
# preprocessing cell was skipped and the tokens were loaded from the pickle.
model_file_name = f"nips_{len(loaded_preprocessed_paper_text_list)}_iteration_{iterations}"

Save the model to a file

In [None]:
# NOTE(review): the directory name contains a space ("trained_ model"); it is
# kept unchanged here — confirm that the space is intentional.
model_pickle_file = f"trained_ model/{model_file_name}.pkl"
# Create the output directory first so the trained model is not lost to a
# FileNotFoundError after a long sampling run.
Path(model_pickle_file).parent.mkdir(parents=True, exist_ok=True)
with open(model_pickle_file, 'wb') as f:
    pickle.dump(train_model, f)

### joblib

In [None]:
from joblib import dump

Save the model to a file

In [None]:
# NOTE(review): the directory name contains a space ("trained_ model"); it is
# kept unchanged here — confirm that the space is intentional.
model_joblib_file = f"trained_ model/{model_file_name}.joblib"
# Create the output directory first so the dump cannot fail on a missing folder.
Path(model_joblib_file).parent.mkdir(parents=True, exist_ok=True)
dump(train_model, model_joblib_file)