In [1]:
import ast
import math
import string

import csvkit
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# One-time setup: download the NLTK resources this notebook relies on.
# Once they are present locally this loop does not need to be run again.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

def clean(text):
    """Lower-case ``text`` and drop every character not in ``string.printable``.

    Lower-casing happens first, so the printable filter is applied to the
    already lower-cased characters.
    """
    allowed = set(string.printable)
    return "".join(ch for ch in text.lower() if ch in allowed)

In [4]:
def _lemmatize_tokens(tokens):
    """Lemmatize ``tokens``; tokens tagged as adjectives are lemmatized with pos="a"."""
    lemmatizer = WordNetLemmatizer()
    adjective_tags = {'JJ', 'JJR', 'JJS'}
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        if tag in adjective_tags:
            lemmas.append(str(lemmatizer.lemmatize(token, pos="a")))
        else:
            lemmas.append(str(lemmatizer.lemmatize(token)))
    return lemmas


def _load_stopwords(path):
    """Return the stripped lines of the stopword file at ``path``."""
    # `with` guarantees the handle is closed even if reading fails.
    with open(path, "r") as stopword_file:
        return [str(line.strip()) for line in stopword_file]


def _cooccurrence_weights(processed_text, word_index, window_size):
    """Build the co-occurrence matrix over a sliding window of ``window_size`` words.

    Each window is visited once. For every ordered pair of distinct words in the
    window, 1/distance between their first positions in that window is added to
    the edge, and each pair of absolute positions is counted only once.
    """
    vocab_len = len(word_index)
    weighted_edge = np.zeros((vocab_len, vocab_len), dtype=np.float32)
    covered_cooccurrences = set()

    # The last window starts at len - window_size (inclusive). A text shorter
    # than the window is treated as one single, truncated window.
    last_start = max(len(processed_text) - window_size, 0)
    for window_start in range(last_start + 1):
        window = processed_text[window_start:window_start + window_size]

        # Absolute position of the first occurrence of each word in this window.
        first_position = {}
        for offset, word in enumerate(window):
            first_position.setdefault(word, window_start + offset)

        for word_a, pos_a in first_position.items():
            for word_b, pos_b in first_position.items():
                if word_a == word_b or (pos_a, pos_b) in covered_cooccurrences:
                    continue
                weighted_edge[word_index[word_a]][word_index[word_b]] += 1 / abs(pos_a - pos_b)
                covered_cooccurrences.add((pos_a, pos_b))
    return weighted_edge


def _textrank_scores(weighted_edge, d=0.85, max_iterations=50, threshold=0.0001):
    """Iterate the damped score update until the total change is <= ``threshold``."""
    vocab_len = weighted_edge.shape[0]
    score = np.ones(vocab_len, dtype=np.float32)
    inout = weighted_edge.sum(axis=1)

    for _ in range(max_iterations):
        prev_score = np.copy(score)
        for i in range(vocab_len):
            # Scores are updated in place, so later words see the new values
            # of earlier words within the same sweep.
            linked = weighted_edge[i] != 0
            summation = np.sum(weighted_edge[i][linked] / inout[linked] * score[linked])
            score[i] = (1 - d) + d * summation
        if np.sum(np.fabs(prev_score - score)) <= threshold:  # convergence condition
            break
    return score


def _candidate_phrases(lemmatized_text, stopwords):
    """Split the text at stopwords into unique phrases (each a list of words).

    A single-word phrase is dropped when its word also appears inside a
    multi-word phrase.
    """
    phrases = []
    current = []
    for word in lemmatized_text:
        if word in stopwords:
            if current:
                phrases.append(current)
            current = []
        else:
            current.append(word)
    if current:
        # Flush the phrase that runs to the end of the text; titles rarely
        # end with a stopword or punctuation mark.
        phrases.append(current)

    unique_phrases = []
    for phrase in phrases:
        if phrase not in unique_phrases:
            unique_phrases.append(phrase)

    # Built as a new list rather than removing items from the list being iterated.
    words_in_longer_phrases = {word for phrase in unique_phrases if len(phrase) > 1 for word in phrase}
    return [
        phrase for phrase in unique_phrases
        if len(phrase) > 1 or phrase[0] not in words_in_longer_phrases
    ]


def extract_key(input_text, top_n=5, window_size=3, stopword_path="long_stopwords.txt"):
    """Extract the highest-scoring key phrases from ``input_text``.

    The text is cleaned, tokenized and lemmatized. Words whose POS tag is not in
    the wanted set, punctuation characters and the entries of the stopword file
    are treated as stopwords. The remaining words are scored on a co-occurrence
    graph, and each phrase (a run of consecutive non-stopwords) is scored as the
    sum of its word scores.

    Parameters
    ----------
    input_text : str
        Text to extract key phrases from.
    top_n : int, optional
        Maximum number of phrases to return (default 5).
    window_size : int, optional
        Width of the co-occurrence window, in words (default 3).
    stopword_path : str, optional
        File with one stopword per line (default "long_stopwords.txt").

    Returns
    -------
    list of str
        At most ``top_n`` phrases, highest score first; ties keep text order.

    Raises
    ------
    OSError
        If the stopword file cannot be opened.
    """
    tokens = word_tokenize(clean(input_text))
    lemmatized_text = _lemmatize_tokens(tokens)

    wanted_pos = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBG', 'FW'}
    stopwords = {word for word, tag in nltk.pos_tag(lemmatized_text) if tag not in wanted_pos}
    stopwords.update(string.punctuation)  # adds each punctuation character
    stopwords.update(_load_stopwords(stopword_path))

    processed_text = [word for word in lemmatized_text if word not in stopwords]
    # dict.fromkeys keeps first-seen order, so the vocabulary is deterministic.
    vocabulary = list(dict.fromkeys(processed_text))
    word_index = {word: position for position, word in enumerate(vocabulary)}

    weighted_edge = _cooccurrence_weights(processed_text, word_index, window_size)
    score = _textrank_scores(weighted_edge)

    phrases = _candidate_phrases(lemmatized_text, stopwords)
    keywords = [" ".join(phrase) for phrase in phrases]
    phrase_scores = [sum(score[word_index[word]] for word in phrase) for phrase in phrases]

    # Rank by score so the returned phrases are the best ones, not merely the first ones.
    ranking = sorted(range(len(keywords)), key=phrase_scores.__getitem__, reverse=True)
    return [keywords[k] for k in ranking[:top_n]]

In [5]:
# Drop repeated articles from the original dataset.
# Only back-to-back repeats are removed: a row is dropped when its title equals
# the title of the row directly above it.
dt = pd.read_csv("0_arxiv_affil.csv")
# Vectorized comparison with the previous row replaces a per-row DataFrame.drop,
# which copied the whole frame on every duplicate.
is_repeat = dt['title'] == dt['title'].shift()
dt = dt[~is_repeat]
dt.to_csv("1_arxiv_affil_no_repli.csv", index=False)

In [6]:
# Extract keywords from every title and store them in a new 'keyword' column.
dt = pd.read_csv("1_arxiv_affil_no_repli.csv")
data_keyword = []
for count, title in enumerate(dt['title'], start=1):
    data_keyword.append(extract_key(title))
    print(count)  # progress counter
dt['keyword'] = data_keyword
dt.to_csv("2_output.csv", index=None)

In [7]:
# Drop the rows whose keyword list is empty.
data = pd.read_csv("2_output.csv")
# The keyword lists were written to CSV as text, so an empty list reads back as
# the string "[]". The filter must run on the re-read `data`: the in-memory
# `dt` still holds real list objects, which never compare equal to "[]".
data = data[data['keyword'] != "[]"]
data.to_csv("3_output_noNone.csv", index=False)

In [80]:
# Add a sequential id (0, 1, 2, ...) to the filtered table.
data = pd.read_csv("3_output_noNone.csv")
# The id goes on the frame that was just read; writing the older in-memory `dt`
# instead would discard the filtering done in the previous step.
data['id'] = range(len(data))
data.to_csv("4_output_withid.csv", index=False)

In [81]:
# Matching part: reload the table that carries the id column.
data = pd.read_csv("4_output_withid.csv")

In [89]:
# Alias, not a copy: `dt` and `data` refer to the same DataFrame object.
dt = data

In [90]:
def MD(array1, array2):
    """Return the degree to which ``array1`` matches ``array2``.

    The result is the fraction of distinct items of ``array2`` that also occur
    in ``array1``, a value between 0.0 and 1.0. An empty ``array2`` yields 0.0
    instead of raising ``ZeroDivisionError``.
    """
    reference = set(array2)
    if not reference:
        return 0.0
    return len(set(array1) & reference) / len(reference)

In [91]:
def takeSecond(elem):
    """Sort key: return the item at index 1 of ``elem``."""
    second = elem[1]
    return second

In [None]:
# For every paper, collect up to TOP_RELATED papers of the same category whose
# keywords overlap its own. Each entry is [row position, match degree].
MATCH_THRESHOLD = 0.5  # minimum MD() value for a paper to count as related
CANDIDATE_LIMIT = 5    # stop scanning once more than this many candidates are found
TOP_RELATED = 3        # number of related papers kept per paper


def _as_keyword_list(value):
    """Return a keyword cell as a list of strings.

    After a CSV round trip the cell holds text such as "['a', 'b']", which is
    parsed with ast.literal_eval. Missing values become an empty list.
    """
    if isinstance(value, str):
        return list(ast.literal_eval(value))
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return []


# Pull the columns out once instead of calling dt.iloc inside the inner loop.
keyword_lists = [_as_keyword_list(value) for value in dt["keyword"]]
categories = dt["categories"].tolist()
paper_count = len(dt)  # cover every row so the new column matches the frame length

relatePaper = []
for index in range(paper_count):
    this_paper = keyword_lists[index]
    this_field = categories[index]
    this_rlpp = []
    if this_paper:  # a paper without keywords cannot be matched against
        for i in range(paper_count):
            if i != index and categories[i] == this_field:
                # Compare keyword lists, not raw title or keyword strings:
                # set() of a string would be a set of single characters.
                match_degree = MD(keyword_lists[i], this_paper)
                if match_degree >= MATCH_THRESHOLD:
                    this_rlpp.append([i, match_degree])
            if len(this_rlpp) > CANDIDATE_LIMIT:
                break
    this_rlpp.sort(key=takeSecond, reverse=True)
    relatePaper.append(this_rlpp[:TOP_RELATED])
dt['relatePaper'] = relatePaper

In [94]:
# Write the table, including the relatePaper column, without the index.
dt.to_csv(
    "final_output.csv",
    index=None,
)