In [None]:
import json
import os
from os.path import join
import csv
import random
from lxml import etree
from fuzzywuzzy import fuzz
import re
from collections import defaultdict as dd
from tqdm import tqdm

import utils

import logging

import numpy as np

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import ast

import glob

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')  # include timestamp

random.seed(1)

import settings
dir_path = settings.DIR_PATH

### Processing Open Academic Papers Dataset

In [None]:
# Load the DBLP citation-network dump (one JSON object per line) into a dict
# keyed by paper id.
paper_dict_open = {}
dblp_fname = "DBLP-Citation-network-V15.1.json"
with open(join(dir_path, dblp_fname), "r", encoding="utf-8") as myFile:
    for i, line in enumerate(myFile):
        # Lines of at most two characters cannot hold a record; skip them.
        if len(line) <= 2:
            continue
        if i % 10000 == 0: 
            logger.info("reading papers %d", i)
        paper_tmp = json.loads(line.strip())
        paper_dict_open[paper_tmp["id"]] = paper_tmp

import pandas as pd
# The dict-of-dicts constructor yields one column per paper; transpose so
# that every paper becomes a row, then discard the id-based index (the id is
# still available as a regular column).
temp = pd.DataFrame(paper_dict_open)
temp = temp.T.reset_index(drop = True)

dblp_df = temp.copy()

# Same procedure for the OAG publication dump. The variable names are reused,
# so `paper_dict_open` now refers to the OAG records only.
paper_dict_open = {}
dblp_fname = "v3.1_oag_publication_14.json"
with open(join(dir_path, dblp_fname), "r", encoding="utf-8") as myFile:
    for i, line in enumerate(myFile):
        if len(line) <= 2:
            continue
        if i % 10000 == 0: 
            logger.info("reading papers %d", i)
        paper_tmp = json.loads(line.strip())
        paper_dict_open[paper_tmp["id"]] = paper_tmp

import pandas as pd
temp = pd.DataFrame(paper_dict_open)
temp = temp.T.reset_index(drop = True)

oag_df = temp.copy()

# Ids that appear in OAG but not in DBLP.
s1 = set(oag_df.id)
s2 = set(dblp_df.id)
d = s1 - s2

del temp

# Append only the OAG-exclusive papers so that ids already present in DBLP
# are not duplicated, then persist the combined table for the later cells.
# NOTE(review): the path is built with `+`, so dir_path is presumably expected
# to end with a path separator - confirm against settings.DIR_PATH.
filtered_oag_df = oag_df[oag_df['id'].isin(d)]
dblp_df = pd.concat([dblp_df,filtered_oag_df])
dblp_df.to_pickle(dir_path + 'OAG_DBLP-Citation-network.pkl')

del oag_df


### parse XML dataset

In [None]:
def parse_metadata(root, namespaces):
    """Return the abstract text of a TEI document.

    Args:
        root: Root element of the parsed TEI tree.
        namespaces: Prefix-to-URI mapping that defines the ``tei`` prefix.

    Returns:
        All text fragments below ``profileDesc/abstract`` joined with single
        spaces, or the string "No abstract found" when the element is absent.
    """
    node = root.find('.//tei:profileDesc/tei:abstract', namespaces=namespaces)
    if node is None:
        return "No abstract found"
    return ' '.join(node.itertext())

def parse_references(root, namespaces):
    """Collect the bibliography entries of a TEI document.

    Args:
        root: Root element of the parsed tree; must support ``xpath``.
        namespaces: Prefix-to-URI mapping that defines the ``tei`` prefix.

    Returns:
        A dict keyed by the ``xml:id`` of each ``biblStruct`` (empty string
        when the attribute is missing). Each value holds the main ``title``
        ("Unknown Title" when the title element has no text) and ``authors``,
        a comma-separated string of "forename surname" names. Entries without
        a main title element are left out.
    """
    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'
    references = {}
    for bibl in root.xpath('.//tei:biblStruct', namespaces=namespaces):
        main_titles = bibl.xpath('.//tei:title[@type="main"]', namespaces=namespaces)
        author_names = [
            ' '.join(
                person.xpath('.//tei:persName/tei:forename/text()', namespaces=namespaces)
                + person.xpath('.//tei:persName/tei:surname/text()', namespaces=namespaces)
            )
            for person in bibl.xpath('.//tei:author', namespaces=namespaces)
        ]
        if not main_titles:
            continue
        title_text = main_titles[0].text
        references[bibl.get(xml_id_attr, '')] = {
            'title': title_text if title_text else "Unknown Title",
            'authors': ', '.join(author_names),
        }
    return references

def _citation_offset(parent, citation):
    """Return where *citation*'s text starts inside ''.join(parent.itertext()).

    The offset is the length of everything that precedes the citation: the
    parent's leading text plus the text and tail of every earlier sibling.
    Returns None when *citation* is not a direct child of *parent*.
    """
    offset = len(parent.text or '')
    for sibling in parent:
        if sibling is citation:
            return offset
        # Comments / processing instructions contribute only their tail.
        if isinstance(sibling.tag, str):
            offset += len(''.join(sibling.itertext()))
        offset += len(sibling.tail or '')
    return None


def extract_citations(root, namespaces, references):
    """Extract every in-text bibliographic citation together with its context.

    Walks the tree in document order, remembering the text of the most recent
    ``head`` element as the current section (initially "Introduction"), and
    records every ``ref`` element with ``type="bibr"``.

    Args:
        root: Root element of the parsed TEI tree.
        namespaces: Unused; kept so the signature matches the other parsers.
        references: Mapping from reference id to a dict with ``title`` and
            ``authors``, as produced by ``parse_references``.

    Returns:
        A tuple ``(context_data, citation_frequency, section_titles)``:
        ``context_data`` is a list with one dict per citation whose target is
        in *references* (running citation number, reference title/authors, up
        to 50 characters of text before and after the marker, and the
        section); ``citation_frequency`` maps every cited target to its number
        of occurrences; ``section_titles`` is always an empty dict.
    """
    context_data = []
    citation_frequency = {}
    section_titles = {}

    current_section = "Introduction"
    for elem in root.iter():
        tag = elem.tag
        # Comments and processing instructions expose a function as their
        # tag; they are neither headings nor citations.
        if not isinstance(tag, str):
            continue

        if tag.endswith('head'):
            current_section = ''.join(elem.itertext()).strip()

        if not (tag.endswith('ref') and elem.get('type') == 'bibr'):
            continue

        target = elem.get('target').strip('#') if elem.get('target') else None
        if not target:
            continue
        citation_frequency[target] = citation_frequency.get(target, 0) + 1

        if target not in references:
            continue
        citation_info = references[target]

        pre_text = "Text not found"
        post_text = "Text not found"
        parent_paragraph = elem.getparent()
        if parent_paragraph is not None:
            parent_text = ''.join(parent_paragraph.itertext())
            elem_text = ''.join(elem.itertext())
            # Locate this exact element rather than the first occurrence of
            # its text, so repeated markers such as "[1]" get their own
            # context instead of all sharing the first one.
            start = _citation_offset(parent_paragraph, elem)
            if start is None:
                start = parent_text.find(elem_text)
            if start >= 0:
                end = start + len(elem_text)
                pre_text = parent_text[max(0, start - 50):start].strip()
                post_text = parent_text[end:end + 50].strip()

        context_data.append({
            'Citation Number': citation_frequency[target],
            'ref_title': citation_info['title'],
            'ref_authors': citation_info['authors'],
            'Pre Text': pre_text,
            'Post Text': post_text,
            'Section': current_section
        })

    return context_data, citation_frequency, section_titles


def aggregate_data(xml_list, dir_path):
    """Parse a list of TEI XML files into one citation-context DataFrame.

    Args:
        xml_list: Paths of the XML files to process.
        dir_path: Unused; kept for interface compatibility.

    Returns:
        A DataFrame with one row per citation context found by
        ``extract_citations``, plus an ``_id`` column holding the file's base
        name with its last four characters (the extension) removed. Files
        that are not well-formed XML are reported on stdout and skipped.
    """
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    rows = []

    for xml_path in tqdm(xml_list, desc="Processing XML files"):
        try:
            root = etree.parse(xml_path).getroot()

            # The abstract is parsed but not used; the call is kept so the
            # set of documents that can raise stays the same.
            abstract = parse_metadata(root, namespaces)
            references = parse_references(root, namespaces)
            context_data, citation_frequency, section_titles = extract_citations(root, namespaces, references)

            # Tag each context with the paper it came from.
            for data in context_data:
                data['_id'] = os.path.basename(xml_path[:-4])
            rows.extend(context_data)

        except etree.XMLSyntaxError as e:
            print(f"Error parsing {xml_path}: {str(e)}")

    return pd.DataFrame(rows)


# Define the directory path and get the list of XML files
xml_list = glob.glob(os.path.join(dir_path, 'paper-xml', '*.xml'))

# Process all XML files and print the aggregated DataFrame
df_aggregated = aggregate_data(xml_list, dir_path)
print(df_aggregated.head())

# Count the context rows per (reference title, paper); the non-null count of
# the 'Section' column serves as the citation count.
temp = df_aggregated.groupby(['ref_title','_id']).count().reset_index()[['ref_title','_id','Section']]

temp.columns = ['ref_title','_id','citation_cnt']
# Attach that count to every context row.
temp = df_aggregated.merge(temp,on = ['ref_title','_id'],how = 'left')

# Grouping by every descriptive column collapses context rows that are exact
# duplicates; the only remaining column, 'Citation Number', is averaged.
temp = temp.groupby(['citation_cnt', 'ref_title', 'ref_authors', 'Pre Text', 'Post Text','Section','_id']).mean().reset_index()

# One row per (paper, reference title); textual fields of the individual
# contexts are concatenated with single spaces.
# NOTE(review): every row already carries the per-(paper, title) total in
# citation_cnt, so 'sum' multiplies that total by the number of distinct
# contexts - confirm this is intended.
df_aggregated = temp.groupby(['_id', 'ref_title']).agg({
    'citation_cnt': 'sum',
    'Citation Number': 'mean', 
    'ref_authors': ' '.join,
    'Pre Text': ' '.join,
    'Post Text': ' '.join,
    'Section': ' '.join 
}).reset_index()

# Intermediate artifact; the misspelled file name is kept as-is because it is
# part of the on-disk interface.
df_aggregated.to_pickle(dir_path + 'tempolary_xml_ref.pkl')

In [None]:
def parse_xml(xml_path):
    """Extract the main fields of one TEI XML paper.

    Args:
        xml_path: Path of the XML file.

    Returns:
        A tuple ``(_id, title, abstract, body, authors, references,
        publication_date, venue)`` where

        * ``_id`` is the file's base name up to its first dot,
        * ``title`` / ``abstract`` / ``body`` are text, with a "No ... found"
          placeholder when the element is missing,
        * ``authors`` is a list of ``(name, email)`` tuples,
        * ``references`` is a list of dicts (``title``, ``authors``,
          ``valid``) positioned by the numeric part of their ``bibN`` id
          minus one; positions without a parsed entry hold a placeholder with
          ``valid=False``,
        * ``publication_date`` is the first ``when`` attribute of any ``date``
          element, or "No date found",
        * ``venue`` is the text of the first ``monogr/title`` element, or
          "No venue found".

    Raises:
        lxml.etree.XMLSyntaxError: If the file is not well-formed XML.
    """
    tree = etree.parse(xml_path)
    root = tree.getroot()

    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    _id = os.path.basename(xml_path).split('.')[0]

    # Extract the title
    title = root.find('.//tei:titleStmt/tei:title', namespaces=ns)
    title_text = title.text if title is not None else "No title found"

    abstract = root.find('.//tei:profileDesc/tei:abstract', namespaces=ns)
    if abstract is not None:
        abstract_text = ' '.join(abstract.itertext())
    else:
        abstract_text = "No abstract found"

    body = root.find('.//tei:text/tei:body', namespaces=ns)
    if body is not None:
        body_text = ' '.join(body.itertext())
    else:
        body_text = "No body text found"

    authors = []
    for author in root.findall('.//tei:sourceDesc//tei:author', namespaces=ns):
        forenames = author.xpath('.//tei:persName/tei:forename/text()', namespaces=ns)
        surnames = author.xpath('.//tei:persName/tei:surname/text()', namespaces=ns)
        name = ' '.join(forenames + surnames)
        email = author.xpath('.//tei:email/text()', namespaces=ns)
        email_text = email[0] if email else "No email provided"
        authors.append((name, email_text))

    max_index = 0
    reference_dict = {}

    for bibl in root.findall('.//tei:listBibl/tei:biblStruct', namespaces=ns):
        xml_id = bibl.get("{http://www.w3.org/XML/1998/namespace}id")
        if not (xml_id and xml_id.startswith('bib')):
            continue
        try:
            index = int(xml_id[3:]) - 1
        except ValueError:
            # Ids such as "bibA" carry no position; skip instead of aborting
            # the whole document.
            continue
        if index < 0:
            # A negative position would silently overwrite an entry counted
            # from the end of the list.
            continue
        max_index = max(max_index, index)

        # An entry stays valid only if it has a title and every listed
        # persName carries at least one name part.
        valid_entry = True

        ref_title = bibl.find('.//tei:title[@type="main"]', namespaces=ns)
        if ref_title is not None and ref_title.text:
            ref_title_text = ref_title.text
        else:
            ref_title_text = "No title"
            valid_entry = False

        ref_authors = []
        for ref_author in bibl.findall('.//tei:author/tei:persName', namespaces=ns):
            ref_forenames = ref_author.xpath('.//tei:forename/text()', namespaces=ns)
            ref_surnames = ref_author.xpath('.//tei:surname/text()', namespaces=ns)
            if ref_forenames or ref_surnames:
                ref_authors.append(' '.join(ref_forenames + ref_surnames))
            else:
                valid_entry = False

        reference_dict[index] = {'title': ref_title_text, 'authors': ref_authors, 'valid': valid_entry}

    # Build one independent placeholder per slot; multiplying a one-element
    # list would make every slot alias the same dict.
    references = [
        {'title': "No title", 'authors': [], 'valid': False}
        for _ in range(max_index + 1)
    ]
    for index, ref_info in reference_dict.items():
        references[index] = ref_info

    # Attempt to extract publication date
    date_nodes = root.xpath('.//*[local-name()="date"]')
    publication_date = "No date found"
    for date in date_nodes:
        if 'when' in date.attrib:
            publication_date = date.attrib['when']
            break  # Assumes first matching 'when' attribute is the correct one

    # Attempt to extract publication venue
    venue_node = root.xpath('.//*[local-name()="monogr"]/*[local-name()="title"]')
    venue_text = venue_node[0].text if venue_node else "No venue found"

    return _id, title_text, abstract_text, body_text, authors, references, publication_date, venue_text


xml_list = glob.glob(dir_path + 'paper-xml/*')

# Parse every paper once and collect one record per file. Building the
# DataFrame in a single step avoids re-copying the whole, growing frame on
# every iteration (which is quadratic in the number of files).
records = []
for xml_path in tqdm(xml_list):
    _id, title, abstract, body, authors, references, publication_date, venue_text = parse_xml(xml_path)
    records.append({
        '_id': _id,
        'title': title,
        'abstract': abstract,
        'body': body,
        'authors': authors,
        'references': references,
        'publication_date': publication_date,
        'venue_text': venue_text
    })

# A list of dicts is read row-wise, so list-valued fields (authors,
# references) stay intact inside single cells.
df_tot = pd.DataFrame(records)

# Intermediate artifact; the misspelled file name is kept as-is because it is
# part of the on-disk interface.
df_tot.to_pickle(dir_path + 'tempolary_xml.pkl')

## text matching using OAG datasets

In [None]:
# Build a matching key from the reference title: lower-case it and strip
# whitespace and punctuation so that formatting differences do not prevent a
# match.
# NOTE(review): after the first two passes only word characters other than
# '_' should remain, so the third pass looks like a no-op - confirm.
df_aggregated['ref_title_norm'] = df_aggregated['ref_title'].str.lower().str.replace(r'[\s\-_:.]', '', regex=True)
df_aggregated['ref_title_norm'] = df_aggregated['ref_title_norm'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
df_aggregated['ref_title_norm'] = df_aggregated['ref_title_norm'].str.lower().str.replace(r'[\s\-_:.\',]', '', regex=True)

dblp_df = pd.read_pickle(dir_path + 'OAG_DBLP-Citation-network.pkl')

# Apply the same normalisation to the DBLP titles. This overwrites the
# 'title' column in place, so from here on dblp_df['title'] holds the
# matching key rather than the readable title.
dblp_df['title'] = dblp_df['title'].str.lower().str.replace(r'[\s\-_:.]', '', regex=True)
dblp_df['title'] = dblp_df['title'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
dblp_df['title'] = dblp_df['title'].str.lower().str.replace(r'[\s\-_:.\',]', '', regex=True)

# Left join: every extracted reference is kept; a reference whose key matches
# several DBLP rows is duplicated once per match.
df = pd.merge(df_aggregated,dblp_df[['id', 'abstract','title','authors', 
                                     'venue', 'n_citation', 'keywords', 'year']],left_on = ['ref_title_norm'], right_on = ['title'],how = 'left')

# Prefix the DBLP columns with 'ref_' because they describe the cited paper.
df = df.rename(columns = {'abstract':'ref_abstract',
                            'venue':'ref_venue','id':'ref_id','year':'ref_year',
                            'n_citation':'ref_n_citation','keywords':'ref_keywords'})


# Split the comma-separated author string taken from the XML into a list of
# names; non-string values are passed through unchanged.
df['ref_authors_list'] = df['ref_authors'].apply(lambda x: [name.strip() for name in x.split(',')] if isinstance(x, str) else x)

def extract_names(authors_list):
    """Return the author names contained in a DBLP/OAG ``authors`` value.

    Args:
        authors_list: Either a list of author dicts (the form produced when
            the records are loaded from JSON) or a string holding such a list
            as JSON or as a Python literal.

    Returns:
        The ``name`` of every author dict that has one, in order. Anything
        that cannot be interpreted as a list of dicts (``None``, NaN,
        unparsable text, ...) yields an empty list.
    """
    parsed = authors_list
    if isinstance(authors_list, str):
        parsed = None
        # Try the lossless parsers first; the quote-replacement fallback is
        # only kept for single-quoted pseudo-JSON (e.g. containing ``null``),
        # because it corrupts names with apostrophes such as "O'Brien".
        attempts = (
            lambda text: json.loads(text),
            lambda text: ast.literal_eval(text),
            lambda text: json.loads(text.replace("'", '"')),
        )
        for attempt in attempts:
            try:
                parsed = attempt(authors_list)
                break
            except (ValueError, SyntaxError, TypeError):
                continue

    if not isinstance(parsed, (list, tuple)):
        return []
    return [
        author['name']
        for author in parsed
        if isinstance(author, dict) and 'name' in author
    ]

# Author names of the matched DBLP record, as a list, for comparison with the
# names extracted from the XML (ref_authors_list).
df['authors_list'] = df['authors'].apply(extract_names)

def calculate_jaccard(list1, list2):
    """Return the Jaccard similarity of two collections.

    Duplicates are ignored. The result is ``|A & B| / |A | B|``, or 0 when
    both collections are empty.
    """
    first, second = set(list1), set(list2)
    combined = first | second
    if not combined:
        return 0
    return len(first & second) / len(combined)

def select_most_similar(group):
    """Keep the candidate row whose author list best matches the reference.

    Args:
        group: DataFrame of candidate matches for one reference, with the
            columns ``ref_authors_list`` and ``authors_list``.

    Returns:
        *group* itself when it has a single row; otherwise a one-row
        DataFrame holding the row with the highest Jaccard similarity between
        the two author lists. Ties, and the case where no row scores above
        zero, resolve to the earliest row.
    """
    if len(group) == 1:
        return group

    best_label = group.index[0]
    best_score = 0
    candidates = zip(group.index, group['ref_authors_list'], group['authors_list'])
    for label, xml_authors, dblp_authors in candidates:
        score = calculate_jaccard(xml_authors, dblp_authors)
        # Strict comparison: an equal score never displaces an earlier row.
        if score > best_score:
            best_label, best_score = label, score
    return group.loc[[best_label]]

# Flag the (reference key, paper) pairs that have exactly one row, i.e. the
# title matched at most one DBLP record and needs no disambiguation.
cnt = df.groupby(['ref_title_norm', '_id']).count()
cnt = cnt.loc[cnt['ref_authors'] == 1].reset_index()[['ref_title_norm', '_id']]
cnt['r'] = 1
df = df.merge(cnt, on = ['ref_title_norm', '_id'], how = 'left')

# Unambiguous rows are set aside unchanged. Of the ambiguous ones, those whose
# normalised title is 'notitle' are discarded altogether.
remain_df = df.loc[(df['r']== 1)].copy()
df = df.loc[(df['r']!= 1) & (df['ref_title_norm']!= 'notitle')]

grouped = df.groupby(['ref_title_norm', '_id'])
processed_df = pd.DataFrame()
unprocessed_df = pd.DataFrame()

# For every ambiguous pair keep only the candidate with the best author
# overlap.
# NOTE(review): concatenating inside the loop re-copies the accumulated frame
# on each iteration; collecting the pieces in a list and concatenating once
# would scale better, but may affect dtypes - verify before changing.
for name, group in tqdm(grouped, desc="Processing groups"):
    if len(group) > 1:
        result = select_most_similar(group)
        processed_df = pd.concat([processed_df, result])
    else:
        # Single-row groups are collected here but are not part of final_df
        # below.
        unprocessed_df = pd.concat([unprocessed_df, group])

# Recombine and restore the original row order via the index.
final_df = pd.concat([processed_df, remain_df]).sort_index()


In [None]:
# Attach the citing paper's own metadata (joined on the XML file id), drop the
# helper columns used for matching and persist the result.
t = pd.merge(final_df,dblp_df[['id','keywords','year','n_citation','venue']],left_on = '_id', right_on = 'id', how = 'left')
t.drop(columns = {'ref_authors_list', 'authors_list', 'r','ref_title_norm','title'}).to_pickle(dir_path + 'xml_analysis_out_v2.pkl')

## creating dataset and hand-crafted features

In [None]:
# Reload the combined paper table from disk; this replaces the in-memory copy
# whose 'title' column was overwritten with normalised matching keys.
dblp_df = pd.read_pickle(dir_path + 'OAG_DBLP-Citation-network.pkl')

In [None]:
xml_analysis = pd.read_pickle(dir_path + 'xml_analysis_out_v2.pkl')

rulebase_label = pd.read_json(dir_path + 'paper_source_gen_by_rule.json')
valid_data = pd.read_json(dir_path + 'paper_source_trace_valid_wo_ans.json')
train_data = pd.read_json(dir_path + 'paper_source_trace_train_ans.json')
test_data = pd.read_json(dir_path + 'paper_source_trace_test_wo_ans.json')


# Remember which split every paper came from.
train_data['train_or_valid'] = 'train'
valid_data['train_or_valid'] = 'valid'
test_data['train_or_valid'] = 'test'

data = pd.concat([train_data,valid_data,test_data])
data = data[['_id','refs_trace','train_or_valid']].copy()

# Attach the labels (refs_trace) and the split name to every parsed reference.
xml_analysis = xml_analysis.merge(data, on = '_id', how = 'left')

# One row per (paper, listed reference id).
data = pd.concat([train_data,valid_data,test_data])
df_exploded = data.explode('references')[['_id','references','refs_trace','train_or_valid']]

t1 = xml_analysis[['_id','ref_id']].drop_duplicates()
df_exploded.columns = ['_id','ref_id','refs_trace','train_or_valid']

t1['flg'] = 1

# Keep only the listed references that the XML-based matching did not find
# (no flag after the left join).
t2 = df_exploded.merge(t1,on = ['_id','ref_id'],how = 'left')
t2 = t2.loc[t2['flg'] != 1]

# Metadata of the cited paper, looked up by reference id ...
t2 = pd.merge(t2,dblp_df[['id', 'abstract','title','authors', 
                                     'venue', 'n_citation', 'keywords', 'year']],left_on = ['ref_id'], right_on = ['id'],how = 'left')


t2 = t2.rename(columns = {'abstract':'ref_abstract','title':'ref_title','authors':'ref_authors',
                            'venue':'ref_venue','id':'_ref_id','year':'ref_year',
                            'n_citation':'ref_n_citation','keywords':'ref_keywords'})

# ... and metadata of the citing paper, looked up by paper id.
t2 = pd.merge(t2,dblp_df[['id', 'abstract','title','authors', 
                                     'venue', 'n_citation', 'keywords', 'year']],left_on = ['_id'], right_on = ['id'],how = 'left')

# Append the additional references, restricted to the columns that
# xml_analysis already has; columns missing from t2 become NaN in those rows.
t2_adjusted = t2[xml_analysis.columns.intersection(t2.columns)]
result = pd.concat([xml_analysis, t2_adjusted], ignore_index=True)

xml_analysis = result.copy()

xml_analysis['ref_title'] = xml_analysis['ref_title'].fillna('')
# NOTE(review): `df` is not created in this cell; this line relies on the
# frame left over from the text-matching cell - confirm it is still needed.
df['ref_title'] = df['ref_title'].fillna('')

# Sentinel strings so that the later substring test on refs_trace never sees
# NaN.
xml_analysis['ref_id'] = xml_analysis['ref_id'].fillna('_ref_idIsNaN')
xml_analysis['refs_trace'] = xml_analysis['refs_trace'].fillna('refs_traceIsNaN')

# The rule-based labels are read a second time and transposed, so that the
# former columns become rows.
rulebase_label = pd.read_json(dir_path + 'paper_source_gen_by_rule.json')

rulebase_label = rulebase_label.T

def left_align_row(row):
    """Move the non-missing values of *row* to the front.

    Args:
        row: A pandas Series.

    Returns:
        A list of the same length as *row*: its non-NaN values in their
        original order, followed by NaN padding.
    """
    values = row.dropna().tolist()
    values.extend([np.nan] * (len(row) - len(values)))
    return values

# Left-align every row of the transposed rule-label table so that its first
# non-missing value ends up at position 0.
df_aligned = rulebase_label.apply(left_align_row, axis=1)

# Turn the row labels into a regular column.
# NOTE(review): this assumes `apply` returned one list per row (a Series of
# lists) so that exactly two columns result - confirm.
df_aligned = pd.DataFrame(df_aligned).reset_index()

df_aligned.columns = ['_id', 'annotation_ref']

# Keep only the first annotated reference of every paper.
df_aligned['annotation_ref'] = df_aligned['annotation_ref'].apply(lambda x:x[0])

xml_analysis = xml_analysis.merge(df_aligned, on = '_id', how = 'left')

# Rows that have a rule-based annotation. reset_index() keeps the row labels
# of xml_analysis in an 'index' column so the result can be joined back later.
# The distance column starts with the placeholder value 999.
temp_xml_analysis = xml_analysis[['_id','ref_title','annotation_ref']].dropna().drop_duplicates().reset_index()
temp_xml_analysis['levenshtein_distance'] = 999

def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept, and the shorter
    sequence is used for the row width.
    """
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    if len(shorter) == 0:
        return len(longer)

    previous = list(range(len(shorter) + 1))
    for row_number, long_item in enumerate(longer, start=1):
        current = [row_number]
        for col, short_item in enumerate(shorter):
            cost_insert = previous[col + 1] + 1
            cost_delete = current[col] + 1
            cost_substitute = previous[col] + (long_item != short_item)
            current.append(min(cost_insert, cost_delete, cost_substitute))
        previous = current

    return previous[-1]

# Edit distance between every extracted reference title and the rule-based
# annotation of its paper. The whole column is assigned at once: element-wise
# chained assignment (frame[col][i] = value) writes to a temporary object
# under pandas Copy-on-Write and would leave the 999 placeholders untouched.
temp_xml_analysis['levenshtein_distance'] = [
    levenshtein_distance(ref_title, annotation_ref)
    for ref_title, annotation_ref in tqdm(
        zip(temp_xml_analysis['ref_title'], temp_xml_analysis['annotation_ref']),
        total=len(temp_xml_analysis),
    )
]

# Smallest distance observed for each annotation text ...
# NOTE(review): the grouping key is the annotation text alone, not
# (paper, annotation) - confirm that identical annotations of different
# papers are meant to share one minimum.
temp_xml_analysis['target'] = temp_xml_analysis.groupby('annotation_ref')['levenshtein_distance'].transform('min')

# ... and target = 1 for the reference(s) that attain it, 0 otherwise.
temp_xml_analysis['target'] = (temp_xml_analysis['levenshtein_distance'] == temp_xml_analysis['target']).astype(int)

# Expose the row labels as an 'index' column so that they line up with the
# 'index' column stored in temp_xml_analysis.
xml_analysis = xml_analysis.reset_index()


# Rows without a rule-based annotation: everything except the rows handled in
# temp_xml_analysis (dropped by row label).
temp_xml_analysis2 = xml_analysis.copy()
temp_xml_analysis2.drop(list(temp_xml_analysis['index']), inplace = True)

#xml_analysis['target'] = xml_analysis.apply(lambda row: 1 if row['ref_id'] in str(row['refs_trace']) else 0, axis=1)
# For those rows the target is 1 when the reference id occurs in the string
# form of the paper's refs_trace value.
temp_xml_analysis2['target'] = temp_xml_analysis2.apply(lambda row: 1 if row['ref_id'] in str(row['refs_trace']) else 0, axis=1)

# Combine both target sources and join them back by row label.
temp_xml_analysis3 = pd.concat([temp_xml_analysis[['index','target']],temp_xml_analysis2[['index','target']]])
xml_analysis = xml_analysis.merge(temp_xml_analysis3[['index','target']], on = ['index'], how = 'left')

# Attach the citing paper's abstract and title. The row count is printed
# before and after so that any change caused by the join is visible.
print(len(xml_analysis))
xml_analysis = pd.merge(xml_analysis,dblp_df[['id','abstract','title']],left_on=['_id'],right_on=['id'],how = 'left')
print(len(xml_analysis))

def update_target(group):
    """Mark groups that contain no positive example.

    Args:
        group: DataFrame with a ``target`` column (the rows of one paper).

    Returns:
        The same DataFrame object. If no row has ``target == 1``, the whole
        ``target`` column is overwritten with -1 in place; otherwise the
        frame is returned untouched.
    """
    has_positive = group['target'].eq(1).any()
    if not has_positive:
        group['target'] = -1
    return group

# Papers without any positive reference get target = -1 on all their rows.
xml_analysis = xml_analysis.groupby('_id').apply(update_target).reset_index(drop=True)

# Lower-case every free-text column and strip everything that is neither a
# word character nor whitespace. Each column is processed once; repeating the
# operation (as was done for 'Pre Text' and 'Post Text') cannot change the
# result.
text_columns = [
    'Pre Text', 'Post Text', 'Section',
    'ref_abstract', 'abstract',
    'ref_title', 'title',
    'venue', 'ref_venue',
]
for column in text_columns:
    xml_analysis[column] = xml_analysis[column].str.lower().str.replace(r'[^\w\s]', '', regex=True)

# The helper 'index' column is only needed for the joins above.
xml_analysis.drop(columns = {'index'}).to_pickle(dir_path + 'dataset_v2.pkl')