In [1]:
import pandas as pd
import json
import os
import boto3
import csv
import glob
import sys

In [2]:
# !pip install faiss-cpu
# !pip install h5py==2.8.0 (https://stackoverflow.com/questions/39927206/yum-install-libhdf5-dev-on-amazon-linux)
# !pip install ujson
# !pip install nameparser
# !pip install swifter
# !pip install duckdb
# !pip install sentence-transformers
# !pip install gputil
# !pip install networkx

In [3]:
import tempfile
import unittest

from bert_auth_disamb.libs.disambert import disambert
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import GPUtil
from scipy import sparse

In [4]:
# Locations of the embedding models consumed by the vectorizer cell below.
# Passed as `filename=` to disambert.CitationVectorizer.
citation_embedding_model_file = "./citation_embedding_model/"
# Passed as the first positional argument to disambert.TextVectorizer.
text_embedding_model_file = "./text_embedding_model/"
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm it is still needed.
test_file_location = "test_data.json"

In [None]:
# Vectorizer over the "references" attribute, loaded from the citation model path.
cit_vectorizer = disambert.CitationVectorizer(
    filename=citation_embedding_model_file, data_attr="references"
)
# Text vectorizer configured with five attribute groups.
# NOTE(review): presumably each inner list names record fields that are embedded
# together as one text input — confirm against disambert.TextVectorizer.
# NOTE(review): device="all" presumably means "use every available GPU" — confirm.
text_vectorizer = disambert.TextVectorizer(
    text_embedding_model_file,
    device="all",
    data_attrs=[
        ["affiliation", "title"],
        ["author", "coauthors", "affiliation", "title"],
        ["author", "coauthors"],
        ["author", "affiliation", "title"],
        ["title"],
    ],
    batch_size=256
)
# Combines both vectorizers; this module-level object is what
# get_data_vectorized passes as `vectorizer=` to BlockedVectorDataset.
meta_vectorizer = disambert.MetaVectorizer(
    vectorizers=[text_vectorizer, cit_vectorizer]
)

In [6]:
# Collect the key of every ".json" object under the blocked/partitioned input prefix.
s3_client = boto3.client("s3")
bucket_name = "author-disambiguation"
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix="V1/all_data_blocked_and_partitioned/",
                              PaginationConfig={"PageSize": 50})

filenames = []

for page in response:
    # A page carries no "Contents" entry when nothing matches the prefix, so
    # default to an empty list instead of iterating over None (TypeError).
    files = page.get("Contents", [])
    for file in files:
        if file['Key'].endswith(".json"):
            filenames.append(file['Key'])

In [7]:
# In-place lexicographic sort of the S3 keys; the `filenames[2:15]` slice used by
# the vectorization loop depends on this ordering.
filenames.sort()

In [8]:
def replace_empty_lists_with_None(col_list):
    """Return ``col_list`` unchanged when it is a non-empty list, else ``None``.

    Anything that is not a ``list`` (``None``, NaN, strings, tuples, ...) and
    every empty list is mapped to ``None``.
    """
    is_populated_list = isinstance(col_list, list) and len(col_list) > 0
    return col_list if is_populated_list else None

In [9]:
def replace_empty_affiliations(col_list):
    """Collapse a list of affiliation strings into one space-joined string.

    Returns ``None`` when ``col_list`` is not a list, is an empty list, or has
    an empty string as its first element; otherwise returns the elements
    joined with single spaces.
    """
    # Guard clauses: reject anything that is not a non-empty list.
    if not isinstance(col_list, list) or len(col_list) == 0:
        return None
    # Only the first entry is inspected to decide whether the value is "empty".
    if col_list[0] == "":
        return None
    return " ".join(col_list)

In [13]:
def get_data_vectorized(curr_file_name):
    """Clean, block and vectorize one input partition, then upload the results to S3.

    Steps, in order:
      1. Read the JSON-lines object ``s3://author-disambiguation/{curr_file_name}``.
      2. Normalise the ``coauthors``, ``references`` and ``affiliation`` columns,
         copy ``mag_author_id`` into ``author_id`` and drop rows whose
         ``author`` is the empty string.
      3. Write the cleaned rows under ``./temp_file_loc/`` and run name-based
         blocking on them.
      4. Vectorize the blocked data with the module-level ``meta_vectorizer``.
      5. Copy the vector file and the cleaned JSON file to
         ``s3://author-disambiguation/V1/vectorized_data/partition_{idx}/``.
      6. Delete the two local files.

    Parameters
    ----------
    curr_file_name : str
        S3 key whose third ``/``-separated component has the form
        ``<name>=<index>``; ``<index>`` is used to name the output files.

    Raises
    ------
    subprocess.CalledProcessError
        If an ``aws s3 cp`` upload exits with a non-zero status. The local
        files are left in place in that case so the upload can be retried.
    """
    # Function-scope import keeps this cell runnable on its own.
    import subprocess

    print("-------reading data")
    df = pd.read_json(f"s3://author-disambiguation/{curr_file_name}",
                      orient='records', lines=True)

    df['coauthors'] = df['coauthors'].apply(replace_empty_lists_with_None)
    df['references'] = df['references'].apply(replace_empty_lists_with_None)
    df['affiliation'] = df['affiliation'].apply(replace_empty_affiliations)
    df['author_id'] = df['mag_author_id']

    # Keep only rows with a non-empty author, restricted to the columns used downstream.
    df = df[df['author']!=''][['UID', 'title', 'pub_year', 'journal', 'author',
           'paper_author_id', 'affiliation', 'seq_no', 'coauthors', 'references',
           'author_id']].copy()

    # e.g. ".../random_partition_number=10/..." -> "10"
    file_idx = curr_file_name.split("/")[2].split("=")[1]

    print("-------saving processed data")
    # save into correct json
    # Previously the working directory was assumed to exist already.
    os.makedirs("./temp_file_loc", exist_ok=True)
    raw_data_file = f"./temp_file_loc/data_file_partition_{file_idx}.json"
    df.to_json(raw_data_file, orient='records', lines=True)

    print("-------starting blocking")
    # block
    blocked_data_file = "./temp_file_loc/temp_block_file_location_1"
    # Remove leftovers from a previous call: the block file and its ".wal" companion.
    for stale_file in (blocked_data_file, blocked_data_file + ".wal"):
        if os.path.exists(stale_file):
            os.remove(stale_file)
    blocking_model = disambert.BlockingByName(name_attr="author")
    blocked_dataset = disambert.BlockedDataset(chunksize=1000)
    blocked_dataset.blocking(
        blocking_model,
        raw_data_file,
        blocked_data_file
    )

    print("-------starting vectorizing")
    # vectorize
    vector_data_file = f"./temp_file_loc/vector_file_partition_{file_idx}"
    blocked_dataset.reset_iterator()
    vector_dataset = disambert.BlockedVectorDataset(
        filename_or_dataset=blocked_dataset,
        block_id_key="_block_id",
        vectorizer=meta_vectorizer,
        data_id_key="paper_author_id",
        attribute_keys=["author_id"],
        chunksize=1000
    )
    vector_dataset.save(vector_data_file, batch_size=4096)

    print("-------transferring to S3")
    # transfer to S3
    # check=True makes a failed upload raise *before* the local copies are
    # deleted below; os.system silently ignored the exit status.
    s3_destination = f"s3://author-disambiguation/V1/vectorized_data/partition_{file_idx}/"
    for local_file in (vector_data_file, raw_data_file):
        subprocess.run(
            ["aws", "s3", "cp", local_file, s3_destination, "--no-progress"],
            check=True,
        )

    print("-------delete files")
    # delete files
    os.remove(vector_data_file)
    os.remove(raw_data_file)
    print("")

In [None]:
%%time
# Vectorize a slice of the sorted input keys (list positions 2 through 14).
for i in filenames[2:15]:
    print(i)
    get_data_vectorized(i)

V1/all_data_blocked_and_partitioned/random_partition_number=10/part-00000-tid-767740779014940659-7ce3996d-ba3a-475b-ac9c-f6d3b3390d40-8518-11.c000.json
-------reading data
-------saving processed data
-------starting blocking


0it [00:00, ?it/s]

-------starting vectorizing


  0%|          | 0/287 [00:00<?, ?it/s]

-------transferring to S3
upload: temp_file_loc/vector_file_partition_10 to s3://author-disambiguation/V1/vectorized_data/partition_10/vector_file_partition_10
upload: temp_file_loc/data_file_partition_10.json to s3://author-disambiguation/V1/vectorized_data/partition_10/data_file_partition_10.json
-------delete files

V1/all_data_blocked_and_partitioned/random_partition_number=100/part-00000-tid-767740779014940659-7ce3996d-ba3a-475b-ac9c-f6d3b3390d40-8518-101.c000.json
-------reading data
-------saving processed data
-------starting blocking


0it [00:00, ?it/s]

-------starting vectorizing


  0%|          | 0/295 [00:00<?, ?it/s]

-------transferring to S3
Completed 9.7 GiB/15.8 GiB (98.2 MiB/s) with 1 file(s) remaining   

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



-------starting vectorizing


  0%|          | 0/351 [00:00<?, ?it/s]

### Clustering

In [5]:
# Collect the key of every object under the vectorized-data prefix whose key
# contains "vector_file".
s3_client = boto3.client("s3")
bucket_name = "author-disambiguation"
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name, Prefix="V1/vectorized_data/",
                              PaginationConfig={"PageSize": 50})

filenames = []

for page in response:
    # A page carries no "Contents" entry when nothing matches the prefix, so
    # default to an empty list instead of iterating over None (TypeError).
    files = page.get("Contents", [])
    for file in files:
        if "vector_file" in file['Key']:
            filenames.append(file['Key'])

In [6]:
# In-place lexicographic sort, so e.g. "partition_10" is ordered before "partition_100"
# and both come before "partition_2".
filenames.sort()

In [7]:
filenames[:10]

['V1/vectorized_data/partition_0/vector_file_partition_0',
 'V1/vectorized_data/partition_1/vector_file_partition_1',
 'V1/vectorized_data/partition_10/vector_file_partition_10',
 'V1/vectorized_data/partition_100/vector_file_partition_100',
 'V1/vectorized_data/partition_101/vector_file_partition_101',
 'V1/vectorized_data/partition_102/vector_file_partition_102',
 'V1/vectorized_data/partition_103/vector_file_partition_103',
 'V1/vectorized_data/partition_104/vector_file_partition_104',
 'V1/vectorized_data/partition_105/vector_file_partition_105',
 'V1/vectorized_data/partition_106/vector_file_partition_106']

In [6]:
#
# Clustering
#
# `device` is forwarded unchanged to PottsClustering.
# NOTE(review): presumably None lets the library choose its default device — confirm.
device = None
clustering_model_file = "./clustering_model/"

# Module-level clustering model: get_clustered_data reads `clustering.vector_keys`
# and calls `clustering.transform` on each block.
clustering = disambert.PottsClustering(
    num_neighbors=100,
    device=device,
    filename=clustering_model_file,
)

In [7]:
def get_clustered_data(file_name):
    """Download one vector file from S3, cluster every block, and upload the result.

    The output is a CSV file with the header ``block_id,data_id,cluster_id``
    written to ``./temp_file_loc/cluster_file_partition_{idx}`` and then copied
    to ``s3://author-disambiguation/V1/vectorized_data/partition_{idx}/``.
    Both local files are deleted after a successful upload.

    Blocks containing exactly one record are written with cluster id ``0``
    without calling the clustering model; all other blocks are passed to the
    module-level ``clustering`` object.

    Parameters
    ----------
    file_name : str
        S3 key of the vector file. The text after the final ``_`` of its last
        path component is used as the partition index.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``aws s3 cp`` download or upload exits with a non-zero status.
        Previously such failures were ignored, so a failed upload was followed
        by deleting the results and was never reported to the caller.
    """
    # Function-scope import keeps this cell runnable on its own.
    import subprocess

    print("-------pulling data from S3")
    os.makedirs("./temp_file_loc", exist_ok=True)
    subprocess.run(
        [
            "aws", "s3", "cp",
            f"s3://author-disambiguation/{file_name}",
            "./temp_file_loc/",
            "--no-progress",
        ],
        check=True,
    )

    base_name = file_name.split('/')[-1]
    file_idx = base_name.split('_')[-1]
    vector_data_file = f"./temp_file_loc/{base_name}"
    output_file = f"./temp_file_loc/cluster_file_partition_{file_idx}"

    #
    # Load the vector dataset
    #
    vector_dataset = disambert.BlockedVectorDataset(
        filename_or_dataset=vector_data_file,
        block_id_key="_block_id",
        chunksize=2200000,
    )

    data_keys = clustering.vector_keys

    print("-------clustering and writing to file")
    # newline="" is the csv-module recommended mode; it stops the platform
    # from rewriting the writer's own line terminators.
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["block_id", "data_id", "cluster_id"])
        for data_ids, data in vector_dataset.get_one_block():
            if len(data_ids) == 1:
                # A singleton block is trivially its own cluster.
                writer.writerow([data["_block_id"][0], data_ids[0], 0])
                continue

            for k in data_keys:
                data[k] = torch.tensor(data[k], dtype=torch.float32)
            cids = clustering.transform(data)
            # Convert to plain Python values before handing rows to csv.
            block_ids = data["_block_id"].tolist()
            writer.writerows(
                [block_id, data_id, int(cid)]
                for block_id, data_id, cid in zip(
                    block_ids, data_ids.tolist(), cids.tolist()
                )
            )

    print("-------transferring to S3")
    # transfer to S3
    # check=True: do not delete the local result if the upload failed.
    subprocess.run(
        [
            "aws", "s3", "cp",
            output_file,
            f"s3://author-disambiguation/V1/vectorized_data/partition_{file_idx}/",
            "--no-progress",
        ],
        check=True,
    )

    print("-------delete files")
    # delete files
    os.remove(vector_data_file)
    os.remove(output_file)
    print("")


In [8]:
import pickle

In [9]:
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# files written by this project.
# NOTE(review): presumably produced by an earlier run of the clustering loop
# (which dumps `files_for_later` the same way) — confirm.
with open("unsuccessful_filenames_1.pkl", "rb") as f:
    files_for_later_1 = pickle.load(f)

In [10]:
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# files written by this project.
# NOTE(review): presumably produced by an earlier run of the clustering loop
# (which dumps `files_for_later` the same way) — confirm.
with open("unsuccessful_filenames_2.pkl", "rb") as f:
    files_for_later_2 = pickle.load(f)

In [11]:
# Rebinds `filenames` (until now the sorted S3 listing of vector files) to the
# two lists loaded from pickle, so the clustering loop only visits these entries.
filenames = files_for_later_1 + files_for_later_2

In [12]:
%%time
# Cluster every file in `filenames`; files whose processing raises are cleaned
# up locally and collected in `files_for_later`.
files_for_later = []
for i in filenames:
    print(i)
    try:
        get_clustered_data(i)
    except Exception as exc:
        # `except Exception` instead of a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate, so interrupting the kernel stops the loop.
        print("---------------------- ERROR ----------------------")
        # Report the cause; otherwise every failure looks the same in the log.
        print(f"{type(exc).__name__}: {exc}")
        # Remove whatever partial files the failed call left behind.
        vec_file = f"./temp_file_loc/{i.split('/')[-1]}"
        out_file = f"./temp_file_loc/cluster_file_partition_{i.split('/')[-1].split('_')[-1]}"
        if os.path.exists(vec_file):
            os.remove(vec_file)
        if os.path.exists(out_file):
            os.remove(out_file)
        files_for_later.append(i)

print(f"Completed files list, {len(files_for_later)} unsuccessful")
        

V1/vectorized_data/partition_336/vector_file_partition_336
-------pulling data from S3
download: s3://author-disambiguation/V1/vectorized_data/partition_336/vector_file_partition_336 to temp_file_loc/vector_file_partition_336
-------clustering and writing to file
-------transferring to S3
upload: temp_file_loc/cluster_file_partition_336 to s3://author-disambiguation/V1/vectorized_data/partition_336/cluster_file_partition_336
-------delete files

V1/vectorized_data/partition_362/vector_file_partition_362
-------pulling data from S3
download: s3://author-disambiguation/V1/vectorized_data/partition_362/vector_file_partition_362 to temp_file_loc/vector_file_partition_362
-------clustering and writing to file
-------transferring to S3
upload: temp_file_loc/cluster_file_partition_362 to s3://author-disambiguation/V1/vectorized_data/partition_362/cluster_file_partition_362
-------delete files

V1/vectorized_data/partition_367/vector_file_partition_367
-------pulling data from S3
download: s3:

-------clustering and writing to file
-------transferring to S3
upload: temp_file_loc/cluster_file_partition_185 to s3://author-disambiguation/V1/vectorized_data/partition_185/cluster_file_partition_185
-------delete files

V1/vectorized_data/partition_188/vector_file_partition_188
-------pulling data from S3
download: s3://author-disambiguation/V1/vectorized_data/partition_188/vector_file_partition_188 to temp_file_loc/vector_file_partition_188
-------clustering and writing to file
-------transferring to S3
upload: temp_file_loc/cluster_file_partition_188 to s3://author-disambiguation/V1/vectorized_data/partition_188/cluster_file_partition_188
-------delete files

V1/vectorized_data/partition_225/vector_file_partition_225
-------pulling data from S3
download: s3://author-disambiguation/V1/vectorized_data/partition_225/vector_file_partition_225 to temp_file_loc/vector_file_partition_225
-------clustering and writing to file
-------transferring to S3
upload: temp_file_loc/cluster_file_p

In [13]:
# Persist the list of files that raised in the clustering loop.
with open("unsuccessful_filenames_3.pkl", "wb") as f:
    pickle.dump(files_for_later, f)

In [25]:
# Print each entry loaded from "unsuccessful_filenames_1.pkl", one per line.
for i in files_for_later_1:
    print(i)

V1/vectorized_data/partition_336/vector_file_partition_336
V1/vectorized_data/partition_362/vector_file_partition_362
V1/vectorized_data/partition_367/vector_file_partition_367
V1/vectorized_data/partition_393/vector_file_partition_393
V1/vectorized_data/partition_416/vector_file_partition_416
V1/vectorized_data/partition_434/vector_file_partition_434
V1/vectorized_data/partition_452/vector_file_partition_452
V1/vectorized_data/partition_463/vector_file_partition_463
V1/vectorized_data/partition_467/vector_file_partition_467
V1/vectorized_data/partition_48/vector_file_partition_48
V1/vectorized_data/partition_489/vector_file_partition_489
V1/vectorized_data/partition_496/vector_file_partition_496
V1/vectorized_data/partition_54/vector_file_partition_54
V1/vectorized_data/partition_56/vector_file_partition_56
V1/vectorized_data/partition_61/vector_file_partition_61
V1/vectorized_data/partition_74/vector_file_partition_74
V1/vectorized_data/partition_83/vector_file_partition_83


In [23]:
# Print each entry loaded from "unsuccessful_filenames_2.pkl", one per line.
for i in files_for_later_2:
    print(i)

V1/vectorized_data/partition_183/vector_file_partition_183
V1/vectorized_data/partition_185/vector_file_partition_185
V1/vectorized_data/partition_188/vector_file_partition_188
V1/vectorized_data/partition_225/vector_file_partition_225
V1/vectorized_data/partition_228/vector_file_partition_228
V1/vectorized_data/partition_244/vector_file_partition_244
V1/vectorized_data/partition_25/vector_file_partition_25
V1/vectorized_data/partition_283/vector_file_partition_283
V1/vectorized_data/partition_284/vector_file_partition_284
V1/vectorized_data/partition_286/vector_file_partition_286
