In [6]:
# ! python -m spacy download en_core_web_sm

In [37]:
# ! pip install six --upgrade

In [49]:
! pip install scipy --upgrade

Collecting scipy
  Using cached scipy-1.4.1-cp36-cp36m-macosx_10_6_intel.whl (28.5 MB)
Installing collected packages: scipy
Successfully installed scipy-1.4.1


In [48]:
# ! conda uninstall scipy --y

In [2]:
import numpy as np
import os
import glob
import json
import pandas as pd
from pprint import pprint
import spacy
from scipy import stats
import re

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

In [186]:
# Root directory of the paper dataset (relative path, written with a trailing
# slash); used as the default argument of load_all_paper_data.
main_path = "zetaobjects_v1_big/"

In [187]:
def load_paper(paper_id, papers_df):
    """Select the rows of ``papers_df`` whose ``za_id`` equals ``paper_id``.

    Returns a DataFrame (possibly empty), not a single row.
    """
    id_matches = papers_df["za_id"] == paper_id
    return papers_df[id_matches]

In [188]:
def load_chunk_papers(chunk_path):
    """List every entry directly inside ``chunk_path`` as ``"<chunk_path>/<entry>"``.

    The order is whatever ``os.listdir`` yields.
    """
    sub_chunks = []
    for entry in os.listdir(chunk_path):
        sub_chunks.append(f"{chunk_path}/{entry}")
    return sub_chunks

In [189]:
def load_subchunk_info(subchunk_path):
    """Parse the JSON file ``<subchunk_path>/main``.

    Returns the decoded JSON content, or an empty dict when that file does
    not exist.
    """
    info_path = f"{subchunk_path}/main"
    try:
        handle = open(info_path, 'r')
    except FileNotFoundError:
        # No "main" file for this sub-chunk: report "no info" instead of failing.
        return {}
    with handle:
        return json.load(handle)

In [190]:
def load_subchunk_representations(subchunk_path):
    """Locate the sentence file and the embedding pointer of one sub-chunk.

    Looks inside ``<subchunk_path>/representations`` for the first file
    matching ``text_sentences*`` and the first matching ``document_embedding*``.

    Returns a dict with two entries:
      * ``"fulltext_sentences_dir"``: path of the sentence file, or the
        string ``"None"`` when no file matches.
      * ``"za_scibert_embedding_dir"``: the ``["value"]["pointer"]`` entry of
        the embedding JSON with its first six characters removed, or the
        string ``"None"`` when no file matches.
    """
    rep_dir = f"{subchunk_path}/representations"

    sentence_hits = glob.glob(f"{rep_dir}/text_sentences*")
    sentences_path = sentence_hits[0] if sentence_hits else "None"

    try:
        embedding_file = glob.glob(f"{rep_dir}/document_embedding*")[0]
        with open(embedding_file, 'r') as handle:
            payload = json.load(handle)
        # NOTE(review): the slice drops a fixed six-character prefix of the
        # pointer -- presumably a scheme such as "file:/"; confirm with the data.
        embedding_dir = payload["value"]["pointer"][6:]
    except IndexError:
        embedding_dir = "None"

    return {
        "fulltext_sentences_dir": sentences_path,
        "za_scibert_embedding_dir": embedding_dir,
    }

In [191]:
def load_sentences(sentences_path):
    """Return the ``"value"`` entry of the JSON file at ``sentences_path``."""
    with open(sentences_path, 'r') as handle:
        document = json.load(handle)
    # TODO: more preprocessing of the sentences is still to be added here.
    return document["value"]

In [192]:
def load_all_paper_data(main_path=main_path):
    """Collect the metadata of every paper found under ``main_path``.

    Every entry of ``main_path`` except the one named ``"embeddings"`` is
    treated as a chunk directory; each entry inside a chunk is a sub-chunk
    describing one paper.

    Parameters
    ----------
    main_path : str
        Root directory of the dataset. A trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        One row per sub-chunk with the columns ``title``, ``abstract``,
        ``date``, ``authors``, ``format``, ``subjects``, ``za_id``, ``uri``,
        ``full_sentences_path`` and ``full_scibert_embedding_path``. Fields
        missing from a paper's info file hold the string ``"None"``.
    """

    def try_retrieve(source, value1, value2=None):
        """Return ``source[value1]`` (or ``source[value1][value2]``), or "None" if a key is missing."""
        # Read from the `source` argument itself rather than from whatever the
        # enclosing loop variable happens to be bound to at call time.
        try:
            found = source[value1]
            if value2 is not None:
                found = found[value2]
        except KeyError:
            return "None"
        return found

    all_subchunks = []
    for chunk in os.listdir(main_path):
        # "embeddings" sits beside the chunk directories but holds no papers.
        if chunk == "embeddings":
            continue
        # os.path.join works whether or not main_path ends with a separator.
        all_subchunks.extend(load_chunk_papers(os.path.join(main_path, chunk)))

    all_papers_info = []
    for subchunk in all_subchunks:
        info = load_subchunk_info(subchunk)
        representations = load_subchunk_representations(subchunk)
        all_papers_info.append({
            "title": try_retrieve(info, "metadata", "DCMI.title"),
            "abstract": try_retrieve(info, "metadata", "DCMI.abstract"),
            "date": try_retrieve(info, "metadata", "DCMI.created"),
            "authors": try_retrieve(info, "metadata", "DCMI.creator"),
            "format": try_retrieve(info, "metadata", "DCMI.format"),
            "subjects": try_retrieve(info, "metadata", "DCMI.subject"),
            "za_id": try_retrieve(info, "guid"),
            "uri": try_retrieve(info, "uri"),
            "full_sentences_path": representations["fulltext_sentences_dir"],
            "full_scibert_embedding_path": representations["za_scibert_embedding_dir"],
        })

    return pd.DataFrame(all_papers_info)

In [193]:
# Build the metadata table by scanning the dataset under the default main_path.
all_papers = load_all_paper_data()

In [197]:
len(all_papers)

116270

In [198]:
# all_papers.to_pickle("all_papers_big_metadata.pkl")

In [3]:
# Rebinds all_papers to a previously pickled frame, replacing any table built
# earlier in the session. Unpickling can execute arbitrary code, so only load
# pickle files from a trusted source.
all_papers = pd.read_pickle("../Pickles/all_papers_with_splits_and_embeddings.pkl")

In [4]:
def get_author_fullname_length(row):
    """Return the full names of a paper's authors and the length of each name.

    Parameters
    ----------
    row : mapping
        A paper record whose ``"authors"`` entry is a sequence of dicts with
        a ``"full_name"`` key, or a string sentinel such as ``"None"`` when
        the paper has no author metadata.

    Returns
    -------
    tuple[list[int], list[str]]
        ``(lengths, names)`` in author order; two empty lists when the
        authors entry is a string sentinel or an empty sequence.
    """
    author_names_value = row["authors"]
    # Missing author metadata is stored as the string "None"; iterating over
    # it would index single characters with "full_name" and raise TypeError.
    if isinstance(author_names_value, str):
        return [], []
    author_names = [item["full_name"] for item in author_names_value]
    author_names_lengths = [len(author_name) for author_name in author_names]
    return author_names_lengths, author_names

In [5]:
# all_names_lengths = []
# all_names = []
# for index, row in all_papers.iterrows():
#     author = get_author_fullname_length(row)
#     author_lengths = author[0]
#     author_names = author[1]
#     all_names_lengths.extend(author_lengths)
#     all_names.extend(author_names)

In [6]:
def get_array_stats(arr):
    """Summarise ``arr`` with ``scipy.stats.describe`` and return that result as is."""
    summary = stats.describe(arr)
    return summary

In [8]:
# get_array_stats(all_names_lengths)

In [9]:
def normalize_name_whitespace(name):
    """Strip leading/trailing whitespace and collapse inner whitespace runs.

    Every run of one or more whitespace characters inside ``name`` becomes a
    single space, e.g. ``" Qi    Jang "`` -> ``"Qi Jang"``.
    """
    sides = name.strip()
    # Raw string: in a plain literal "\s" is an invalid escape sequence, which
    # Python reports with a warning.
    middle = re.sub(r'\s+', ' ', sides)
    return middle

In [10]:
normalize_name_whitespace(" Qi    Jang ")

'Qi Jang'

In [59]:
# NOTE(review): in the visible part of this notebook `all_names` is only
# assigned inside a cell that is commented out, so on a fresh kernel this
# line raises NameError unless that loop is re-enabled and run first.
normalized_author_names = [normalize_name_whitespace(name) for name in all_names]

In [67]:
# short_names_nlped = [nlp(name) for name in short_names]

In [None]:
# Placeholder list of empty strings; nothing in the visible part of the
# notebook reads it yet.
conference_names = ["", "", "", ]

In [14]:
def check_for_caps(name):
    """Tell whether ``name`` is written entirely in upper case (``str.isupper``)."""
    is_all_caps = name.isupper()
    return is_all_caps

def check_for_alpha(name):
    """Tell whether ``name`` is non-empty and made of letters only (``str.isalpha``).

    Spaces and punctuation are not letters, so multi-word names give False.
    """
    only_letters = name.isalpha()
    return only_letters

def check_for_title(name):
    """Tell whether ``name`` contains an academic title marker, ignoring case.

    The only marker currently checked is ``"phd"``; it is matched as a
    substring of the lower-cased name.
    """
    lowered = name.lower()
    for marker in ["phd"]:
        if marker in lowered:
            return True
    return False

def check_for_organisations(name):
    """Tell whether ``name`` contains a known organisation/venue acronym, ignoring case.

    Acronyms are matched as plain substrings of the lower-cased name, so a
    personal name that happens to contain one (e.g. "acl") also matches.
    """
    # TODO: think of a way to remove stuff like CS conferences and journals from here.
    lowered = name.lower()
    for acronym in ["ieee", "acm", "nips", "neurips", "acl", "emnlp"]:
        if acronym in lowered:
            return True
    return False

def check_valid_name(name):
    """Tell whether ``name`` looks like a person's name.

    A name is accepted when, after removing dots, it
      * contains at least one letter,
      * consists only of letters and whitespace,
    and the original string is not all upper case and does not contain an
    organisation acronym (see ``check_for_organisations``).

    Returns True or False.
    """
    without_dots = name.replace(".", "")
    # all() is vacuously True for an empty sequence, so without this guard
    # "", "   " and "..." would count as valid names.
    if not any(ch.isalpha() for ch in without_dots):
        return False
    if check_for_caps(name) or check_for_organisations(name):
        return False
    return all(ch.isalpha() or ch.isspace() for ch in without_dots)

In [168]:
# Author names that contain "Wang", are not all upper case, consist of letters
# only and carry no title marker. Because check_for_alpha rejects anything
# with a space, only single-token names can pass this filter.
short_names = [
    candidate
    for candidate in normalized_author_names
    if (
        "Wang" in candidate
        and not check_for_caps(candidate)
        and check_for_alpha(candidate)
        and not check_for_title(candidate)
    )
]

In [11]:
all_papers.sample().authors.iloc[0]

[{'first_name': 'Thomas', 'last_name': 'Pinetz', 'full_name': 'Thomas Pinetz'},
 {'first_name': 'Daniel', 'last_name': 'Soukup', 'full_name': 'Daniel Soukup'},
 {'first_name': 'Thomas', 'last_name': 'Pock', 'full_name': 'Thomas Pock'}]

In [28]:
def retrieve_first_author(row):
    """Return the first listed author of a paper when the name looks valid.

    Parameters
    ----------
    row : mapping
        A paper record whose ``"authors"`` entry is a sequence of author
        dicts (each with a ``"full_name"`` key) or the string ``"None"``.

    Returns
    -------
    dict
        The first author's dict when its ``"full_name"`` passes
        ``check_valid_name``; otherwise an empty dict. An empty dict is also
        returned when there are no authors at all or the first author has no
        usable ``"full_name"``.
    """
    full_authors = row["authors"]
    # "None" (a string) marks missing metadata; an empty list has no first
    # element, so indexing it would raise IndexError.
    if full_authors is None or isinstance(full_authors, str) or len(full_authors) == 0:
        return {}
    first_author = full_authors[0]
    # Tolerate author entries that lack a "full_name" instead of raising KeyError.
    full_name = first_author.get("full_name")
    if isinstance(full_name, str) and check_valid_name(full_name):
        return first_author
    return {}

In [29]:
# for index, row in all_papers.iterrows():
#     author = retrieve_first_author(row)
#     if check_valid_name(author["full_name"]):
#         print( author["full_name"])
#     else:
#         print("Poep")
#     break

In [30]:
# Add a "first_author" column: for each row, the dict returned by
# retrieve_first_author (row-wise apply).
all_papers["first_author"] = all_papers.apply(retrieve_first_author, axis=1)

In [161]:
all_papers.iloc[3].authors

[{'first_name': 'A. J. P. M. P.',
  'full_name': 'A. J. P. M. P. Jayaweera',
  'last_name': 'Jayaweera'},
 {'first_name': 'N. G. J.', 'full_name': 'N. G. J. Dias', 'last_name': 'Dias'}]

In [33]:
# all_papers.to_pickle("../Pickles/all_papers_with_splits_embeddings_and_firstauthor.pkl")

In [169]:
short_names

['JinqiaoWang', 'Wang', 'Wang', 'Wang', 'Wang', 'Wang']

In [175]:
# Count how many times each normalised author name occurs.
from collections import Counter
x = Counter(normalized_author_names)

In [184]:
# Reverse the most-common-first ranking and show its first ten entries,
# i.e. ten names from the least frequent end of the counter.
x.most_common()[::-1][:10]

[('Carlos Ariel Diaz', 1),
 ('Osman Cihan Kilinc', 1),
 ('Y. Vorobeychik', 1),
 ('D. M. Reeves', 1),
 ('K. M. Lochner', 1),
 ('Xingbin Jiang', 1),
 ('Larry Chen', 1),
 ('Mikyas T. Desta', 1),
 ('Bruce R. Ellingwood', 1),
 ('Saeed Nozhati', 1)]