In [None]:
import pandas as pd
import numpy as np

In [None]:
# Main table for this notebook. Later cells read its `eid`, `subjareas`,
# `cluster_simulation`, `editor`, `citations` and `affiliation_organization`
# columns.
df = pd.read_parquet(path='data/hindawi_retracted.parquet.gzip')

In [None]:
# Load the reference table and normalise it without mutating in place:
#   * rows with no `auid` value are dropped,
#   * every remaining `auid` list is converted to plain ints, skipping NaN entries.
df_refs = (
    pd.read_parquet('data/hindawi_retracted_refs.parquet.gzip')
    .dropna(subset=["auid"])
    .assign(
        auid=lambda refs: [
            [int(auid) for auid in auids if not np.isnan(auid)]
            for auids in refs["auid"]
        ]
    )
)

# n_citations = number of rows in `df_refs` that share the same `ref_eid`
# (counted after the `auid` filter above). `.size()` counts rows per group
# directly instead of building a Python list for every group.
cite_counts = df_refs.groupby('ref_eid')['eid'].size().to_frame("n_citations")

# The merge attaches the count to every row and also leaves `df_refs` with a
# fresh 0..n-1 index.
df_refs = df_refs.merge(cite_counts, on='ref_eid')

df_refs.head()

### 1. Looking at the most cited articles.

In [None]:
# How many of the most-referenced papers to keep.
N_CITED = 100

# One row per referenced paper (the first row seen for each `ref_eid`), limited
# to the N_CITED papers with the highest `n_citations`, in descending order.
# Rows are selected directly with `DataFrame.nlargest` rather than by feeding
# index labels into positional `.iloc`, so the result no longer depends on
# `df_refs` having a 0..n-1 index.
most_cited_df = (
    df_refs
    .drop_duplicates('ref_eid')
    .nlargest(N_CITED, 'n_citations')
    .set_index('ref_eid')   # set/reset moves `ref_eid` to the first column
    .reset_index()          # and renumbers the rows 0..N-1
    .drop(columns=['eid'])  # `eid` only describes the single row kept per `ref_eid`
)
most_cited_df.head()

#### 1a. Articles with no citations that have been cited many times.

In [None]:
# Note: Some of these actually do have citations, just not visible in the AnI table.
# Some are also a different format, e.g. books, for which the refs would be listed
# by chapter, hence the field would be empty here.
#
# Keep rows whose `citations` field is missing and whose `ref_eid` occurs more than
# once, reduce to one row per `ref_eid`, and order by how often it is referenced.
df_no_citations = (
    df_refs
    .loc[lambda refs: refs['citations'].isna() & (refs['n_citations'] > 1)]
    .drop_duplicates('ref_eid')
    .drop(columns=['eid'])
    .sort_values('n_citations', ascending=False)
)
df_no_citations.head()

In [None]:
# Cited 91 times with no citation data or affiliation on record, suspicious authors (Huang, Sharma).
# https://www.informatica.si/index.php/informatica/article/view/3600
# Actually a special issue introduction; some of its papers are by known suspicious
# authors, e.g. one by Sharma.
# There are a few more special issue introductions cited multiple times,
# e.g. 85108533647, 85106903979.
print(df_no_citations.loc[df_no_citations['ref_eid'] == 85112251416].to_numpy())

In [None]:
# Case report with no citations on record, cited 27 times, by several retracted papers.
print(df_no_citations.loc[df_no_citations['ref_eid'] == 29312827].to_numpy())

### 2. Investigate citations in non-overlapping subject areas.

In [None]:
# Open in variable viewer and sort by `p_citations_same_subj`.
# If we assume the assigned subject-area labels are (mostly) exhaustive and reasonable, a substantial number of papers mostly cite papers from unrelated subject areas.
def f(x):
    """Summarise the subject-overlap flags of one group of reference rows.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group; must contain a ``subj_overlap`` column.

    Returns
    -------
    pandas.Series
        Two entries: ``n_citations_with_data`` (number of rows in ``x``) and
        ``p_citations_same_subj`` (mean of ``subj_overlap``, i.e. the share of
        rows whose flag is truthy).
    """
    return pd.Series({
        'n_citations_with_data': len(x),
        'p_citations_same_subj': x['subj_overlap'].mean(),
    })

# Attach to every reference row the subject areas of the matching `df` row.
# After the merge, `subjareas_x` comes from `df_refs` and `subjareas_y` from `df`.
# NOTE(review): presumably `subjareas_x` describes the referenced paper and
# `subjareas_y` the citing paper - confirm against the data dictionary.
df_subj = df_refs.merge(df[["eid", "subjareas"]], on='eid')

# True when at least one subject area of the reference row also appears among
# the subject areas of the `df` row. A plain comprehension avoids the slow
# row-wise `DataFrame.apply(axis=1)`.
df_subj["subj_overlap"] = [
    any(area in paper_areas for area in ref_areas)
    for ref_areas, paper_areas in zip(df_subj['subjareas_x'], df_subj['subjareas_y'])
]

# Per-`eid` summary produced by `f`. Only `subj_overlap` is handed to `f`, which
# keeps the grouping column out of the applied frames.
subj_stats = df_subj.groupby('eid')[['subj_overlap']].apply(f)

# Drop any stat columns left over from a previous run of this cell before
# merging, so re-running the cell does not create `_x`/`_y` duplicates.
# The inner merge keeps only `df` rows that have at least one row in `df_subj`.
df = df.drop(columns=subj_stats.columns, errors='ignore').merge(subj_stats, on='eid')
df.head()

### 3. Investigating the simulation cluster. 

In [None]:
# Subset of `df` whose `cluster_simulation` flag equals 1.
df_sim = df.loc[df['cluster_simulation'] == 1]
df_sim.head()

#### 3b. Check the top editors.

In [None]:
# Y. Teekaraman is on the editorial board of some Elsevier journals:
# https://www.journals.elsevier.com/ecotoxicology-and-environmental-safety/editorial-board/yuvaraja-teekaraman
# https://www.journals.elsevier.com/energy-reports/editorial-board/yuvaraja-teekaraman

# Ten editors with the most rows in `df_sim`. `.size()` counts rows per group
# natively instead of calling Python `len` on every group frame.
df_sim.groupby('editor').size().nlargest(10)

#### 3c. Check the papers most cited by the simulation-cluster papers.

In [None]:
# Twenty values that occur most often across the `citations` lists of `df_sim`.
# Only the `citations` column is exploded (one row per list element), so the
# other columns are not duplicated for every element; `.size()` then counts the
# rows per value.
df_sim[['citations']].explode('citations').groupby('citations').size().nlargest(20)

In [None]:
# Search for shared affiliations among authors citing a particular EID.
# Don't see any outliers among the top few most cited.
#
# The affiliation column is rewritten with `.assign` inside the chain: assigning
# a column on the frame returned by `.query(...)` raises SettingWithCopyWarning.
cited_eid = (
    df_sim
    .explode('citations')
    .query('citations == 85112724790')
    .assign(
        # Keep only the last element of every affiliation entry.
        affiliation_organization=lambda papers: [
            [affil[-1] for affil in affils]
            for affils in papers["affiliation_organization"]
        ]
    )
)

# One row per (paper, affiliation), counted per affiliation, most frequent first.
cited_eid.explode('affiliation_organization').groupby('affiliation_organization').size().sort_values(ascending=False)