In [None]:
import pandas as pd
import numpy as np

In [None]:
# Table of the retracted Hindawi papers. Later cells read its `eid`, `subjareas`,
# `editor`, `title` and `Au_unique_IN` columns.
RETRACTED_PAPERS_PATH = '/Users/lyk/gnn_clustering_citation_cartels/data/hindawi_retracted.parquet.gzip'
df = pd.read_parquet(RETRACTED_PAPERS_PATH)

In [None]:
# Reference rows of the retracted papers. Later cells use `eid` together with
# `ref_eid` — presumably citing paper and referenced paper; TODO confirm against the schema.
refs_raw = pd.read_parquet('/Users/lyk/gnn_clustering_citation_cartels/data/hindawi_retracted_refs.parquet.gzip')

# Keep only rows that have author ids, then turn every id list into plain ints.
# Built without `inplace=True` so the raw frame stays available for inspection.
refs_with_authors = refs_raw.dropna(subset=["auid"])
refs_with_authors = refs_with_authors.assign(
    auid=[
        [int(auid) for auid in auids if not np.isnan(auid)]
        for auids in refs_with_authors["auid"]
    ]
)
# NOTE(review): removing NaN ids shortens the `auid` list, so its positions may no
# longer line up with the entries of `Au_unique_IN` for the same row — confirm.

# Number of rows per `ref_eid`, attached to every row as `n_citations`.
cite_counts = refs_with_authors.groupby('ref_eid')['eid'].size().to_frame("n_citations")
# The merge also gives `df_refs` a fresh 0..n-1 index.
df_refs = refs_with_authors.merge(cite_counts, on='ref_eid')

df_refs.head()

### 1. Top N referenced EIDs by these retracted papers.

In [None]:
N_CITED = 100

# One row per referenced paper, keeping the N_CITED rows with the highest `n_citations`.
# DataFrame.nlargest selects by label, so this no longer relies on `df_refs` having a
# 0..n-1 index (the previous `.iloc[labels]` lookup silently did).
top_referenced = df_refs.drop_duplicates('ref_eid').nlargest(N_CITED, 'n_citations')

# set_index + reset_index moves `ref_eid` to the first column and renumbers the rows;
# `eid` is dropped because each row now stands for the referenced paper only.
most_cited_df = top_referenced.set_index('ref_eid').reset_index().drop(columns=['eid'])

# `title` holds a sequence per row; keep its first element.
most_cited_df['title'] = most_cited_df['title'].apply(lambda titles: titles[0])

most_cited_df[['ref_eid', 'title', 'Au_unique_IN', 'n_citations']].to_excel('most_referenced_eids_by_retracted.xlsx')
most_cited_df.head()

### 2. Top N referenced authors by retracted papers, and a list of their papers that were referenced.

In [None]:
# Pair each author id with the author name found at the same position.
def f(x):
    """Build ``(auid, name)`` tuples for one row.

    x: mapping-like row that provides the keys 'auid' and 'Au_unique_IN'.
    Returns None when either field is None, otherwise a list of tuples.
    """
    if any(x[field] is None for field in ('auid', 'Au_unique_IN')):
        return None
    # zip stops at the shorter of the two sequences.
    return list(zip(x['auid'], x['Au_unique_IN']))

# Apply `f` to every row and store the resulting (auid, name) pairs as `authors`.
df_refs = df_refs.assign(authors=df_refs.apply(f, axis=1))

In [None]:
N_AUTHORS = 100

# One row per (reference row, author) pair; exploded once and reused below.
refs_by_author = df_refs.explode('authors')

# Row count per author, named up front so the concat below does not have to rename
# a positional column `0`.
most_cited_authors = (
    refs_by_author.groupby('authors').size().nlargest(N_AUTHORS).rename('n_cites_received')
)
top_authors = most_cited_authors.keys()

# Distinct `ref_eid` values per top author. dict.fromkeys de-duplicates while keeping
# first-seen order, so the exported lists are reproducible; set() order is arbitrary.
# The trailing .loc restores the ranking order of `most_cited_authors`.
most_cited_authors_papers = (
    refs_by_author.set_index('authors')
    .loc[top_authors]
    .groupby('authors')['ref_eid']
    .apply(lambda eids: list(dict.fromkeys(eids)))
    .loc[top_authors]
)

most_cited_authors_df = pd.concat([most_cited_authors, most_cited_authors_papers], axis=1)

most_cited_authors_df.to_excel('most_referenced_authors_and_their_papers.xlsx')
most_cited_authors_df.head()

### 3. % of references to papers with non-overlapping subject areas (citation stacking signal).

In [None]:
# Open in the variable viewer and sort by `p_citations_overlap_subj`.
# If we assume the `subjareas` are (mostly) exhaustive, a substantial number of papers mostly reference unrelated papers.
def f(x):
    """Summarise one group of reference rows.

    x: DataFrame-like group that has a `subj_overlap` column.
    Returns a Series holding the number of rows in the group and the mean of
    `subj_overlap` (the share of rows flagged as overlapping).
    """
    return pd.Series({
        'n_citations_with_scopus_data': len(x),
        'p_citations_overlap_subj': np.mean(x['subj_overlap']),
    })

# Put the subject areas of the `df` paper next to every reference row with the same `eid`.
# After the merge, `subjareas_x` comes from df_refs and `subjareas_y` from df.
df_subj = df_refs.merge(df[["eid", "subjareas"]], on='eid')

# A row overlaps when at least one entry of `subjareas_x` also appears in `subjareas_y`.
df_subj["subj_overlap"] = [
    any(area in areas_y for area in areas_x)
    for areas_x, areas_y in zip(df_subj['subjareas_x'], df_subj['subjareas_y'])
]

# Per-`eid` summary produced by `f` (row count and overlap share).
overlap_stats = df_subj.groupby('eid').apply(f)

# Drop summary columns left over from an earlier run of this cell, so re-running it
# does not produce `_x` / `_y` suffixed duplicates.
# NOTE(review): this is an inner merge, so rows of `df` whose `eid` has no row in
# `df_subj` are removed from `df` for all later cells — confirm this is intended.
stat_cols = ['n_citations_with_scopus_data', 'p_citations_overlap_subj']
df = df.drop(columns=stat_cols, errors='ignore').merge(overlap_stats, on='eid')
df.head()

### 4. A list of all special issue introductions referenced by the retracted papers, some of which are suspicious.

In [None]:
# Flag rows whose first title mentions "special issue" (case-insensitive).
# na=False treats a missing first title as "no match" instead of raising when the
# mask is used for indexing; regex=False keeps the search a plain substring match.
is_special_issue = (
    df_refs['title']
    .apply(lambda titles: titles[0])
    .str.lower()
    .str.contains('special issue', na=False, regex=False)
)

# One row per referenced paper, most referenced first, with `title` reduced to
# its first element.
special_issues = (
    df_refs[is_special_issue]
    .drop_duplicates('ref_eid')
    .sort_values(by='n_citations', ascending=False)
    .assign(title=lambda frame: frame['title'].apply(lambda titles: titles[0]))
)

special_issues[['ref_eid', 'title', 'Au_unique_IN', 'n_citations']].to_excel('special_issue_intros_referenced_by_retracted.xlsx')
special_issues.head()

### 5. Editors within the top 50 most frequently occurring editors of these retracted papers, who are also on the editorial board of some Elsevier journal.

In [None]:
# Step 1: list the editors with the most rows in `df`; their names were then
# searched manually for Elsevier affiliation.
TOP_N_EDITORS = 50
df.groupby('editor').size().nlargest(TOP_N_EDITORS)

In [None]:
# Rows of `df` whose editor is one of the hand-picked names below.
els_editors_in_top_50 = [
    'Zhihan Lv',
    'Yuvaraja Teekaraman',
    'Rashid A Saeed',
    'Vijay Kumar',
    'Xin Ning',
    'Zhiguo Qu',
    'Danilo Pelusi',
    'Gang Chen',
]

# Exact-name match against the list, then ordered by editor name.
edited_by_listed = df['editor'].isin(els_editors_in_top_50)
els_editors = df[edited_by_listed].sort_values('editor')

els_editors[['eid', 'title', 'Au_unique_IN', 'editor']].to_excel('retracted_eids_by_els_editors.xlsx')
els_editors.head()

In [None]:
# The 20 publisher names with the most rows in `df_refs`.
TOP_N_PUBLISHERS = 20
df_refs.groupby('publishername').size().nlargest(TOP_N_PUBLISHERS)

### 6. Top N referenced EIDs published in an Elsevier journal.

In [None]:
# Rows whose publisher name contains "Elsevier" (case-sensitive substring match).
# na=False treats missing publisher names — None or NaN — as "no match".
is_elsevier = df_refs['publishername'].str.contains('Elsevier', na=False, regex=False)

# One row per referenced paper, most referenced first, with `title` reduced to
# its first element.
cited_elsevier = (
    df_refs[is_elsevier]
    .drop_duplicates('ref_eid')
    .sort_values('n_citations', ascending=False)
    .assign(title=lambda frame: frame['title'].apply(lambda titles: titles[0]))
)

# N_CITED is the cut-off defined in the section 1 cell.
cited_elsevier.head(N_CITED)[['ref_eid', 'title', 'auid', 'Au_unique_IN', 'sourcetitle', 'publishername', 'n_citations']].to_excel('most_cited_elsevier_published.xlsx')
cited_elsevier.head()