# Aminer (AP_train) — Quickstart EDA (Beginner)

This mini-notebook is tuned for the files **`AP_train.txt`** and **`readme.txt`** you were given.
- Simple parsing (line by line), simple dicts and loops.
- Plots with matplotlib (no seaborn), one chart per figure.
- Start with a subset for speed, then run full.


In [None]:
# --- CONFIG ---
# Relative paths are resolved against the notebook's working directory.
DATA_PATH   = 'AP_train.txt'  # set to your file; this is the dataset handed to parse_aminer()
README_PATH = 'readme.txt'    # optional; shows format info
MAX_RECORDS = None            # e.g., 200000 for testing; None for full

import time
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
# Default (width, height) applied to every figure created in this notebook.
plt.rcParams['figure.figsize'] = (7,4)


## Peek at README (optional)
This prints up to the first 40 lines of the README so you can confirm the expected file format.

In [None]:
# Preview the README: echo at most 40 lines, stripping trailing whitespace.
try:
    with open(README_PATH, 'r', encoding='utf-8', errors='ignore') as readme:
        for line_no, text in enumerate(readme):
            if line_no >= 40:
                break
            print(text.rstrip())
except FileNotFoundError:
    # The README is optional, so a missing file is only reported, not raised.
    print('readme.txt not found; put it next to this notebook if you want to preview it.')


## Parser (Aminer format)
We handle Aminer markers like `#index` (paper id), `#*` (title), `#@` (authors), `#t` (year), `#c` (venue), `#%` (reference id).

In [None]:
def parse_aminer(path, max_records=None):
    """Parse an Aminer-format text file into per-paper tables and totals.

    The file is read line by line. A line starting with ``#index`` opens a
    new record; the marker lines that follow fill it in until the next
    ``#index`` line (or the end of the file) closes it:

    * ``#*`` title
    * ``#@`` authors, separated by ``;``
    * ``#t`` year (stored as ``int``, or ``None`` when it is not an integer)
    * ``#c`` venue
    * ``#%`` one reference id per line

    Blank lines and lines with any other prefix are skipped.

    Args:
        path: Path of the text file. It is opened as UTF-8 and undecodable
            bytes are ignored.
        max_records: Stop once this many records have been stored; ``None``
            reads the whole file. Only complete records are stored, so the
            result never holds more than ``max_records`` publications.

    Returns:
        A dict with these keys:

        * ``author_pub_count``: author name -> number of publications
        * ``venue_pub_count``: venue name -> number of publications
        * ``pub_ref_count``: paper id -> number of ``#%`` lines in its record
        * ``citations_count``: referenced id -> number of times it is referenced
        * ``title_by_pub``, ``year_by_pub``, ``venue_by_pub``: paper id -> field
        * ``total_publications``: number of stored records
        * ``total_references``: number of non-empty ``#%`` lines read
    """
    author_pub_count = defaultdict(int)
    venue_pub_count  = defaultdict(int)
    pub_ref_count    = {}
    citations_count  = defaultdict(int)
    title_by_pub     = {}
    year_by_pub      = {}
    venue_by_pub     = {}

    total_pubs = 0
    total_refs = 0

    # State of the record currently being assembled; cur_id is None while no
    # record is open.
    cur_id = None
    cur_title = ''
    cur_authors = []
    cur_year = None
    cur_venue = ''
    cur_refs = 0

    def finalize_current():
        """Store the record being assembled, if there is one."""
        nonlocal total_pubs
        if cur_id is None:
            return
        title_by_pub[cur_id] = cur_title
        year_by_pub[cur_id] = cur_year
        venue_by_pub[cur_id] = cur_venue
        pub_ref_count[cur_id] = cur_refs
        total_pubs += 1
        for a in cur_authors:
            if a:
                author_pub_count[a] += 1
        if cur_venue:
            venue_pub_count[cur_venue] += 1

    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            # '#index' must be tested first: it opens a new record and closes
            # the previous one.
            if line.startswith('#index'):
                finalize_current()
                if max_records is not None and total_pubs >= max_records:
                    # Limit reached. Clear cur_id so the trailing
                    # finalize_current() neither stores the previous record a
                    # second time nor stores this record that was never read.
                    cur_id = None
                    break
                cur_id = line[6:].strip()
                cur_title = ''
                cur_authors = []
                cur_year = None
                cur_venue = ''
                cur_refs = 0
            elif line.startswith('#*'):
                cur_title = line[2:].strip()
            elif line.startswith('#@'):
                s = line[2:].strip()
                cur_authors = [a.strip() for a in s.split(';') if a.strip()] if s else []
            elif line.startswith('#t'):
                s = line[2:].strip()
                try:
                    cur_year = int(s)
                except ValueError:
                    # Missing or non-numeric year.
                    cur_year = None
            elif line.startswith('#c'):
                cur_venue = line[2:].strip()
            elif line.startswith('#%'):
                ref_id = line[2:].strip()
                if ref_id:
                    citations_count[ref_id] += 1
                    cur_refs += 1
                    total_refs += 1
        # Store the last record of the file (no-op when the limit was hit).
        finalize_current()

    return {
        'author_pub_count': dict(author_pub_count),
        'venue_pub_count':  dict(venue_pub_count),
        'pub_ref_count':    pub_ref_count,
        'citations_count':  dict(citations_count),
        'title_by_pub':     title_by_pub,
        'year_by_pub':      year_by_pub,
        'venue_by_pub':     venue_by_pub,
        'total_publications': total_pubs,
        'total_references': total_refs,
    }


In [None]:
# Parse the dataset and report the wall-clock time together with the totals.
start = time.time()
data = parse_aminer(DATA_PATH, MAX_RECORDS)
print(f'Parsed in {time.time()-start:.1f}s; publications={data["total_publications"]:,}, refs={data["total_references"]:,}')

# Sanity check: show the first three papers in the order they were stored.
ids = list(data['title_by_pub'])[:3]
for pid in ids:
    n_refs = data['pub_ref_count'].get(pid, 0)
    n_cites = data['citations_count'].get(pid, 0)
    print('\nID:', pid)
    print(' title:', data['title_by_pub'].get(pid, ''))
    print(' year :', data['year_by_pub'].get(pid))
    print(' venue:', data['venue_by_pub'].get(pid, ''))
    print(' refs :', n_refs, ' | cites:', n_cites)


## 3.1 Basic Counts

In [None]:
# Headline counts taken from the parsed tables.
authors = data['author_pub_count']
venues = data['venue_pub_count']
summary_rows = [
    ('Distinct authors :', len(authors)),
    ('Distinct venues  :', len(venues)),
    ('Publications     :', data['total_publications']),
    ('References (edges):', data['total_references']),
    ('Citations (sum)  :', sum(data['citations_count'].values())),
]
for label, value in summary_rows:
    print(label, value)


## 3.2 Histograms & Stats (authors / venues)
We use log scale on **y** to handle heavy tails.

In [None]:
def basic_stats(arr):
    """Return ``(mean, std, Q1, median, Q3)`` of *arr* as plain Python floats.

    Args:
        arr: Any array-like of numbers; it is converted to a float array.

    Returns:
        A 5-tuple of ``float``. The standard deviation is the population
        value (``ddof=0``) and the quartiles are the 25th, 50th and 75th
        percentiles. For empty input every entry is NaN.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return np.nan, np.nan, np.nan, np.nan, np.nan
    mean = float(np.mean(values))
    std = float(np.std(values, ddof=0))
    # np.percentile yields NumPy scalars; convert them so the whole tuple is
    # uniformly typed and prints the same way as mean and std.
    q1, med, q3 = (float(q) for q in np.percentile(values, [25, 50, 75]))
    return mean, std, q1, med, q3

# Publications per author: summary statistics plus a log-y histogram.
a_counts = np.array(list(authors.values()), dtype=float)
print('Authors stats (mean, std, Q1, median, Q3):', basic_stats(a_counts))
plt.figure()
plt.hist(a_counts, bins=50)
plt.yscale('log')
plt.xlabel('pubs/author')
plt.ylabel('freq (log)')
plt.title('Publications per Author')
plt.show()

# Publications per venue: same treatment, plus the single largest venue.
v_counts = np.array(list(venues.values()), dtype=float)
print('Venues stats  (mean, std, Q1, median, Q3):', basic_stats(v_counts))
if venues:
    top_v = max(venues, key=venues.get)
    top_n = venues[top_v]
    print('Top venue by publications:', top_v, top_n)
plt.figure()
plt.hist(v_counts, bins=50)
plt.yscale('log')
plt.xlabel('pubs/venue')
plt.ylabel('freq (log)')
plt.title('Publications per Venue')
plt.show()


## 3.3 References, Citations, Impact

In [None]:
# Unpack the parsed tables used in this section.
titles = data['title_by_pub']
years = data['year_by_pub']
venues_by = data['venue_by_pub']
refs_map = data['pub_ref_count']
cites_map = data['citations_count']
all_ids = list(titles)

# One value per publication; papers that are never referenced count as 0 citations.
refs = np.array(list(refs_map.values()), dtype=float)
cites = np.array([cites_map.get(pid, 0) for pid in all_ids], dtype=float)

plt.figure()
plt.hist(refs, bins=50)
plt.yscale('log')
plt.xlabel('refs/paper')
plt.ylabel('freq (log)')
plt.title('References per Publication')
plt.show()

plt.figure()
plt.hist(cites, bins=50)
plt.yscale('log')
plt.xlabel('cites/paper')
plt.ylabel('freq (log)')
plt.title('Citations per Publication')
plt.show()

# Papers with the most outgoing references and the most incoming citations.
if refs_map:
    max_ref_id = max(refs_map, key=refs_map.get)
else:
    max_ref_id = None
if all_ids:
    max_cit_id = max(all_ids, key=lambda pid: cites_map.get(pid, 0))
else:
    max_cit_id = None
print('Most references ->', max_ref_id, refs_map.get(max_ref_id,0), '|', titles.get(max_ref_id,''))
print('Most citations  ->', max_cit_id, cites_map.get(max_cit_id,0), '|', titles.get(max_cit_id,''))

# Venue impact (all venues): total citations of a venue's papers divided by
# its number of publications.
from collections import defaultdict
venue_cite_sum = defaultdict(int)
for pid in all_ids:
    venue_name = venues_by.get(pid, '')
    if not venue_name:
        continue
    venue_cite_sum[venue_name] += cites_map.get(pid, 0)
impact = {v: (venue_cite_sum.get(v, 0) / n) for v, n in venues.items() if n > 0}
imp_vals = np.array(list(impact.values()), dtype=float)
plt.figure()
plt.hist(imp_vals, bins=50)
plt.yscale('log')
plt.xlabel('impact')
plt.ylabel('freq (log)')
plt.title('Venue Impact (all)')
plt.show()
if impact:
    best_v = max(impact, key=impact.get)
    best_if = impact[best_v]
    print('Highest impact venue (all):', best_v, f'{best_if:.2f}')

# Impact with >=10 pubs: the same ratios, restricted to venues with at least
# ten publications.
imp10 = {v: impact[v] for v, n in venues.items() if n >= 10}
imp10_vals = np.array(list(imp10.values()), dtype=float)
plt.figure()
plt.hist(imp10_vals, bins=50)
plt.yscale('log')
plt.xlabel('impact (>=10 pubs)')
plt.ylabel('freq (log)')
plt.title('Venue Impact (>=10 pubs)')
plt.show()
if imp10:
    best_v10 = max(imp10, key=imp10.get)
    best_if10 = imp10[best_v10]
    print('Highest impact venue (>=10 pubs):', best_v10, f'{best_if10:.2f}')
    # Per-paper citation counts inside the winning venue, in file order.
    counts = []
    for pid in all_ids:
        if venues_by.get(pid, '') == best_v10:
            counts.append(cites_map.get(pid, 0))
    if counts:
        print('Mean vs median citations in that venue:', float(np.mean(counts)), float(np.median(counts)))
        print('First 50 citation counts:', counts[:50])


## 3.3(e) Time Trends
Average references and citations per publication by year.

In [None]:
# Group paper ids by publication year; papers without an integer year are left out.
year_to_ids = defaultdict(list)
for pid, yr in years.items():
    if not isinstance(yr, int):
        continue
    year_to_ids[yr].append(pid)

# Per-year averages, in chronological order.
yrs = sorted(year_to_ids)
avg_refs = []
avg_cites = []
for y in yrs:
    ids = year_to_ids[y]
    if ids:
        avg_refs.append(float(np.mean([refs_map.get(pid, 0) for pid in ids])))
        avg_cites.append(float(np.mean([cites_map.get(pid, 0) for pid in ids])))
    else:
        avg_refs.append(0.0)
        avg_cites.append(0.0)

plt.figure()
plt.plot(yrs, avg_refs, marker='o')
plt.xlabel('Year')
plt.ylabel('Avg refs')
plt.title('Avg References per Pub by Year')
plt.grid(True)
plt.show()

plt.figure()
plt.plot(yrs, avg_cites, marker='o')
plt.xlabel('Year')
plt.ylabel('Avg cites')
plt.title('Avg Citations per Pub by Year')
plt.grid(True)
plt.show()


### Write your explanations in markdown cells:
- 3.1(b): Venue name variants split counts (normalize names or map aliases).
- 3.2(b): Mean vs median differences due to skew/heavy tails.
- 3.3(c–d): Small venues inflate impact; filtering (>=10 pubs) stabilizes. Compare mean vs median.
- 3.3(e): Citations accumulate over time, so recent papers have had less time to be cited; a paper's reference list is fixed at publication time.
