# 01 — AfriSenti Twitter Sentiment: Initial EDA
**Goals:** language distribution, text length analysis, and label imbalance. Saves plots/tables for your report.

## 1. Setup & Imports

In [None]:

import os, sys, json, re
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns

# Plot defaults shared by every figure produced in this notebook.
plt.rcParams.update({
    'figure.dpi': 140,
    'axes.grid': True,
    'font.family': 'DejaVu Sans',
})

# When the working directory is the notebooks/ folder, the project root is
# its parent; otherwise the working directory itself is used as the root.
ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA_DIR = ROOT / 'data'
FIG_DIR = ROOT / 'outputs' / 'figures'
TAB_DIR = ROOT / 'outputs' / 'tables'

# Create the output folders up front so later savefig/to_csv calls find them.
FIG_DIR.mkdir(parents=True, exist_ok=True)
TAB_DIR.mkdir(parents=True, exist_ok=True)

print('ROOT:', ROOT)
print('Python:', sys.version)


## 2. Load Data
Choose one option: (A) local CSVs in `../data/`, or (B) Hugging Face datasets (commented out by default).

In [None]:

# Option A: Local CSVs
def smart_read_csv(path):
    """Read a delimited text file into a DataFrame, tolerating common quirks.

    The file is parsed as comma-separated UTF-8 first. If the comma parse
    fails with a tokenizing error, it is retried as tab-separated. If the
    bytes are not valid UTF-8, the same comma-then-tab attempt is repeated
    with Latin-1 and a warning is emitted, because Latin-1 accepts any byte
    sequence and may therefore mis-decode text written in another encoding.

    Args:
        path: Location of the file (string or path-like).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Any error ``pandas.read_csv`` raises that is neither a decode error
        on the UTF-8 attempt nor a tokenizing error on a comma parse
        (for example a missing or empty file).
    """
    import warnings  # local import so this cell does not depend on the imports cell

    def _read(encoding):
        try:
            return pd.read_csv(path, encoding=encoding)
        except pd.errors.ParserError:
            # A ragged comma parse usually means the file is tab-separated.
            return pd.read_csv(path, sep='\t', encoding=encoding)

    try:
        return _read('utf-8')
    except UnicodeDecodeError:
        # Retrying with 'utf-8-sig' cannot help: a BOM is itself valid UTF-8,
        # so any file that fails as 'utf-8' fails as 'utf-8-sig' too.
        warnings.warn(
            f'{path}: not valid UTF-8; decoded as Latin-1, '
            'non-ASCII text may be garbled',
            stacklevel=2,
        )
        return _read('latin-1')

# File-name tokens that identify a language when a CSV has no language column.
_LANG_ALIASES = {
    'sw': 'sw', 'swa': 'sw', 'swahili': 'sw', 'kiswahili': 'sw',
    'am': 'am', 'amh': 'am', 'amharic': 'am',
    'en': 'en', 'eng': 'en', 'english': 'en',
}


def _lang_from_stem(stem):
    """Infer a language code from a file stem, or return 'unk'.

    Only whole tokens (split on non-alphanumeric characters) are matched.
    Plain substring matching mislabels files: 'sample_en' contains 'am' and
    'afrisenti_ha' contains 'en'. When several codes appear, the priority
    is 'sw', then 'am', then 'en'.
    """
    tokens = re.split(r'[^a-z0-9]+', stem.lower())
    found = {_LANG_ALIASES[t] for t in tokens if t in _LANG_ALIASES}
    return next((code for code in ('sw', 'am', 'en') if code in found), 'unk')


def _standardize_frame(raw, source):
    """Project one raw CSV frame onto the common text/label/lang columns.

    Column names are matched case-insensitively. The text column falls back
    to the first column of the file; a missing label column yields all-NA
    labels; a missing language column is inferred from the file name.
    """
    by_lower = {str(c).strip().lower(): c for c in raw.columns}
    text_col = by_lower.get('text') or by_lower.get('tweet') or list(raw.columns)[0]
    label_col = by_lower.get('label') or by_lower.get('sentiment')
    lang_col = by_lower.get('lang') or by_lower.get('language')

    out = pd.DataFrame()
    out['text'] = raw[text_col].astype(str)
    out['label'] = raw[label_col] if label_col else pd.NA
    out['lang'] = raw[lang_col].astype(str) if lang_col else _lang_from_stem(source.stem)
    return out[['text', 'label', 'lang']]


# Sort the paths: glob order depends on the filesystem, and row order decides
# which duplicate survives keep='first' and which rows a seeded sample picks.
csvs = sorted(DATA_DIR.glob('*.csv'))
if not csvs:
    print('No CSVs found in data/. Please add dataset files.')
    df = pd.DataFrame(columns=['text','label','lang'])
else:
    frames = [_standardize_frame(smart_read_csv(p), p) for p in csvs]
    df = pd.concat(frames, ignore_index=True)
print('Loaded shape:', df.shape)
df.head()


In [None]:

# Option B: HuggingFace (edit dataset/config IDs and uncomment)
# from datasets import load_dataset
# ds_sw = load_dataset('Davlan/afrisent-semeval-2023', 'sw', split='train')
# ds_am = load_dataset('Davlan/afrisent-semeval-2023', 'am', split='train')
# ds_en = load_dataset('Davlan/afrisent-semeval-2023', 'en', split='train')
# df = pd.concat([ds_sw.to_pandas().assign(lang='sw'),
#                 ds_am.to_pandas().assign(lang='am'),
#                 ds_en.to_pandas().assign(lang='en')], ignore_index=True)
# label_map = {0:'negative', 1:'neutral', 2:'positive'}
# if 'label' in df.columns and pd.api.types.is_numeric_dtype(df['label']):
#     df['label'] = df['label'].map(label_map)
# print('Loaded from HF:', df.shape)


## 3. Clean & Sanity Checks

In [None]:

# Record diagnostics on the data exactly as loaded, before any cleaning.
orig_shape = df.shape
missing = df.isna().sum()
dups = df.duplicated(subset=['text','label','lang']).sum()
print('Original:', orig_shape, '\nMissing:\n', missing, '\nDuplicates:', dups)

df = df.drop_duplicates(subset=['text','label','lang'], keep='first')
# Collapse whitespace runs to one space. The pattern is a raw string because
# '\s' in a normal string literal is an invalid escape sequence.
df['text'] = df['text'].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
if 'label' in df.columns:
    # Stringify real labels only. A blanket astype(str) would turn missing
    # labels into literal strings, so later notna() checks could never detect
    # an unlabeled dataset and the placeholder would be plotted as a class.
    df['label'] = df['label'].map(lambda v: v if pd.isna(v) else str(v))
df['lang'] = df['lang'].astype(str)

print('After dropping dupes:', df.shape)
# Cap the sample size: sample(5) raises on frames with fewer than five rows,
# including the empty frame created when no CSVs are found.
df.sample(min(5, len(df)), random_state=42)


## 4. Language Distribution (Task 2a)

In [None]:

# Rows per language, ordered by language code so the bar order is stable.
lang_counts = df['lang'].value_counts().sort_index()
print(lang_counts)

# Bar chart of the counts, saved to the figures folder and shown inline.
fig, ax = plt.subplots(figsize=(6,4))
sns.barplot(x=lang_counts.index, y=lang_counts.values, ax=ax)
ax.set_title('Language Distribution')
ax.set_xlabel('Language')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig(FIG_DIR/'lang_distribution.png', dpi=150)
plt.show()

# Persist the same counts as a table.
lang_counts.to_csv(TAB_DIR/'lang_counts.csv')


## 5. Text Length Analysis (Task 2b)

In [None]:

# Two length measures per tweet: characters, and whitespace-separated tokens.
df['char_len'] = df['text'].str.len()
df['tok_len'] = df['text'].str.split().apply(len)

# Overall histograms of both measures, side by side.
fig, (ax_char, ax_tok) = plt.subplots(1, 2, figsize=(10,4))
ax_char.hist(df['char_len'], bins=50)
ax_char.set_title('Char length')
ax_tok.hist(df['tok_len'], bins=50)
ax_tok.set_title('Token length')
fig.tight_layout()
fig.savefig(FIG_DIR/'length_hist_overall.png', dpi=150)
plt.show()

# Token-length spread per language.
fig, ax = plt.subplots(figsize=(7,4))
sns.boxplot(data=df, x='lang', y='tok_len', ax=ax)
ax.set_title('Token length by language')
ax.set_xlabel('Language')
ax.set_ylabel('Tokens per tweet')
fig.tight_layout()
fig.savefig(FIG_DIR/'length_box_by_lang.png', dpi=150)
plt.show()

# Descriptive statistics of both measures per language, saved as a table.
length_stats = df.groupby('lang')[['char_len','tok_len']].describe()
length_stats.to_csv(TAB_DIR/'length_stats_by_lang.csv')


## 6. Label Imbalance (Task 2c)

In [None]:

# Label plots only make sense when a label column exists and is not all-NA.
has_labels = 'label' in df.columns and df['label'].notna().any()

if not has_labels:
    print("No usable 'label' column; skipping label plots.")
else:
    # Overall class balance.
    label_counts = df['label'].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(6,4))
    sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)
    ax.set_title('Overall Label Distribution')
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
    fig.tight_layout()
    fig.savefig(FIG_DIR/'label_distribution_overall.png', dpi=150)
    plt.show()

    # Class balance per language: saved as a table and as a stacked bar chart.
    ctab = pd.crosstab(df['lang'], df['label']).sort_index()
    ctab.to_csv(TAB_DIR/'label_by_lang_crosstab.csv')
    ax = ctab.plot(kind='bar', stacked=True, figsize=(8,4))
    ax.set_title('Label Distribution by Language')
    ax.set_xlabel('Language')
    ax.set_ylabel('Count')
    ax.figure.tight_layout()
    ax.figure.savefig(FIG_DIR/'label_by_lang_stacked.png', dpi=150)
    plt.show()


## 7. Save Summary

In [None]:

# Collect the headline numbers of this EDA run into one JSON-serialisable dict.
summary = {
    'original_shape': tuple(orig_shape),
    # Use the original column count: by this point df also carries the derived
    # char_len/tok_len columns, so df.shape would not be comparable with
    # original_shape and would suggest that de-duplication added columns.
    'post_drop_duplicates': (len(df), orig_shape[1]),
    'missing': {col: int(n) for col, n in missing.items()},
    'duplicates_removed': int(dups),
    # Cast counts to plain int so json.dump never meets a NumPy scalar.
    'lang_counts': {k: int(v) for k, v in df['lang'].value_counts().items()},
}
if 'label' in df.columns and df['label'].notna().any():
    summary['label_counts'] = {k: int(v) for k, v in df['label'].value_counts().items()}

# ensure_ascii=False keeps any non-ASCII keys readable in the saved file.
with open(TAB_DIR/'summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)
print(json.dumps(summary, ensure_ascii=False, indent=2))
