
# CENG 442 — Azerbaijani Text Preprocessing & Word Embeddings (Domain-Aware)

This notebook builds a complete, reproducible pipeline for the assignment:

**Outputs**
- Five 2‑column Excel files: `cleaned_text`, `sentiment_value` (0.0 / 0.5 / 1.0)
- One combined, domain‑tagged corpus: `corpus_all.txt` (one sentence per line)
- Two embedding models trained on the combined corpus: `embeddings/word2vec.model`, `embeddings/fasttext.model`
- Simple comparison of Word2Vec vs FastText (coverage, synonym/antonym similarity, nearest neighbors)


In [20]:

# -*- coding: utf-8 -*-
import re, html, unicodedata
from pathlib import Path
import pandas as pd

# Where to read/write
DATA_DIR = Path('Excels')
OUT_DIR = Path('outputs')
EMB_DIR = Path('embeddings')
EMB_DIR.mkdir(exist_ok=True)

print('Working directory:', Path.cwd())


Working directory: c:\Users\merte\OneDrive\Masaüstü\CENG442 nlp\nlp-hw1-main


## Normalization utilities (Azerbaijani-aware)

In [21]:

# Azerbaijani-aware lowercase
def lower_az(s: str) -> str:
    if not isinstance(s, str): return ""
    s = unicodedata.normalize("NFC", s)
    s = s.replace("I", "ı").replace("İ", "i")
    s = s.lower().replace("i","i")
    return s

HTML_TAG_RE = re.compile(r"<[^>]+>")
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
EMAIL_RE = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", re.IGNORECASE)
PHONE_RE = re.compile(r"\+?\d[\d\-\s\(\)]{6,}\d")
USER_RE = re.compile(r"@\w+")
MULTI_PUNCT = re.compile(r"([!?.,;:])\1{1,}")
MULTI_SPACE = re.compile(r"\s+")
REPEAT_CHARS = re.compile(r"(.)\1{2,}", flags=re.UNICODE)

TOKEN_RE = re.compile(
    r"[A-Za-zƏəĞğIıİiÖöÜüÇçŞşXxQq]+(?:'[A-Za-zƏəĞğIıİiÖöÜüÇçŞşXxQq]+)?"
    r"|<NUM>|URL|EMAIL|PHONE|USER|EMO_(?:POS|NEG)"
)

# Tiny emoji map
EMO_MAP = {"🙂":"EMO_POS","😀":"EMO_POS","😍":"EMO_POS","😊":"EMO_POS","👍":"EMO_POS",
           "☹":"EMO_NEG","🙁":"EMO_NEG","😠":"EMO_NEG","😡":"EMO_NEG","👎":"EMO_NEG"}

# Light slang / deasciify map
SLANG_MAP = {"slm":"salam","tmm":"tamam","sagol":"sağol","cox":"çox","yaxsi":"yaxşı"}
NEGATORS  = {"yox","deyil","heç","qətiyyən","yoxdur"}

def normalize_text_az(s: str, numbers_to_token=True, keep_sentence_punct=False) -> str:
    if not isinstance(s, str): return ""
    # emoji map first
    for emo, tag in EMO_MAP.items():
        s = s.replace(emo, f" {tag} ")
    s = html.unescape(s)
    s = HTML_TAG_RE.sub(" ", s)
    s = URL_RE.sub(" URL ", s)
    s = EMAIL_RE.sub(" EMAIL ", s)
    s = PHONE_RE.sub(" PHONE ", s)
    # Keep text, split camelCase
    s = re.sub(r"#([A-Za-z0-9_]+)", lambda m: " " + re.sub('([a-z])([A-Z])', r'\1 \2', m.group(1)) + " ",s)
    s = USER_RE.sub(" USER ", s)
    s = lower_az(s)
    s = MULTI_PUNCT.sub(r"\1", s)
    if numbers_to_token:
        s = re.sub(r"\d+", " <NUM> ", s)
    if keep_sentence_punct:
        s = re.sub(r"[^\w\s<>'əğıöşüçƏĞIİÖŞÜÇxqXQ.!?]", " ", s)
    else:
        s = re.sub(r"[^\w\s<>'əğıöşüçƏĞIİÖŞÜÇxqXQ]", " ", s)
    s = MULTI_SPACE.sub(" ", s).strip()

    toks = TOKEN_RE.findall(s)
    norm = []
    mark_neg = 0
    for t in toks:
        t = REPEAT_CHARS.sub(r"\1\1", t) # limit repeated chars to 2
        t = SLANG_MAP.get(t, t) # slang map
        if t in NEGATORS:
            norm.append(t); mark_neg = 3; continue
        if mark_neg > 0 and t not in {"URL","EMAIL","PHONE","USER"}:
            norm.append(t + "_NEG"); mark_neg -= 1
        else:
            norm.append(t)
    norm = [t for t in norm if not (len(t) == 1 and t not in {"o","e"})]
    return " ".join(norm).strip()


## Domain awareness (news / social / reviews / general)

In [22]:

NEWS_HINTS = re.compile(r"\b(apa|trend|azertac|reuters|bloomberg|dha|aa)\b", re.I)
SOCIAL_HINTS = re.compile(r"\b(rt)\b|@|#|(?:😂|😍|😊|👍|👎|😡|🙂)")
REV_HINTS = re.compile(r"\b(azn|manat|qiymət|aldım|ulduz|çox yaxşı|çox pis)\b", re.I)

PRICE_RE = re.compile(r"\b\d+\s*(azn|manat)\b", re.I)
STARS_RE = re.compile(r"\b([1-5])\s*ulduz\b", re.I)
POS_RATE = re.compile(r"\bçox yaxşı\b")
NEG_RATE = re.compile(r"\bçox pis\b")

def detect_domain(text: str) -> str:
    s = (text or "").lower()
    if NEWS_HINTS.search(s): return "news"
    if SOCIAL_HINTS.search(s): return "social"
    if REV_HINTS.search(s): return "reviews"
    return "general"

def domain_specific_normalize(cleaned: str, domain: str) -> str:
    if domain == "reviews":
        s = PRICE_RE.sub(" <PRICE> ", cleaned)
        s = STARS_RE.sub(lambda m: f" <STARS_{m.group(1)}> ", s)
        s = POS_RATE.sub(" <RATING_POS> ", s)
        s = NEG_RATE.sub(" <RATING_NEG> ", s)
        return " ".join(s.split())
    return cleaned

def add_domain_tag(line: str, domain: str) -> str:
    return f"dom{domain} " + line


## Processing functions (two-column Excel creation)

In [23]:

def map_sentiment_value(v, scheme: str):
    if scheme == "binary":
        try:
            return 1.0 if int(v) == 1 else 0.0
        except Exception:
            return None
    s = str(v).strip().lower()
    if s in {"pos", "positive", "1", "müsbət", "good", "pozitiv"}: return 1.0
    if s in {"neu", "neutral", "2", "neytral"}: return 0.5
    if s in {"neg", "negative", "0", "mənfi", "bad", "neqativ"}: return 0.0
    return None


def process_file(in_path, text_col, label_col, scheme, out_two_col_path, remove_stopwords=False):
    df = pd.read_excel(in_path)
    for c in ["Unnamed: 0", "index"]:
        if c in df.columns:
            df = df.drop(columns=[c])
    assert text_col in df.columns and label_col in df.columns, f"Missing columns in {in_path}"

    # cleaning
    df = df.dropna(subset=[text_col])
    df = df[df[text_col].astype(str).str.strip().str.len() > 0]
    df = df.drop_duplicates(subset=[text_col])

    df["cleaned_text"] = df[text_col].astype(str).apply(lambda s: normalize_text_az(s))
    df["__domain__"]   = df[text_col].astype(str).apply(detect_domain)
    df["cleaned_text"] = df.apply(lambda r:
                                  domain_specific_normalize(r["cleaned_text"], r["__domain__"]), axis=1)

    df = df.dropna(subset=["cleaned_text"])
    df = df[df["cleaned_text"].astype(str).str.strip() != ""]

    if remove_stopwords:
        sw = set(["və","ilə","amma","ancaq","lakin","ya","həm","ki","bu","bir","o","biz","siz","mən","sən",
                  "orada","burada","bütün","hər","artıq","çox","az","ən","də","da","üçün"])
        for keep in ["deyil","yox","heç","qətiyyən","yoxdur"]:
            sw.discard(keep)
        df["cleaned_text"] = df["cleaned_text"].apply(
            lambda s: " ".join([t for t in s.split() if t not in sw])
        )

    # sentiment mapping
    df["sentiment_value"] = df[label_col].apply(lambda v: map_sentiment_value(v, scheme))
    df = df.dropna(subset=["sentiment_value"])
    df["sentiment_value"] = df["sentiment_value"].astype(float)

    out_df = df[["cleaned_text", "sentiment_value"]].reset_index(drop=True)
    Path(out_two_col_path).parent.mkdir(parents=True, exist_ok=True)
    out_df.to_excel(out_two_col_path, index=False)
    print(f"Saved: {out_two_col_path} (rows={len(out_df)})")



## Configure your dataset files here

In [24]:

CFG = [
    ("labeled-sentiment.xlsx", "text", "sentiment", "tri"),
    ("test__1_.xlsx", "text", "label", "binary"),
    ("train__3_.xlsx", "text", "label", "binary"),
    ("train-00000-of-00001.xlsx", "text", "labels", "tri"),
    ("merged_dataset_CSV__1_.xlsx", "text", "labels", "binary"),
]

# Run the processor to generate the required 2-column Excel outputs.
for fname, tcol, lcol, scheme in CFG:
    in_path = DATA_DIR / fname
    out = OUT_DIR / f"{Path(fname).stem}_output.xlsx"
    if not in_path.exists():
        print(f"[WARN] File not found: {in_path} — skip if not used in your repo")
        continue
    process_file(in_path, tcol, lcol, scheme, out, remove_stopwords=False)

print()

# Report on cleaning stats
for fname, tcol, lcol, scheme in CFG:
    p = DATA_DIR / fname
    if not p.exists():
        print(f"Missing: {p}")
        continue
    df = pd.read_excel(p)
    orig = len(df)
    non_null = df[df[tcol].notna() & df[tcol].astype(str).str.strip().str.len()>0]
    after_dedup = non_null.drop_duplicates(subset=[tcol])
    removed_empty = orig - len(non_null)
    removed_dups  = len(non_null) - len(after_dedup)
    print(f"{p.name}: orig={orig}, removed_empty={removed_empty}, removed_dups={removed_dups}, final={len(after_dedup)}")


Saved: outputs\labeled-sentiment_output.xlsx (rows=2955)
Saved: outputs\test__1__output.xlsx (rows=4193)
Saved: outputs\train__3__output.xlsx (rows=19536)
Saved: outputs\train-00000-of-00001_output.xlsx (rows=41705)
Saved: outputs\merged_dataset_CSV__1__output.xlsx (rows=55662)

labeled-sentiment.xlsx: orig=2958, removed_empty=1450, removed_dups=2, final=1506
test__1_.xlsx: orig=4200, removed_empty=1991, removed_dups=1, final=2208
train__3_.xlsx: orig=19600, removed_empty=9669, removed_dups=25, final=9906
train-00000-of-00001.xlsx: orig=42000, removed_empty=20736, removed_dups=135, final=21129
merged_dataset_CSV__1_.xlsx: orig=55673, removed_empty=27448, removed_dups=10, final=28215


## Build `corpus_all.txt` (domain-tagged, one sentence per line)

In [25]:

def build_corpus_txt(input_files, text_cols, out_txt="corpus_all.txt"):
    lines = []
    for (f, text_col) in zip(input_files, text_cols):
        path = DATA_DIR / f
        if not path.exists():
            print(f"[WARN] Missing for corpus: {path}")
            continue
        df = pd.read_excel(path)
        if text_col not in df.columns:
            print(f"[WARN] Column '{text_col}' missing in {path.name}")
            continue
        for raw in df[text_col].dropna().astype(str):
            dom = detect_domain(raw)
            s = normalize_text_az(raw, keep_sentence_punct=True)
            parts = re.split(r"[.!?]+", s)
            for p in parts:
                p = p.strip()
                if not p: continue
                p = re.sub(r"[^\w\səğıöşüçƏĞIİÖŞÜÇxqXQ]", " ", p) # remove punctuation
                p = " ".join(p.split()).lower()
                if p:
                    lines.append(f"dom{dom} " + p)
    with open(out_txt, "w", encoding="utf-8") as w:
        for ln in lines:
            w.write(ln + "\n")
    print(f"Wrote {out_txt} with {len(lines)} lines")

build_corpus_txt([c[0] for c in CFG], [c[1] for c in CFG], out_txt="corpus_all.txt")


Wrote corpus_all.txt with 124353 lines


## Train Word2Vec & FastText embeddings

In [26]:

from gensim.models import Word2Vec, FastText

two_col_files = [
    f"{Path(c[0]).stem}_output.xlsx" for c in CFG
]

sentences = []
for f in two_col_files:
    p = OUT_DIR / f
    if not p.exists():
        print(f"[WARN] 2-col file missing for embeddings: {p}")
        continue
    df = pd.read_excel(p, usecols=["cleaned_text"])
    sentences.extend(df["cleaned_text"].astype(str).str.split().tolist())

print(f"Training on {len(sentences)} sentences...")
EMB_DIR.mkdir(exist_ok=True)

w2v = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=3, sg=1, negative=10, epochs=10)
w2v.save(str(EMB_DIR / "word2vec.model"))
ft = FastText(sentences=sentences, vector_size=300, window=5, min_count=3, sg=1, min_n=3, max_n=6, epochs=10)
ft.save(str(EMB_DIR / "fasttext.model"))
print("Saved models to", EMB_DIR)


Training on 124051 sentences...
Saved models to embeddings


## Compare Word2Vec vs FastText

In [27]:

from gensim.models import Word2Vec, FastText

w2v = Word2Vec.load(str(EMB_DIR / "word2vec.model"))
ft = FastText.load(str(EMB_DIR / "fasttext.model"))

seed_words = ["yaxşı","pis","çox","bahalı","ucuz","mükəmməl","dəhşət","<PRICE>","<RATING_POS>"]
syn_pairs = [("yaxşı","əla"), ("bahalı","qiymətli"), ("ucuz","sərfəli")]
ant_pairs = [("yaxşı","pis"), ("bahalı","ucuz")]

def lexical_coverage(model, tokens):
    vocab = model.wv.key_to_index
    return sum(1 for t in tokens if t in vocab) / max(1,len(tokens))

def read_tokens(f):
    df = pd.read_excel(f, usecols=["cleaned_text"])
    return [t for row in df["cleaned_text"].astype(str) for t in row.split()]

print("Lexical coverage (per dataset)")
for f in two_col_files:
    p = OUT_DIR / f
    if not p.exists():
        print(f"[WARN] Skipping coverage for missing: {p}")
        continue
    toks = read_tokens(p)
    cov_w2v = lexical_coverage(w2v, toks)
    cov_ftv = lexical_coverage(ft, toks)  # FT vocab-based; FT can still embed OOV via subwords at inference
    print(f"{p.name}: W2V={cov_w2v:.3f}, FT(vocab)={cov_ftv:.3f}")

def pair_sim(model, pairs):
    vals = []
    for a,b in pairs:
        try:
            vals.append(model.wv.similarity(a,b))
        except KeyError:
            pass
    return sum(vals)/len(vals) if vals else float('nan')

syn_w2v = pair_sim(w2v, syn_pairs)
syn_ft = pair_sim(ft,  syn_pairs)
ant_w2v = pair_sim(w2v, ant_pairs)
ant_ft = pair_sim(ft,  ant_pairs)

print("\nSimilarity (higher better for synonyms; lower better for antonyms)")
print(f"Synonyms: W2V={syn_w2v:.3f}, FT={syn_ft:.3f}")
print(f"Antonyms: W2V={ant_w2v:.3f}, FT={ant_ft:.3f}")
print(f"Separation (Syn - Ant): W2V={(syn_w2v - ant_w2v):.3f}, FT={(syn_ft - ant_ft):.3f}")

def neighbors(model, word, k=5):
    try: return [w for w,_ in model.wv.most_similar(word, topn=k)]
    except KeyError: return []

print("\nNearest neighbors (qualitative)")
for w in seed_words:
    print(f"W2V NN for '{w}':", neighbors(w2v, w))
    print(f"FT NN for '{w}':", neighbors(ft,  w))

Lexical coverage (per dataset)
labeled-sentiment_output.xlsx: W2V=0.932, FT(vocab)=0.932
test__1__output.xlsx: W2V=0.987, FT(vocab)=0.987
train__3__output.xlsx: W2V=0.990, FT(vocab)=0.990
train-00000-of-00001_output.xlsx: W2V=0.943, FT(vocab)=0.943
merged_dataset_CSV__1__output.xlsx: W2V=0.949, FT(vocab)=0.949

Similarity (higher better for synonyms; lower better for antonyms)
Synonyms: W2V=0.369, FT=0.444
Antonyms: W2V=0.347, FT=0.424
Separation (Syn - Ant): W2V=0.022, FT=0.020

Nearest neighbors (qualitative)
W2V NN for 'yaxşı': ['<RATING_POS>', 'yaxshi', 'iyi', 'yaxwi', 'awsome']
FT NN for 'yaxşı': ['yaxşıı', 'yaxşıkı', 'yaxşıca', 'yaxş', 'yaxşıya']
W2V NN for 'pis': ['vərdişlərə', '<RATING_NEG>', 'lire', 'günd', 'sürükliyir']
FT NN for 'pis': ['piis', 'pi', 'pisə', 'pixlr', 'pisleşdi']
W2V NN for 'çox': ['çoox', 'çöx', 'bəyənilsin', 'əladir', 'işçilərindən']
FT NN for 'çox': ['çoxçox', 'çoxx', 'çoxh', 'ço', 'çoh']
W2V NN for 'bahalı': ['portretlerinə', 'radiusda', 'metallarla', 'ya

## Mini-challenges utilities

In [35]:
def count_deasciify(tokens):
    changed = 0
    out = []
    for t in tokens:
        tt = SLANG_MAP.get(t, t)
        if tt != t: changed += 1
        out.append(tt)
    return changed, out

def simple_hashtag_split(s: str):
    return re.sub(r"#([A-Za-z0-9_]+)", lambda m: re.sub('([a-z])([A-Z])', r'\1 \2', m.group(1)), s)

print(count_deasciify('cox yaxsi slm tmm sagol cox'.split()))
print()
print(simple_hashtag_split('#GoodMorningEveryone'))
print()
print(normalize_text_az("Coxxx gozel kino idi 👍👍 #SuperFilm http://example.com"))

(6, ['çox', 'yaxşı', 'salam', 'tamam', 'sağol', 'çox'])

Good Morning Everyone

coxx gozel kino idi emo pos emo pos super film url
