## Import Libraries and Load Data

In [1]:
import os

# Directory that holds the region text files read by this notebook.
# The default is the original author's machine-specific location; set the
# PROJECT_DIR environment variable to run the notebook on another machine.
PROJECT_DIR = os.environ.get(
    'PROJECT_DIR',
    r'C:\Users\nadda\Desktop\KU\01204314-65-Statistics-for-Computer-Engineering-Applications\final_project\part2.1',
)

# Only switch directories when the target actually exists; otherwise keep the
# current working directory instead of crashing with FileNotFoundError.
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
os.getcwd()

import nltk

# Fetch the NLTK resources used by the tokenizer, stop-word filter,
# POS tagger and lemmatizer in this notebook.
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer tables loaded by word_tokenize in NLTK >= 3.9
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')  # optional, but helps the lemmatizer work better

import re
import numpy as np
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from gensim.models import Word2Vec


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def read_text_file(path, encoding='utf-8'):
    """Return the entire contents of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    reading raises an exception.

    Parameters
    ----------
    path : str
        Location of the file to read.
    encoding : str, optional
        Text encoding used to decode the file (default ``'utf-8'``).

    Returns
    -------
    str
        The decoded file contents.
    """
    with open(path, "r", encoding=encoding) as handle:
        return handle.read()


# Raw text of each region, one string per file.
africa = read_text_file("africa.txt")
asia = read_text_file("asia.txt")
europe = read_text_file("europe.txt")
latin_america = read_text_file("latin_america.txt")

## Text Processing

In [3]:
# Tokenize: split each region's raw text into word-level tokens.
africa_token, asia_token, europe_token, latin_america_token = map(
    word_tokenize,
    (africa, asia, europe, latin_america),
)

In [4]:
# Remove punctuation
# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION = re.compile(r'[^\w\s]')


def strip_punctuation(tokens):
    """Delete punctuation characters from each token and drop emptied tokens.

    Each token is substituted once (the pattern is compiled a single time),
    and tokens that become the empty string are discarded.

    NOTE(review): '-' is removed as well, so hyphenated words are fused
    (the saved outputs show e.g. 'genderresponsive' and 'parttime');
    confirm that this is intended.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        Cleaned, non-empty tokens in their original order.
    """
    stripped = (_PUNCTUATION.sub('', token) for token in tokens)
    return [token for token in stripped if token]


africa_token = strip_punctuation(africa_token)
asia_token = strip_punctuation(asia_token)
europe_token = strip_punctuation(europe_token)
latin_america_token = strip_punctuation(latin_america_token)

In [5]:
# Remove stop words: membership is tested on the lower-cased token, but the
# token itself is kept with its original casing.
stop_words = set(stopwords.words('english'))

africa_token, asia_token, europe_token, latin_america_token = (
    [tok for tok in region_tokens_ if tok.lower() not in stop_words]
    for region_tokens_ in (africa_token, asia_token, europe_token, latin_america_token)
)

In [6]:
# POS tagging: pair every remaining token with its part-of-speech tag.
africa_pos, asia_pos, europe_pos, latin_america_pos = [
    pos_tag(cleaned_tokens)
    for cleaned_tokens in (africa_token, asia_token, europe_token, latin_america_token)
]

In [7]:
# Lemmatization
# Single WordNetLemmatizer instance, reused by lemmatize_and_lowercase below.
lemmatizer = WordNetLemmatizer()

def convert_pos(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    The decision is made on the first letter of the tag:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) defaults to noun.
    """
    first_letter_to_wordnet = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag from raising.
    return first_letter_to_wordnet.get(nltk_tag[:1], wordnet.NOUN)
    
def lemmatize_and_lowercase(word_pos_list):
    """Lemmatize (word, POS tag) pairs, lower-casing every non-noun lemma.

    Each tag is converted with ``convert_pos`` and the word is lemmatized
    with the module-level ``lemmatizer``. Lemmas whose converted POS is not
    NOUN are lower-cased; noun lemmas (including tags that default to NOUN)
    keep the casing returned by the lemmatizer.

    NOTE(review): because nouns are not lower-cased, capitalised tokens such
    as 'Data' or 'Given' survive in the saved outputs — confirm this is the
    intended way to preserve proper nouns.

    Parameters
    ----------
    word_pos_list : iterable of (str, str)
        Pairs of token and NLTK POS tag.

    Returns
    -------
    list of str
        One lemma per input pair, in the same order.
    """
    def _normalise(token, tag):
        wordnet_pos = convert_pos(tag)
        base_form = lemmatizer.lemmatize(token, wordnet_pos)
        return base_form if wordnet_pos == wordnet.NOUN else base_form.lower()

    return [_normalise(token, tag) for token, tag in word_pos_list]

# Lemmatize the POS-tagged tokens of every region.
africa_lemmatized, asia_lemmatized, europe_lemmatized, latin_america_lemmatized = (
    lemmatize_and_lowercase(tagged_tokens)
    for tagged_tokens in (africa_pos, asia_pos, europe_pos, latin_america_pos)
)

## Text Analysis

In [8]:
from gensim.models import Word2Vec

region_names = ['africa', 'asia', 'europe', 'latin_america']
region_tokens = [africa_lemmatized, asia_lemmatized, europe_lemmatized, latin_america_lemmatized]

# gensim's optimized training code truncates any single text to 10,000 words,
# so a region passed as one giant "sentence" would silently lose everything
# past that point. Longer token lists are therefore split into chunks.
MAX_WORDS_PER_TEXT = 10_000

# One skip-gram Word2Vec model per region, keyed by region name.
region_models = {}

for name, tokens in zip(region_names, region_tokens):
    # Word2Vec expects a list of token lists. A region with at most
    # MAX_WORDS_PER_TEXT tokens still yields exactly one list, as before.
    sentences = [
        tokens[start:start + MAX_WORDS_PER_TEXT]
        for start in range(0, len(tokens), MAX_WORDS_PER_TEXT)
    ]
    model = Word2Vec(
        sentences=sentences,
        vector_size=100,
        window=5,
        min_count=1,    # keep every word, even those that occur once
        sg=1,           # skip-gram
        epochs=100,
        seed=42,
        # A fixed seed is only reproducible with a single worker thread
        # (across interpreter launches PYTHONHASHSEED must be fixed too).
        workers=1,
    )
    region_models[name] = model

In [9]:
keywords = ['gender', 'equality']

# related_words[region][keyword] -> the 30 most similar words in that
# region's model, or an empty list when the keyword is not in its vocabulary.
related_words = {name: {} for name in region_models}

for name, model in region_models.items():
    for key in keywords:
        if key not in model.wv:
            related_words[name][key] = []
            continue
        related = model.wv.most_similar(key, topn=30)
        related_words[name][key] = [word for word, _score in related]

In [19]:
# Show every keyword's related terms, region by region.
for region in region_names:
    header = f"🔹 {region.upper()} — Related terms"
    term_lines = [f"  {key}: {related_words[region][key]}" for key in keywords]
    # Header and one line per keyword, each on its own line.
    print(header, *term_lines, sep="\n")
    print()

🔹 AFRICA — Related terms
  gender: ['equality', 'towards', 'continue', 'economic', 'progress', 'Reduce', 'sexsegregation', 'measure', 'gaps', 'empower', 'labour', 'take', 'market', 'elimination', 'discrimination', 'Data', 'make', 'adopt', 'priority', 'conduct', 'member', 'lens', 'encourage', 'several', 'suggest', 'hidden', 'association', 'Given', 'code', 'law']
  equality: ['continue', 'gender', 'towards', 'progress', 'economic', 'gaps', 'empower', 'Reduce', 'Data', 'sexsegregation', 'take', 'labour', 'market', 'measure', 'several', 'national', 'elimination', 'discrimination', 'priority', 'suggest', 'adopt', 'hidden', 'law', 'institute', 'pose', 'report', 'bias', 'member', 'Given', 'make']

🔹 ASIA — Related terms
  gender: ['equality', 'commitment', 'across', 'awareness', 'expansion', 'mainstreaming', 'sector', 'entity', '19', 'genderresponsive', 'budgeting', 'machineries', 'c', 'governance', 'achieve', 'concerning', 'institutional', 'adoption', 'term', 'promote', 'violence', 'girls20'

In [25]:
from collections import Counter

for key in keywords:
    print(f"==== 🔍 Comparing related words for: '{key}' ====")
    # Pool this keyword's related words from every region into one flat list
    # (a single pass, instead of repeated list concatenation via sum()).
    all_words = [word for region in region_names for word in related_words[region][key]]
    word_counts = Counter(all_words)
    # A count above 1 means the word was listed more than once in the pool,
    # i.e. it appears for more than one region when each region's list holds
    # distinct words.
    print("✅ Common across regions:", [w for w, c in word_counts.items() if c > 1])
    print()

==== 🔍 Comparing related words for: 'gender' ====
✅ Common across regions: ['equality', 'towards', 'market', 'encourage', 'Given', 'law', 'achieve']

==== 🔍 Comparing related words for: 'equality' ====
✅ Common across regions: ['gender', 'towards', 'national', 'law', 'Given', 'make', 'promote', 'achieve']



In [30]:
from collections import defaultdict

for key in keywords:
    print(f"==== 🔍 Unique related words for: '{key}' ====")

    # Build the map: word -> list of regions whose related-word list has it.
    word_to_regions = defaultdict(list)
    region_word_pairs = (
        (region_, word_)
        for region_ in region_names
        for word_ in related_words[region_][key]
    )
    for region, word in region_word_pairs:
        word_to_regions[word].append(region)

    # Keep only the words that were recorded for exactly this one region.
    for region in region_names:
        unique_words = []
        for word in related_words[region][key]:
            owners = word_to_regions[word]
            if len(owners) == 1 and owners[0] == region:
                unique_words.append(word)
        heading = f"\n🟦 {region.upper()} ({len(unique_words)} unique terms)"
        print(heading)
        print(unique_words)
    print("________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________\n")

==== 🔍 Unique related words for: 'gender' ====

🟦 AFRICA (24 unique terms)
['continue', 'economic', 'progress', 'Reduce', 'sexsegregation', 'measure', 'gaps', 'empower', 'labour', 'take', 'elimination', 'discrimination', 'Data', 'make', 'adopt', 'priority', 'conduct', 'member', 'lens', 'several', 'suggest', 'hidden', 'association', 'code']

🟦 ASIA (27 unique terms)
['commitment', 'across', 'awareness', 'expansion', 'mainstreaming', 'sector', 'entity', '19', 'genderresponsive', 'budgeting', 'machineries', 'c', 'governance', 'concerning', 'institutional', 'adoption', 'term', 'promote', 'violence', 'girls20', 'framework', 'government', 'strengthen', 'equalityrelated', 'enforcement', 'mandate', 'legislation']

🟦 EUROPE (26 unique terms)
['gap', 'still', 'show', 'level', 'rate', 'basis', 'may', 'full', 'persist', 'equivalent', 'parttime', 'men', 'woman', 'Incentives', 'Full', 'employment', 'even', 'underestimate', 'equivalence', 'increase', 'fact', 'reach', 'without', 'comparison', 'resort'