# Wikipedia Music Genres NLP Project

## Overview

This notebook applies natural language processing (NLP) techniques to random articles collected from [Wikipedia](https://en.wikipedia.org/).

## Imports

In [1]:
import numpy as np
import os
import pandas as pd
import re
import requests
import time

## Data Collection

In [2]:
# MediaWiki Action API endpoint of English Wikipedia; all requests in this notebook are sent here
url = "https://en.wikipedia.org/w/api.php"
# CSV file the collected articles are appended to and later loaded from
csv_path = "data/wiki_articles.csv"

In [3]:
def get_random_titles(n: int, max_retries: int = 5) -> np.ndarray:
    """Fetch the titles of random Wikipedia articles.

    Titles are requested from the API's ``list=random`` query in batches of
    at most 50 until ``n`` titles have been gathered. No de-duplication is
    performed on the collected titles.

    Args:
        n: Number of titles to collect. Values <= 0 yield an empty array.
        max_retries: Number of consecutive failed requests after which the
            function gives up instead of retrying forever.

    Returns:
        A NumPy array of the collected titles.

    Raises:
        RuntimeError: If ``max_retries`` requests fail in a row.
    """
    titles = []
    consecutive_failures = 0
    while len(titles) < n:
        params = {
            "action": "query",
            "list": "random",
            "rnnamespace": 0,  # Only articles
            "rnlimit": min(50, n - len(titles)),
            "format": "json",
        }
        try:
            # Without a timeout a stalled connection would block the loop forever
            response = requests.get(url, params=params, timeout=30)
            # Fail on HTTP error statuses instead of trying to parse an error page
            response.raise_for_status()
            data = response.json()
            batch = [item["title"] for item in data["query"]["random"]]
            if not batch:
                # An empty batch never advances the loop, so count it as a failure
                raise ValueError("API returned no titles")
        except Exception as e:
            consecutive_failures += 1
            print("Error:", e)
            if consecutive_failures >= max_retries:
                raise RuntimeError(
                    f"Giving up after {consecutive_failures} consecutive failed requests"
                ) from e
            time.sleep(1)
            continue
        consecutive_failures = 0
        titles.extend(batch)
        time.sleep(0.5)
    return np.array(titles)

In [4]:
def get_article_text(title: str) -> str:
    """Download the plain-text extract of a single Wikipedia article.

    Args:
        title: Title of the article to fetch.

    Returns:
        The article's plain-text extract, or an empty string when the request
        fails, the server answers with an HTTP error status, or the response
        contains no extract for the page.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "titles": title
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        response = requests.get(url, params=params, timeout=30)
        # Treat HTTP error statuses as failures rather than parsing their body
        response.raise_for_status()
        pages = response.json()["query"]["pages"]
        # Only one title is requested, so the first page entry is the one we want
        page = next(iter(pages.values()))
        # A missing or null extract is normalized to "" so callers always get a str
        return page.get("extract") or ""
    except Exception as e:
        # Deliberately best-effort: any failure is reported and mapped to ""
        print(f"Error fetching {title}: {e}")
        return ""

In [5]:
# Headings that mark the start of the part of an article that should be dropped
_UNWANTED_SECTION_TITLES = (
    "See also",
    "References",
    "Further reading",
    "External links",
    "Notes",
    "Sources",
    "Bibliography",
    "Footnotes",
)

# Compiled once at definition time. `={2,}` consumes the whole run of "=" on
# both sides, so a deeper heading such as "=== Notes ===" is matched from its
# first "=" and no stray "=" is left at the end of the kept text.
_UNWANTED_SECTION_RE = re.compile(
    r"={2,}\s*(?:"
    + "|".join(re.escape(section) for section in _UNWANTED_SECTION_TITLES)
    + r")\s*={2,}",
    re.IGNORECASE,
)


def remove_unwanted_sections(text: str) -> str:
    """Cut an article off at its first unwanted section heading.

    The text is searched case-insensitively for the earliest heading whose
    title is one of ``_UNWANTED_SECTION_TITLES`` (any heading level of two or
    more ``=``). Everything from that heading onward is discarded.

    Args:
        text: Plain-text article using ``== Heading ==`` style section markers.

    Returns:
        The text before the first unwanted heading, or the whole text when no
        such heading exists; in both cases with surrounding whitespace removed.
    """
    match = _UNWANTED_SECTION_RE.search(text)
    if match:
        return text[:match.start()].strip()
    return text.strip()

In [6]:
def full_clean(text):
    """Clean an article's raw text for storage.

    The text is first passed through ``remove_unwanted_sections``; every run
    of two or more newlines in the result is then squeezed into a single
    newline, and leading/trailing whitespace is trimmed.
    """
    trimmed = remove_unwanted_sections(text)
    squeezed = re.compile(r"\n{2,}").sub("\n", trimmed)
    return squeezed.strip()

In [7]:
def export_list_to_csv(data: list[dict], csv_path: str) -> None:
    """Append records to a CSV file, writing the header row only once.

    Args:
        data: Records to write, one dict per row.
        csv_path: Destination CSV file. Missing parent directories are created.

    An empty ``data`` list is a no-op. Writing an empty frame to a new file
    would create it without column names, and since later appends omit the
    header, the resulting CSV would never get one.
    """
    if len(data) == 0:
        return

    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # The header is needed when the file is new, or exists but is still empty
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    pd.DataFrame(data).to_csv(csv_path, index=False, header=needs_header, mode="a")

In [None]:
# Fetch the titles of 10,000 random Wikipedia articles, then split them into
# 100 roughly equal chunks so the articles can be downloaded chunk by chunk
titles = get_random_titles(n=10000)
chunks = np.array_split(titles, indices_or_sections=100)

In [None]:
# Download each chunk's articles and append them to the CSV as soon as the
# chunk is finished, so progress is saved incrementally
for chunk_no, chunk in enumerate(chunks, start=1):
    print(f"Chunk {chunk_no}/{len(chunks)}")
    wiki_list = []
    for title in chunk:
        text = get_article_text(title)
        # Filter out failed fetches and very short pages
        is_long_enough = bool(text) and len(text) > 300
        if is_long_enough:
            wiki_list.append(dict(title=title, text=full_clean(text)))
        # Sleep for 500 ms after every request to avoid rate-limiting
        time.sleep(0.5)
    export_list_to_csv(wiki_list, csv_path)
    print(f"Added {len(wiki_list)}/{len(chunk)} articles to CSV file.")

## Data Preprocessing

In [9]:
# Load the collected articles from the CSV file at csv_path
df = pd.read_csv(csv_path)
# Preview the first rows as a quick sanity check
df.head()

Unnamed: 0,title,text
0,Biff Schlitzer,"Victor Joseph ""Biff"" Schlitzer (December 4, 18..."
1,Prabhash Kumar,"Prabhash Kumar is an Indian politician, farmer..."
2,San Carlos Formation,The San Carlos Formation is a geological forma...
3,2023 in Ohio,The following is a list of events of the year ...
4,2009 Iowa special elections,The 2009 Iowa state special elections were hel...


In [10]:
# Summarize the columns: dtypes, non-null counts, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8595 entries, 0 to 8594
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   8595 non-null   object
 1   text    8595 non-null   object
dtypes: object(2)
memory usage: 134.4+ KB


### Tokenize

### Remove stopwords

### Lemmatize or apply stemming

## Feature Extraction

In [None]:
# TF-IDF, BERT embeddings
# Word embeddings like Word2Vec, GloVe

In [None]:
# Or use a Hugging Face (HF) feature-extraction model that does all those steps

## Unsupervised Learning

### K-Means

### DBSCAN

### Latent Dirichlet Allocation (LDA)?

## Evaluation and Interpretation

### Visualize Clusters

In [None]:
#  t-SNE, UMAP, PCA

### Analyse Clusters

## End