# Wikipedia Music Genres NLP Project

## Overview

This notebook performs an NLP on random articles collected from [Wikipedia](https://en.wikipedia.org/). It explored different feature extraction and clustering methods.

## Imports

In [33]:
import gensim.downloader as model_api
import numpy as np
import os
import pandas as pd
import re
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
import time

ImportError: cannot import name 'triu' from 'scipy.linalg' (/opt/anaconda3/lib/python3.12/site-packages/scipy/linalg/__init__.py)

## Data Collection

In [None]:
url = 'https://en.wikipedia.org/w/api.php'
csv_path = 'data/wiki_articles.csv'

In [None]:
def get_random_titles(n:int) -> np.array:
    titles = []
    while len(titles) < n:
        try:
            params = {
                'action': 'query',
                'list': 'random',
                'rnnamespace': 0,  # Only articles
                'rnlimit': min(50, n - len(titles)),
                'format': 'json'
            }
            response = requests.get(url, params=params)
            data = response.json()
            batch = [item['title'] for item in data['query']['random']]
            titles.extend(batch)
            time.sleep(0.5)
        except Exception as e:
            print('Error:', e)
            time.sleep(1)
    return np.array(titles)

In [None]:
def get_article_text(title:str) -> str:
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'explaintext': True,
        'titles': title
    }
    try:
        response = requests.get(url, params=params)
        pages = response.json()['query']['pages']
        page = next(iter(pages.values()))
        return page.get('extract', '')
    except Exception as e:
        print(f'Error fetching {title}: {e}')
        return ''

In [None]:
def remove_unwanted_sections(text):
    unwanted_sections = [
        r'==\s*See also\s*==',
        r'==\s*References\s*==',
        r'==\s*Further reading\s*==',
        r'==\s*External links\s*==',
        r'==\s*Notes\s*==',
        r'==\s*Sources\s*==',
        r'==\s*Bibliography\s*==',
        r'==\s*Footnotes\s*=='
    ]
    
    pattern = re.compile('|'.join(unwanted_sections), re.IGNORECASE)
    match = pattern.search(text)
    
    if match:
        return text[:match.start()].strip()
    else:
        return text.strip()

In [None]:
def full_clean(text):
    core = remove_unwanted_sections(text)
    core = re.sub(r'\n{2,}', '\n', core)
    return core.strip()

In [7]:
def export_list_to_csv(data:list[dict], csv_path:str) -> None:
	df = pd.DataFrame(data)
	if not os.path.isfile(csv_path):
		df.to_csv(csv_path, index=False)
	else:
		df.to_csv(csv_path, index=False, header=False, mode='a')

In [None]:
# Get 10000 random articles from Wikipedia
titles = get_random_titles(10000)
chunks = np.array_split(titles, 100)

In [None]:
# Export data in chunks
for i, chunk in enumerate(chunks):
	print(f'Chunk {i + 1}/{len(chunks)}')
	wiki_list = []
	for title in chunk:
		text = get_article_text(title)
		if text and len(text) > 300:  # Filter out very short pages
			wiki_list.append({
				'title': title,
				'text': full_clean(text)
			})
		time.sleep(0.5) # Sleep for 500 ms to avoid rate-limiting
	export_list_to_csv(wiki_list, csv_path)
	print(f'Added {len(wiki_list)}/{len(chunk)} articles to CSV file.')

## EDA

In [11]:
# Load data
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,title,text
0,Biff Schlitzer,"Victor Joseph ""Biff"" Schlitzer (December 4, 18..."
1,Prabhash Kumar,"Prabhash Kumar is an Indian politician, farmer..."
2,San Carlos Formation,The San Carlos Formation is a geological forma...
3,2023 in Ohio,The following is a list of events of the year ...
4,2009 Iowa special elections,The 2009 Iowa state special elections were hel...


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8595 entries, 0 to 8594
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   8595 non-null   object
 1   text    8595 non-null   object
dtypes: object(2)
memory usage: 134.4+ KB


In [23]:
# Get smallest test
index = df['text'].str.len().idxmin()
df.iloc[index]

title                     List of volcanoes in El Salvador
text     This is a list of active and extinct volcanoes...
Name: 5992, dtype: object

In [24]:
# Get biggest test
index = df['text'].str.len().idxmax()
df.iloc[index]

title                                  Divided differences
text     In mathematics, divided differences is an algo...
Name: 5321, dtype: object

## Data Preprocessing

In [17]:
# Only keep text from article
X = df['text']
X.head()

0    Victor Joseph "Biff" Schlitzer (December 4, 18...
1    Prabhash Kumar is an Indian politician, farmer...
2    The San Carlos Formation is a geological forma...
3    The following is a list of events of the year ...
4    The 2009 Iowa state special elections were hel...
Name: text, dtype: object

In [None]:
# Tokenize texts by removing punctuation and special characters
# and lowercasing everything
replaceDict = dict({
'{':" ", '}':" ", ',':"", '.':" ", '!':" ", '\\':" ", '/':" ", '$':" ", '%':" ",
'^':" ", '?':" ", '\'':" ", '"':" ", '(':" ", ')':" ", '*':" ", '+':" ", '-':" ",
'=':" ", ':':" ", ';':" ", ']':" ", '[':" ", '`':" ", '~':" ",
})

rep = dict((re.escape(k),v) for k, v in replaceDict.items())
pattern = re.compile('|'.join(rep.keys()))

def replacer(text):
    return rep[re.escape(text.group(0))]

words = X.str.replace(pattern, replacer, regex=True).str.lower().str.split()
words = pd.DataFrame(words.tolist())
words

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15404,15405,15406,15407,15408,15409,15410,15411,15412,15413
0,victor,joseph,biff,schlitzer,december,4,1884,–,january,4,...,,,,,,,,,,
1,prabhash,kumar,is,an,indian,politician,farmer,and,a,member,...,,,,,,,,,,
2,the,san,carlos,formation,is,a,geological,formation,in,west,...,,,,,,,,,,
3,the,following,is,a,list,of,events,of,the,year,...,,,,,,,,,,
4,the,2009,iowa,state,special,elections,were,held,throughout,2009,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8590,uroš,lajovic,is,a,slovenian,conductor,and,professor,he,has,...,,,,,,,,,,
8591,vegetable,chips,also,referred,to,as,veggie,chips,are,chips,...,,,,,,,,,,
8592,josé,de,aquino,pereira,april,22,1920,–,november,17,...,,,,,,,,,,
8593,november,2005,bangladesh,court,bombing,was,a,simultaneous,suicide,bombing,...,,,,,,,,,,


### Feature extraction

#### TF-IDF Vectorizer

In [31]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(X)
tfidf_matrix.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.02091569, 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [32]:
tfidf_matrix.shape

(8595, 163381)

#### GloVe Word Embedding

#### Sentence Embeddings from Hugging Face

In [None]:
# TF-IDF, BERT embeddings
# Word embeddings like Word2Vec, Glove

In [None]:
# Or use HF feature extraction model that does all those steps

## Unsupervised Learning

### K-Means

### DBSCAN

### Latent Dirichlet Allocation (LDA)?

## Evaluation and Interpretation

### Visualize Clusters

In [None]:
#  t-SNE, UMAP, PCA

### Analyse Clusters

## End