In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /Users/natthawit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natthawit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/natthawit/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Tokenized sentences should not contain stopwords

In [6]:
# English stopword list from NLTK, held in a set so the membership test in tokenize() is cheap.
stop_words = set(stopwords.words('english'))

# Define a function to get a random Wikipedia link

In [1]:
def get_random_url(timeout=10):
    """Return the URL of a randomly chosen English Wikipedia page.

    Requests ``Special:Random`` and reports the URL of the response that
    ``requests`` ends up with after following the redirect.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        The final URL of the response, as a string.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    random_url = "https://en.wikipedia.org/wiki/Special:Random"
    response = requests.get(random_url, timeout=timeout)
    return response.url
    

# Generate 200 random links

In [8]:
# Each get_random_url() call performs one HTTP request, so this cell issues 200 requests in sequence.
urls = [get_random_url() for i in range(200)]

# Scrape the contents from all those links

In [9]:
def scrape_wikipedia(url, timeout=10):
    """Download a Wikipedia page and extract its title and text fragments.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, texts)`` tuple, where ``title`` is the text of the
        element with id ``firstHeading`` and ``texts`` is a list holding the
        text of every ``p``/``h2``/``h3``/``h4``/``ul``/``ol`` element found
        inside the ``mw-parser-output`` containers. Returns ``None`` when the
        page cannot be retrieved or has no ``firstHeading`` element.
    """
    # Network errors follow the same "print and return None" convention as a
    # non-200 status, so callers only have one failure mode to handle.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title; guard against pages that lack the heading element
    # instead of failing with AttributeError on ``None.text``.
    heading = soup.find(id='firstHeading')
    if heading is None:
        print(f"Failed to parse the page. No title found at: {url}")
        return None
    title = heading.text

    # Extract the content of the page.
    # NOTE(review): find_all searches recursively, so a list nested inside
    # another list contributes its text twice; kept as in the original.
    texts = [
        element.text
        for container in soup.find_all('div', class_='mw-parser-output')
        for element in container.find_all(['p', 'h2', 'h3', 'h4', 'ul', 'ol'])
    ]
    return title, texts

In [10]:
all_cleaned_texts = []

for url in urls:
    # scrape_wikipedia returns None when a page cannot be retrieved; skip
    # those pages instead of crashing on tuple unpacking.
    scraped = scrape_wikipedia(url)
    if scraped is None:
        continue
    title, paragraphs = scraped
    all_cleaned_texts.append({'title': title, 'url': url, 'content': paragraphs})

    

In [11]:
# Inspect the first scraped record (keys: 'title', 'url', 'content').
all_cleaned_texts[0]

{'title': 'The Quick and the Dead (1963 film)',
 'url': 'https://en.wikipedia.org/wiki/The_Quick_and_the_Dead_(1963_film)',
 'content': ['1963\xa0(1963)',
  'The Quick and the Dead is a 1963 war film directed by Robert Totten, set in Nazi-occupied Europe during World War II.[1][2][3][4]\n',
  'Plot',
  'A group of American soldiers and Italian partisans during World War II join forces in northern Italy against the Germans.\n',
  'Cast',
  "Victor French as Milo Riley\nMajel Barrett as Teresa\nLouis Massad as Donatelli\nSandy Donigan as Maria\nJames Almanzar as Giorgio\nLarry Mann as Parker\nJon Cedar as Lt. Rogers\nJoe Folino as American Soldier\nGerald Ervin as  American Soldier\nJoseph Locastro as Giovanni\nWilliam Kirschner as Dr. Romano\nFrank D'Agostino as Priest\nStuart Nisbet as Nazi Officer\nTed French as Old Man\nJack Crawford as American Officer\nRobert Harker as German Officer",
  'References',
  'Citations',
  '\n^ The Quick and the Dead. AllMovie. 1963. {{cite book}}: |web

# Clean unnecessary text (newlines, reference numbers, extra spaces, etc.)

In [12]:
def clean_text(text):
    """Strip markup noise from scraped text and normalise its whitespace.

    Processing order matters: citation markers are removed before bare
    digits, and digits before parentheses, so that ``(1963)`` becomes ``()``
    and is then dropped as an empty parenthetical.

    Args:
        text: Raw text to clean.

    Returns:
        The text with newlines, ``[n]`` citation markers, digits,
        parenthesised spans, possessive ``'s`` suffixes and ASCII punctuation
        removed, runs of whitespace collapsed to a single space, and leading
        and trailing whitespace stripped.
    """
    text = text.replace('\n', ' ')
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    # Only drop a possessive suffix: the word boundary keeps an opening quote
    # followed by a word starting with "s" intact ('short' must not become
    # hort). The typographic apostrophe (U+2019) is accepted as well.
    text = re.sub(r"['\u2019]s\b", '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # The removals above leave gaps behind; collapse every whitespace run
    # (including non-breaking spaces) into a single space.
    return re.sub(r'\s+', ' ', text).strip()



for obj in all_cleaned_texts:
    # Read 'content' by key rather than unpacking obj.values(): once
    # 'joined_text' has been added the dict has four values, so positional
    # unpacking would raise ValueError when this cell is run a second time.
    big_string = " ".join(obj['content']).strip()
    obj['joined_text'] = clean_text(big_string)

# Tokenize the text using nltk.tokenize (a standard library for tokenizing English words)

In [None]:
def tokenize(text):
    """Split text into lower-cased word tokens, dropping uninformative ones.

    A token is kept only if its lower-cased form is not in ``stop_words``, it
    consists solely of ASCII letters, and it is longer than two characters.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the surviving tokens, lower-cased, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        if not re.match(r'^[a-zA-Z]+$', token):
            continue
        if len(token) <= 2:
            continue
        kept.append(lowered)
    return kept

texts = [obj['joined_text'] for obj in all_cleaned_texts]
tokenized_texts = [" | ".join(tokenize(text)) for text in texts]
titles = [obj['title'] for obj in all_cleaned_texts]

# Side-by-side preview of each cleaned text, its tokens and the page title.
# Columns are labelled so the table is readable without counting positions.
pd.DataFrame({'text': texts, 'tokens': tokenized_texts, 'title': titles})


# Fit a TfidfVectorizer on the cleaned texts (using the tokenizer above) to see the TF-IDF weight of each term in each document

In [14]:
# token_pattern=None: the custom tokenizer replaces the default regex-based
# tokenization, and leaving the default pattern set makes scikit-learn warn
# that the parameter is unused.
vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
tfidf_matrix = vectorizer.fit_transform(texts)
print(tfidf_matrix.shape)  # (number of documents, vocabulary size)

# Dense view with one column per vocabulary term, for inspection only.
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
df.head()




(200, 27985)


Unnamed: 0,aab,aadaab,aadam,aadmi,aadt,aakhar,aalejivabhai,aali,aalok,aamaaloarkaan,...,zulu,zum,zumar,zumberge,zundapp,zur,zurr,zuyevo,zvezda,zyrin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Define the number of topics for the model to find (this is an arbitrary choice; the more topics, the less general each one is)

In [15]:
# Number of topics, i.e. the number of SVD components extracted below.
TOPIC_NUMBER = 5
# NOTE(review): not referenced by any cell visible here — confirm whether it is still needed.
PREDICT_THRESHOLD = 0.001

# Use Singular Value Decomposition to calculate the relation between all these documents

In [16]:
# random_state pins the randomized solver so the topic loadings (and the
# document-to-topic assignments derived from them) are reproducible on re-run.
# NOTE(review): n_iter=5000 is kept from the original; scikit-learn's default
# is 5, so this is likely far more iterations than needed — consider lowering.
svd_model = TruncatedSVD(
    n_components=TOPIC_NUMBER,
    algorithm='randomized',
    n_iter=5000,
    random_state=42,
)
lsa = svd_model.fit_transform(tfidf_matrix)

In [17]:
# One row per document, one column per topic.
lsa.shape

(200, 5)

# DataFrame showing how strongly each document is associated with each topic

In [23]:
# Wrap the LSA output in a labelled frame (one column per topic) and attach
# the cleaned text of each document alongside its topic scores.
documents_to_topics = pd.DataFrame(
    data=lsa,
    columns=[f"topic_{topic_id}" for topic_id in range(TOPIC_NUMBER)],
).assign(text=texts)
documents_to_topics.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,text
0,0.13491,-0.073099,0.005402,-0.054377,-0.021944,The Quick and the Dead is a war film direct...
1,0.128489,-0.075861,-0.0709,0.242133,0.035059,During the – Portuguese football season FC Por...
2,0.101753,-0.087392,0.415259,0.027033,-0.039876,Neoparaphytoseius sooretamus is a species of m...
3,0.134862,-0.018495,-0.020146,-0.130587,0.018018,Sengundram known in English as Red Hills is a ...
4,0.069815,-0.050815,0.230792,0.000797,-0.022529,Eleutherodactylus brevipalmatus Eleutherodacty...


In [24]:
# Topic loadings from the fitted SVD: one row per topic, one column per vocabulary term.
components = svd_model.components_
print(components.shape)

(5, 27985)


# Assign topics to each word (the topic with the highest score is the word's main topic; lower-scoring ones can be thought of as subtopics)

In [25]:
# Label the loadings: rows are topics, columns are vocabulary terms.
encoding_matrix = pd.DataFrame(
    data=components,
    index=[f"topic_{topic_id}" for topic_id in range(TOPIC_NUMBER)],
    columns=feature_names,
)
encoding_matrix.head()

Unnamed: 0,aab,aadaab,aadam,aadmi,aadt,aakhar,aalejivabhai,aali,aalok,aamaaloarkaan,...,zulu,zum,zumar,zumberge,zundapp,zur,zurr,zuyevo,zvezda,zyrin
topic_0,0.00059,0.000546,0.000452,0.000273,0.001113,0.000273,0.000273,0.000273,0.000986,0.000273,...,0.000945,0.000162,0.000463,0.000383,0.001856,0.000383,0.000452,0.00035,0.000902,0.000452
topic_1,-8e-05,-0.000451,-0.0004,-0.000225,0.000268,-0.000225,-0.000225,-0.000225,-0.000956,-0.000225,...,-0.000764,-0.000122,-0.000274,-0.000241,-0.001797,-0.000241,-0.0004,-6.2e-05,-0.001203,-0.0004
topic_2,0.00011,-0.000337,-0.000384,-0.000168,-0.000259,-0.000168,-0.000168,-0.000168,-0.000733,-0.000168,...,0.000415,-9.8e-05,-0.000304,0.000304,-0.001107,0.000304,-0.000384,0.000759,-0.001097,-0.000384
topic_3,-0.000978,-0.000776,-0.000161,-0.000388,-0.003191,-0.000388,-0.000388,-0.000388,-0.000996,-0.000388,...,-0.000533,-0.000181,-0.00081,-0.000432,-0.00039,-0.000432,-0.000161,-0.000389,0.003104,-0.000161
topic_4,0.004717,-9.3e-05,-0.000324,-4.7e-05,0.000143,-4.7e-05,-4.7e-05,-4.7e-05,-0.000784,-4.7e-05,...,-7.3e-05,-8.4e-05,7.7e-05,-0.000292,-0.000223,-0.000292,-0.000324,0.001184,0.000397,-0.000324


In [21]:
# Show the five terms with the highest loading for every topic.
for topic_id in range(TOPIC_NUMBER):
    topic_loadings = encoding_matrix.iloc[topic_id]
    strongest_terms = topic_loadings.sort_values(ascending=False).head(5)
    print(f"Top words for topic {topic_id}")
    print(strongest_terms)
    print("\n")

Top words for topic 0
retrieved    0.229227
historic     0.224548
national     0.221534
register     0.177863
places       0.139340
Name: topic_0, dtype: float64


Top words for topic 1
historic    0.444433
register    0.349182
national    0.287509
places      0.273903
property    0.109765
Name: topic_1, dtype: float64


Top words for topic 2
species     0.261748
gbif        0.230339
wikidata    0.211266
col         0.177706
tree        0.163949
Name: topic_2, dtype: float64


Top words for topic 3
league      0.238313
uefa        0.226159
football    0.202289
team        0.180464
season      0.151612
Name: topic_3, dtype: float64


Top words for topic 4
kola        0.663947
mahalleh    0.233338
bala        0.190631
pain        0.187329
sar         0.136821
Name: topic_4, dtype: float64




# Finally, assign each original document to its dominant topic

In [30]:
# Assign the dominant topic to each document.
# The topic score columns are selected by name rather than by dtype, so the
# derived 'topic_score' column never feeds back into the computation when this
# cell is run again.
topic_columns = [f"topic_{i}" for i in range(TOPIC_NUMBER)]
topic_scores = documents_to_topics[topic_columns]

# The top words depend only on the topic, so rank each topic's terms once
# instead of re-sorting the whole vocabulary for every document.
top_words_by_topic = {
    topic: list(encoding_matrix.loc[topic].sort_values(ascending=False).index[:5])
    for topic in topic_columns
}

documents_to_topics['topic'] = topic_scores.idxmax(axis=1)
documents_to_topics['topic_score'] = topic_scores.max(axis=1)
documents_to_topics['topic_text'] = [
    list(top_words_by_topic[topic]) for topic in documents_to_topics['topic']
]
documents_to_topics['WIKI_TITLE'] = titles
documents_to_topics.head()

Index(['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'text', 'topic',
       'topic_score', 'topic_text', 'WIKI_TITLE'],
      dtype='object')


Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,text,topic,topic_score,topic_text,WIKI_TITLE
0,0.13491,-0.073099,0.005402,-0.054377,-0.021944,The Quick and the Dead is a war film direct...,topic_0,0.13491,"[retrieved, historic, national, register, places]",The Quick and the Dead (1963 film)
1,0.128489,-0.075861,-0.0709,0.242133,0.035059,During the – Portuguese football season FC Por...,topic_3,0.242133,"[league, uefa, football, team, season]",2008–09 FC Porto season
2,0.101753,-0.087392,0.415259,0.027033,-0.039876,Neoparaphytoseius sooretamus is a species of m...,topic_2,0.415259,"[species, gbif, wikidata, col, tree]",Neoparaphytoseius sooretamus
3,0.134862,-0.018495,-0.020146,-0.130587,0.018018,Sengundram known in English as Red Hills is a ...,topic_0,0.134862,"[retrieved, historic, national, register, places]",Sengundram
4,0.069815,-0.050815,0.230792,0.000797,-0.022529,Eleutherodactylus brevipalmatus Eleutherodacty...,topic_2,0.230792,"[species, gbif, wikidata, col, tree]",Eleutherodactylus cuneatus


# For example, for the document at index 0, the topic words are: retrieved, historic, national, register, places