In [427]:
# imports
import numpy as np
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import rand_score as rand_score
from sklearn.metrics import adjusted_rand_score
import jarowinkler as jw
import kmedoids

# Getting data from parsing

In [311]:
base_url = 'https://time.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0'}

['https://time.com/6262272/biden-willow-alaska-oil-project/',
 'https://time.com/6261992/a-recent-move-by-the-white-house-gives-the-lgbtq-community-reason-to-worry/',
 'https://time.com/6262143/silicon-valley-bank-bailout-yellen/',
 'https://time.com/6262136/mike-pence-trump-jan-6/',
 'https://time.com/6261993/trump-grand-jury-new-york-possible-criminal-charges/',
 'https://time.com/6261717/donald-trump-testify-grand-jury-new-york/',
 'https://time.com/6261386/mental-health-congress-john-fetterman/',
 'https://time.com/6261434/biden-budget-plan-2024-campaign/',
 'https://time.com/6261284/mitch-mcconnell-hospitalized/',
 'https://time.com/6261173/oklahoma-rejects-marijuana-legalization-future/',
 'https://time.com/6261094/china-russia-tiktok-top-threats-to-us/',
 'https://time.com/6261171/rudy-giuliani-legal-liability-2020-election/',
 'https://time.com/6261102/ursula-von-der-leyen-us-europe-climate-change/',
 'https://time.com/6261045/uk-illegal-migration-bill/',
 'https://time.com/626

In [312]:
page = requests.get(base_url, headers=headers)
soup = BeautifulSoup(page.text, "html.parser")
categories_links = soup.find('section', class_='menu-section').findAll('a')

In [313]:
categories_headers = [categories_links[i].text.strip() for i in range(1, 6)]
categories_links = [categories_links[i]['href'] for i in range(1, 6)]
articles_names = []
articles = []
for link in categories_links:
    page = requests.get(link, headers=headers)
    soup = BeautifulSoup(page.text, "html.parser")
    news = soup.findAll('div', class_='taxonomy-tout')
    refs = [base_url + article.find('a')['href'] for article in news]

    article_headers = []
    for ref in refs:
        page = requests.get(ref, headers=headers)
        bs = BeautifulSoup(page.text, 'html.parser')
        body = bs.find('div', class_='article content body clearfix')
        unwanted = bs.findAll(True,
                              class_=['component inline-article-recirc', 'author-feedback-text', 'article-bottom'])
        [u.extract() for u in unwanted]
        articles.append(body.text)

        header = bs.find('h1', class_='headline heading-content margin-8-top')
        article_headers.append(header.text)

    articles_names.append(article_headers)

In [314]:
headers_array = np.array([np.array(art) for art in articles_names]).reshape(-1)
articles_array = np.array(articles)

# Text processing

In [315]:
def nospecial(text):
    import re
    text = re.sub("[^A-Z]+", " ", text)
    return text

In [472]:
articles_array_no_special = np.array(
    [np.array(sorted(set(articles_names := nospecial(art.upper()).split()), key=articles_names.index)) for art in
     articles_array])

  articles_array_no_special = np.array(


In [473]:
articles_array_joined = np.array([' '.join(art) for art in articles_array_no_special])

### Jaccard distance

In [474]:
def jaccard_distance(s1: str, s2: str) -> float:
    set1 = set(s1.split())
    set2 = set(s2.split())
    return 1 - float(len(set1.intersection(set2)) / len(set1.union(set2)))

In [475]:
jaccard_matrix = [[jaccard_distance(j, i) for j in articles_array_joined] for i in articles_array_joined]

# Clustering

In [476]:
km = kmedoids.KMedoids(5, method='fasterpam')
c = km.fit(jaccard_matrix)
labels_jaccard_pam = c.labels_
c.inertia_

202.2639096837628

In [477]:
cleaned_jaccard_matrix = [[i for i in arr if not i in ENGLISH_STOP_WORDS] for arr in articles_array_no_special]

In [478]:
articles_cleared_joined = np.array([' '.join(art) for art in cleaned_jaccard_matrix])

In [479]:
jaccard_matrix_cleaned = [[jaccard_distance(j, i) for j in articles_cleared_joined] for i in articles_cleared_joined]
jaccard_matrix_cleaned

[[0.0,
  0.9401408450704225,
  0.88996138996139,
  0.9320987654320988,
  0.8888888888888888,
  0.9041811846689896,
  0.8880407124681934,
  0.8903107861060329,
  0.8768472906403941,
  0.8553719008264463,
  0.8937784522003035,
  0.8853211009174312,
  0.8702594810379242,
  0.8803894297635605,
  0.9125188536953243,
  0.8957528957528957,
  0.865702479338843,
  0.9077102803738317,
  0.8932203389830509,
  0.9177570093457944,
  0.8972602739726028,
  0.9122055674518201,
  0.9006928406466512,
  0.9064124783362218,
  0.8670309653916212,
  0.8728971962616823,
  0.9165596919127086,
  0.8895582329317269,
  0.881159420289855,
  0.8849028400597907,
  0.9045020463847203,
  0.8769771528998243,
  0.9038112522686026,
  0.8761609907120743,
  0.8932874354561101,
  0.8902439024390244,
  0.86875,
  0.9044414535666218,
  0.8830128205128205,
  0.9083557951482479,
  0.9,
  0.9058823529411765,
  0.8915989159891599,
  0.9078125,
  0.9105882352941177,
  0.8781302170283807,
  0.9027027027027027,
  0.9054290718038529

In [480]:
km = kmedoids.KMedoids(5, method='fasterpam')
c = km.fit(jaccard_matrix_cleaned)
c.inertia_

202.2639096837628

In [481]:
labels_jaccard_cleared_pam = c.labels_

In [482]:
km = KMeans(n_clusters=5)
km.fit(jaccard_matrix_cleaned)
labels_jaccard_cleared_km = km.labels_

In [483]:
km = KMeans(n_clusters=5)
km.fit(jaccard_matrix)
labels_jaccard_km = km.labels_

### Jaro-Winkler distance

In [484]:
articles_array_sorted = [' '.join(np.sort(arr)) for arr in articles_array_no_special]
len(articles_array_sorted)

250

In [485]:
jaro_matrix = [[jw.jarowinkler_similarity(j, i) for j in articles_array_sorted] for i in articles_array_sorted]
jaro_matrix

[[1.0,
  0.5860587913326251,
  0.8609800360432887,
  0.6262075235613553,
  0.8315058477809075,
  0.8363650549336588,
  0.8419185891066275,
  0.8246015279137691,
  0.8659566015311672,
  0.8706803575764974,
  0.6916293789702155,
  0.6844602561219135,
  0.865225472548722,
  0.6573080182564802,
  0.6894979141321471,
  0.8610576705048163,
  0.8772879204840183,
  0.6287671224433273,
  0.8309861574033173,
  0.6056610320074229,
  0.8392039449499802,
  0.8743309883996296,
  0.8621079996502845,
  0.837948025024988,
  0.8503201838415342,
  0.850875640824772,
  0.6510491184204671,
  0.8663358242960032,
  0.6677725118483412,
  0.6731551358823956,
  0.668058631201288,
  0.8353018271821111,
  0.8520948791455568,
  0.6810052618392547,
  0.8310977007985618,
  0.8368723441872643,
  0.879611698483279,
  0.6498209729255722,
  0.8237877350964501,
  0.8223165189327468,
  0.8443457787086077,
  0.8060903461491682,
  0.6536444465049152,
  0.7923096626037802,
  0.862456416497381,
  0.8220919587347134,
  0.85135

In [486]:
km = kmedoids.KMedoids(5, method='fasterpam')
c = km.fit(jaro_matrix)
labels_jaro_pam = c.labels_

In [487]:
articles_cleaned_sorted = [' '.join(np.sort(arr)) for arr in cleaned_jaccard_matrix]
articles_cleaned_sorted

['A ABOUT ACCORDING ACRES ACTIVISTS ADMINISTRATION ADVANCED ADVISERS AFTER AGENCY ALASKA ALASKANS ALLOW ALLOWING ALSO AN AND ANNOUNCED ANNOUNCEMENT ANY APPEARED APPROVING ARCTIC AS AT BARRELS BE BEEN BEFORE BEING BIDEN BILLIONS BIPARTISAN BREAK BUREAU BUT CALLING CAMPAIGN CASE CHOICES CLEAN CLIMATE COMES COMPANY CONCERNS CONDEMNATION CONGRESSIONAL CONOCOPHILLIPS CONSEQUENTIAL CONSIDERED CONSIDERS CONSTRUCTION CONTROVERSIAL COULD CREATE DAY DEB DECISION DELEGATION DEMOCRATIC DEPARTMENT DESIGNATED DESTROY DEVELOPER DEVELOPMENT DID DIFFERENT DIRECT DOLLARS DRAW DRILL DRILLING DURING EARLY EMISSIONS ENERGY ENJOYS ENVIRONMENTAL ENVIRONMENTALISTS EXISTING EXPECTED FACE FAMILIAR FEBRUARY FEDERAL FEDERALLY FINAL FLIES FOR FORWARD FROM FURTHER GAS GENERATE GOVERNMENTS GREENHOUSE GREENLIGHTING GROUPS HAALAND HAD HAS HAVE HIS IDENTIFIED IMPACTS IN INCLUDE INCLUDING INDIGENOUS INDIRECT INITIALLY INTERIOR IS IT JOBS JOE KILL KNOWLEDGE LAND LANDS LAST LAWMAKERS LEASES LEGACY LIFE LIKELY LIMITS LITIG

In [488]:
jaro_matrix_cleaned = [[jw.jarowinkler_similarity(j, i) for j in articles_cleaned_sorted] for i in
                       articles_array_sorted]
jaro_matrix_cleaned

[[1.0,
  0.5860587913326251,
  0.8609800360432887,
  0.6262075235613553,
  0.8315058477809075,
  0.8363650549336588,
  0.8419185891066275,
  0.8246015279137691,
  0.8659566015311672,
  0.8706803575764974,
  0.6916293789702155,
  0.6844602561219135,
  0.865225472548722,
  0.6573080182564802,
  0.6894979141321471,
  0.8610576705048163,
  0.8772879204840183,
  0.6287671224433273,
  0.8309861574033173,
  0.6056610320074229,
  0.8392039449499802,
  0.8743309883996296,
  0.8621079996502845,
  0.837948025024988,
  0.8503201838415342,
  0.850875640824772,
  0.6510491184204671,
  0.8663358242960032,
  0.6677725118483412,
  0.6731551358823956,
  0.668058631201288,
  0.8353018271821111,
  0.8520948791455568,
  0.6810052618392547,
  0.8310977007985618,
  0.8368723441872643,
  0.879611698483279,
  0.6498209729255722,
  0.8237877350964501,
  0.8223165189327468,
  0.8443457787086077,
  0.8060903461491682,
  0.6536444465049152,
  0.7923096626037802,
  0.862456416497381,
  0.8220919587347134,
  0.85135

In [489]:
km = kmedoids.KMedoids(5, method='fasterpam')
c = km.fit(jaro_matrix_cleaned)
labels_jaro_cleaned_pam = c.labels_

In [490]:
km = KMeans(n_clusters=5)
km.fit(jaro_matrix)
labels_jaro_km = km.labels_

In [491]:
km = KMeans(n_clusters=5)
km.fit(jaro_matrix_cleaned)
labels_jaro_cleaned_km = km.labels_

## Rand scores

In [492]:
labels_df = pd.DataFrame(
    {'jac_pam': labels_jaccard_pam,
     'jac_pam_clean': labels_jaccard_cleared_pam,
     'jac_km': labels_jaccard_km,
     'jac_km_clean': labels_jaccard_cleared_km,
     'jaro_pam': labels_jaro_pam,
     'jaro_pam_clean': labels_jaro_cleaned_pam,
     'jaro_km': labels_jaro_km,
     'jaro_km_clean': labels_jaro_cleaned_pam,
     'real': [i for i in range(5) for j in range(50)]
     })

In [493]:
rand_scores = [rand_score(labels_true=labels_df['real'], labels_pred=labels_df[key]) for key in labels_df.keys()]
rand_scores

[0.610570281124498,
 0.610570281124498,
 0.6027951807228916,
 0.6380722891566265,
 0.39784738955823296,
 0.39784738955823296,
 0.6558393574297189,
 0.39784738955823296,
 1.0]

In [494]:
adjusted_rand_scores = [adjusted_rand_score(labels_true=labels_df['real'], labels_pred=labels_df[key]) for key in
                        labels_df.keys()]
adjusted_rand_scores

[0.018780224942658966,
 0.018780224942658966,
 0.03096737150363436,
 0.08681995802279349,
 0.0007592653497502518,
 0.0007592653497502518,
 0.000622716798741078,
 0.0007592653497502518,
 1.0]

# Conclusion

In [495]:
scores_df = pd.DataFrame(zip(rand_scores, adjusted_rand_scores), columns=['rand_score', 'adjusted_rand_score'],
                         index=labels_df.keys()).drop(index='real')
scores_df

Unnamed: 0,rand_score,adjusted_rand_score
jac_pam,0.61057,0.01878
jac_pam_clean,0.61057,0.01878
jac_km,0.602795,0.030967
jac_km_clean,0.638072,0.08682
jaro_pam,0.397847,0.000759
jaro_pam_clean,0.397847,0.000759
jaro_km,0.655839,0.000623
jaro_km_clean,0.397847,0.000759


In [496]:
adjusted_rand_scores = [[adjusted_rand_score(labels_true=labels_df[k], labels_pred=labels_df[key]) for key in
                        labels_df.keys()] for k in labels_df.keys()]
adjusted_rand_scores

[[1.0,
  1.0,
  0.1879797299457312,
  0.11801719887049014,
  0.04428683263025555,
  0.050740849799107604,
  0.13070950182278257,
  0.050740849799107604,
  0.018780224942658966],
 [1.0,
  1.0,
  0.1879797299457312,
  0.11801719887049014,
  0.04428683263025555,
  0.050740849799107604,
  0.13070950182278257,
  0.050740849799107604,
  0.018780224942658966],
 [0.1879797299457312,
  0.1879797299457312,
  1.0,
  0.4790250421218196,
  0.22107609504164427,
  0.22107609504164427,
  0.21426717011798294,
  0.22107609504164427,
  0.03096737150363436],
 [0.11801719887049014,
  0.11801719887049014,
  0.4790250421218196,
  1.0,
  0.1754309159670197,
  0.17312716774034378,
  0.14002292132966246,
  0.17312716774034378,
  0.08681995802279349],
 [0.04428683263025555,
  0.04428683263025555,
  0.22107609504164427,
  0.1754309159670197,
  1.0,
  0.9426716421351305,
  0.2499540249718939,
  0.9426716421351305,
  0.0007592653497502518],
 [0.050740849799107604,
  0.050740849799107604,
  0.22107609504164427,
  0.

In [498]:
pd.DataFrame(adjusted_rand_scores, columns=labels_df.keys(), index=labels_df.keys())

Unnamed: 0,jac_pam,jac_pam_clean,jac_km,jac_km_clean,jaro_pam,jaro_pam_clean,jaro_km,jaro_km_clean,real
jac_pam,1.0,1.0,0.18798,0.118017,0.044287,0.050741,0.13071,0.050741,0.01878
jac_pam_clean,1.0,1.0,0.18798,0.118017,0.044287,0.050741,0.13071,0.050741,0.01878
jac_km,0.18798,0.18798,1.0,0.479025,0.221076,0.221076,0.214267,0.221076,0.030967
jac_km_clean,0.118017,0.118017,0.479025,1.0,0.175431,0.173127,0.140023,0.173127,0.08682
jaro_pam,0.044287,0.044287,0.221076,0.175431,1.0,0.942672,0.249954,0.942672,0.000759
jaro_pam_clean,0.050741,0.050741,0.221076,0.173127,0.942672,1.0,0.246669,1.0,0.000759
jaro_km,0.13071,0.13071,0.214267,0.140023,0.249954,0.246669,1.0,0.246669,0.000623
jaro_km_clean,0.050741,0.050741,0.221076,0.173127,0.942672,1.0,0.246669,1.0,0.000759
real,0.01878,0.01878,0.030967,0.08682,0.000759,0.000759,0.000623,0.000759,1.0
