In [3]:
import os 
import sys
import pandas as pd

# Input for this notebook is in the output folder of the ytd script in reports
input_folder = os.path.join(os.path.dirname("."), '..', 'output')

data = pd.read_csv(os.path.join(input_folder, 'td_ytd_report_21052025.csv'), delimiter=',')

In [4]:
data.head(5)

Unnamed: 0,title,link,category,pagepath,pubdate,views,active users,views per active user,average engagement time per active user,_yoast_wpseo_focuskw,_yoast_wpseo_metadesc,_yoast_wpseo_linkdex,diff_with_daily_benchmark_views,diff_with_daily_benchmark_active_users,diff_with_daily_benchmark_average_engagement_time_per_active_user,views_bucket,active_users_bucket,average_engagement_time_per_active_user_bucket
0,'Conversazioni con altre donne' di Filippo Conz,https://www.taxidrivers.it/304488/live-streami...,Recensioni,/304488/live-streaming-on-demand/paramount-str...,2025-01-01 06:00:03+00:00,29.0,23.0,1.26087,60.0,Conversazione con altre donne,'Conversazione con altre donne' e l'opera prim...,80.0,-8.0,-6.0,13.833333,Basso,Molto Basso,Alto
1,'Dune - Parte Due' - I nuovi figli del deserto,https://www.taxidrivers.it/342136/review/dune-...,Recensioni,/342136/review/dune-parte-due-i-nuovi-figli-de...,2025-01-01 08:30:14+00:00,,,,,dune parte due,"Anni dopo il primo capitolo, 'Dune - Parte Due...",77.0,,,,,,
2,'Il Signore degli Anelli - La guerra dei Rohir...,https://www.taxidrivers.it/418080/review/in-sa...,Recensioni / In Sala,/418080/review/in-sala/il-signore-degli-anelli...,2025-01-02 05:05:31+00:00,29.0,27.0,1.074074,42.333333,Il Signore degli Anelli - La guerra dei Rohirrim,Il Signore degli Anelli - La guerra dei Rohirr...,80.0,0.0,0.0,0.0,Medio,Medio,Medio
3,15 emozionanti film drammatici da aggiungere a...,https://www.taxidrivers.it/419437/guide-2/15-f...,Altro,/419437/guide-2/15-film-drammatici-da-non-perd...,2025-01-02 05:00:39+00:00,56.0,48.0,1.166667,48.541667,film drammatici,"Se siete amanti dei film drammatici, ecco per ...",71.0,27.0,21.0,6.208333,Molto Alto,Molto Alto,Alto
4,Gennaio 2025 su Prime Video: i migliori titoli...,https://www.taxidrivers.it/419529/live-streami...,Recensioni,/419529/live-streaming-on-demand/gennaio-2025-...,2025-01-01 19:21:48+00:00,411.0,358.0,1.148045,51.594972,gennaio 2025 prime video,I migliori titoli in uscita a gennaio 2025 su ...,76.0,374.0,329.0,5.428305,Molto Alto,Molto Alto,Alto


## An approach to news headlines clustering using K-means

K-Means is a widely used clustering algorithm that partitions data into 
𝑘
k clusters based on feature similarity.

Approach: Transform headlines into numerical vectors using techniques like Term Frequency-Inverse Document Frequency (TF-IDF), often incorporating n-grams (e.g., bigrams, trigrams) to capture structural patterns. Then, apply K-Means clustering to group similar headlines.

Example: A GitHub project demonstrates clustering news headlines using K-Means, where headlines are vectorized based on n-gram features.

Per un guida in italiano: https://www.diariodiunanalista.it/posts/come-calcolare-la-similarita-tra-testi-di-un-sito-web-con-tf-idf-in-python/


For a great explanation on **how to visualize text clusters in low-dimensional space: https://medium.com/@RobuRishabh/clustering-text-data-with-k-means-and-visualizing-with-t-sne-9bc1fe7d8fed

In [11]:
# https://www.diariodiunanalista.it/posts/raggruppamento-testuale-con-tf-idf/

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
import string

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns


In [26]:
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manuel.deluzi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\manuel.deluzi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [None]:
ita_stopwords = stopwords.words('italian') # List of words
ita_stopwords[0:5] # Print the first 5 italian stop words

['ad', 'al', 'allo', 'ai', 'agli']

In [None]:
from sklearn.feature_extraction import TfidfVectorizer


remove_punctuation_map  = dict((ord(char), None) for char in string.punctuation)

def preprocess(text):
    text = text.translate(remove_punctuation_map) #https://www.w3schools.com/python/ref_string_translate.asp
    text = text.lower()
    text = text.strip()
    return nltk.word_tokenize(text)

vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=ita_stopwords)

def compute_similarity(a, b):
  tfidf = vectorizer.fit_transform([a, b])
  return ((tfidf * tfidf.T).toarray())[0,1] # This computes the cosine similarity between the two vectors. Since TF-IDF vectors are normalized by default, the dot product of the two gives the cosine similarity.

titles = data['title'].values

M = np.zeros((titles.shape[0], titles.shape[0]))

for i, row in tqdm(enumerate(titles), total=titles.shape[0], desc='1st level'):
	for j, next_row in enumerate(titles):
		M[i, j] = compute_similarity(row, next_row) 




1st level:   0%|          | 0/2862 [00:00<?, ?it/s]

1st level: 100%|██████████| 2862/2862 [1:50:51<00:00,  2.32s/it]  


In [None]:
similarity_df = pd.DataFrame(M, columns=titles, index=titles)

similarity_df[similarity_df.columns.str.match(".*si farà.*", case=False)]


Unnamed: 0,'Conversazioni con altre donne' di Filippo Conz,'Dune - Parte Due' - I nuovi figli del deserto,'Il Signore degli Anelli - La guerra dei Rohirrim': uno spinoff bipolare,15 emozionanti film drammatici da aggiungere alla tua lista,Gennaio 2025 su Prime Video: i migliori titoli in arrivo!,'Attack on Titan: The Last Attack': la fine di una saga durata 10 anni,I nuovi film al cinema dal 1° gennaio e quelli ancora in sala,'Avicii - I'm Tim': La stella cadente della musica,'The Walking dead: the ones who live' Il ritorno di Rick Grimes,'Blink' - il documentario della National Geographic,...,Carla Simón: la vita che ispira l'arte,Unarchive Found Footage Fest: torna il cinema che brucia,'Il Mio Compleanno' conversazione con Silvia D'Amico,"Cannes 2025, 21 maggio: potere, politica e verita personali rubano la scena",Wes Anderson e Richard Ayoade: un nuovo film dal tono misterioso,Jodie Foster: il femminismo oltre le questioni di genere,Darren Aronofsky e Google DeepMind si tuffano nel cinema basato sull'intelligenza artificiale,"""Zootropolis 2"": ecco finalmente il trailer del sequel Disney. Nuovi personaggi e nuova avventura in arrivo per Judy Hopps e Nick Wilde",Quentin Tarantino diventa scrittore,'Garda Cinema Ff': il cinema che approda sulle rive del Garda


In [64]:
# Find most similar articles

def find_most_similar_articles(title, similarity_df, top_n=5):
    if title not in similarity_df.columns:
        raise ValueError("Title not found in similarity DataFrame.")
    # Get the similarity scores for the given title
    similar_scores = similarity_df[title]
    # Get the top N most similar articles
    most_similar = similar_scores.nlargest(top_n + 1).iloc[1:]
    return most_similar

find_most_similar_articles("'Modern Love Mumbai 2' si fara?", similarity_df)

'Cassandra 2': si fara?                   0.356300
'Storia della mia famiglia 2' si fara?    0.291219
'La terra sull'Abisso 2' si fara?         0.291219
'Ransom Canyon 2' si fara?                0.291219
'Il Giardiniere' stagione 2: si fara?     0.291219
Name: 'Modern Love Mumbai 2' si fara?, dtype: float64

In [65]:
# K-means
from sklearn.cluster import KMeans

# Number of clusters (choose based on your dataset)
num_clusters = 3

# Apply K-Means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(M)

# Output the cluster labels for each document
labels = kmeans.labels_
print("Cluster Labels:", labels)

# Assign cluster labels to the original DataFrame
data['cluster'] = labels

data[data['cluster'] == 2].head(20)

Cluster Labels: [1 1 1 ... 1 1 1]


Unnamed: 0,title,link,category,pagepath,pubdate,views,active users,views per active user,average engagement time per active user,_yoast_wpseo_focuskw,_yoast_wpseo_metadesc,_yoast_wpseo_linkdex,diff_with_daily_benchmark_views,diff_with_daily_benchmark_active_users,diff_with_daily_benchmark_average_engagement_time_per_active_user,views_bucket,active_users_bucket,average_engagement_time_per_active_user_bucket,cluster
25,'Dune 4' si fara? In cantiere anche una serie ...,https://www.taxidrivers.it/419794/latest-news/...,News,/419794/latest-news/dune-4-si-fara-in-cantiere...,2025-01-03 18:41:30+00:00,27.0,15.0,1.8,33.333333,Dune 4,Dune 4 si fara? Ecco quello che sappiamo al mo...,69.0,-12.0,-11.0,1.166667,Molto Basso,Molto Basso,Medio,2
66,"Alec Baldwin e Luisa Rubino, volto iconico del...",https://www.taxidrivers.it/420166/latest-news/...,News,/420166/latest-news/alec-baldwin-e-luisa-rubin...,2025-01-07 09:59:16+00:00,17.0,17.0,1.0,46.882353,Alec Baldwin,A Sorrento riprese per un lungometraggio che v...,63.0,-10.5,-5.5,5.350871,Molto Basso,Basso,Alto,2
100,'Senna': l'audiorecensione della nuova miniser...,https://www.taxidrivers.it/420445/review/in-sa...,Recensioni / In Sala,/420445/review/in-sala/podcast/senna-l-audiore...,2025-01-09 11:06:14+00:00,33.0,17.0,1.941176,13.294118,Senna,"Emanuela Nizzari ci racconta Senna, la miniser...",74.0,-2.0,-10.0,-20.831584,Basso,Molto Basso,Molto Basso,2
103,'ACAB: la serie' dal 15 gennaio su Netflix. Ec...,https://www.taxidrivers.it/420566/latest-news/...,News,/420566/latest-news/acab-la-serie-dal-15-genna...,2025-01-10 14:46:09+00:00,,,,,ACAB: la serie,'ACAB: la serie' e in arrivo il 15 gennaio su ...,82.0,,,,,,,2
112,'M. Il figlio del secolo' l'imperdibile serie ...,https://www.taxidrivers.it/395812/serie-tv/sky...,Serie TV,/395812/serie-tv/sky-serie-tv/m-il-figlio-del-...,2025-01-12 05:30:39+00:00,73.0,61.0,1.196721,78.655738,M. Il figlio del secolo,"A Venezia 2024 arriva M. Il figlio del secolo,...",63.0,35.0,33.5,43.803059,Molto Alto,Molto Alto,Molto Alto,2
114,"'American Primeval', la nuova perla western fi...",https://www.taxidrivers.it/420424/serie-tv/net...,Serie TV,/420424/serie-tv/netflix-serie-tv/american-pri...,2025-01-11 08:53:24+00:00,216.0,162.0,1.333333,50.716049,American Primeval,Su Netflix la serie originale American Primeva...,80.0,174.0,134.0,27.100832,Molto Alto,Molto Alto,Molto Alto,2
141,'Lockerbie' la serie Sky sull'attentato aereo,https://www.taxidrivers.it/420932/latest-news/...,News,/420932/latest-news/lockerbie-la-serie-sky-sul...,2025-01-13 17:37:44+00:00,13.0,12.0,1.083333,25.0,lockerbie,Dal 27 Gennaio 2025 su Sky la nuova serie orig...,79.0,-4.0,-5.0,-0.666667,Basso,Basso,Medio,2
174,'ACAB: la serie'. Gli occhi dietro i caschi,https://www.taxidrivers.it/421078/serie-tv/net...,Serie TV,/421078/serie-tv/netflix-serie-tv/acab-la-seri...,2025-01-16 06:03:49+00:00,94.0,79.0,1.189873,59.303797,ACAB: la serie,La recensione dei primi episodi di 'ACAB: la s...,83.0,61.0,53.5,23.377131,Molto Alto,Molto Alto,Molto Alto,2
180,"'Il conte di Montecristo', la serie: intervist...",https://www.taxidrivers.it/421329/magazine-2/i...,Speciali e Magazine,/421329/magazine-2/il-conte-di-montecristo-la-...,2025-01-16 13:52:11+00:00,,,,,il conte di Montecristo,Il regista della miniserie 'il conte di Montec...,82.0,,,,,,,2
198,'On call' - La nuova serie poliziesca prodott...,https://www.taxidrivers.it/420910/serie-tv/pri...,Serie TV,/420910/serie-tv/prime-video-serietv/on-call-l...,2025-01-17 09:29:56+00:00,73.0,58.0,1.258621,34.827586,On call,"""On call"" - La nuova serie poliziesca disponib...",79.0,37.0,30.0,1.952586,Molto Alto,Molto Alto,Medio,2


In [70]:
# Do the regression of diff_with_daily_benchmark_active_users against cluster label, one-hot-encoded

import statsmodels.api as sm

# One-hot encode the cluster labels
X = pd.get_dummies(data['cluster'], drop_first=True)
y = data['diff_with_daily_benchmark_active_users']

X = sm.add_constant(X)  # Adds a constant term to the predictor

model = sm.OLS(y, X).fit()

# Print the regression results
print(model.summary())


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [67]:
!pip install statsmodels

Collecting statsmodels
  Using cached statsmodels-0.14.4-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Using cached patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Using cached statsmodels-0.14.4-cp312-cp312-win_amd64.whl (9.8 MB)
Using cached patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-1.0.1 statsmodels-0.14.4



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
# Save M matrix for later use
np.save('M_matrix.npy', M)




In [None]:
# Import M_matrix
M = np.load('M_matrix.npy')

(2862, 2862)