# Imports

In [1]:
import pandas as pd
# Show up to 100 rows when a DataFrame is displayed (None would mean no limit).
pd.options.display.max_rows = 100

# Import Topics DF
## (Results from BERTopic)

In [2]:
# Load the per-topic table exported from BERTopic.
topics_df = pd.read_csv('BERTopic_results.csv')

# Remove cluster -1 (BERTopic's outlier topic) by value rather than by row
# label: `drop(0)` only removes the right row when the -1 row happens to be
# the first row of the CSV.
topics_df = topics_df[topics_df['Topic'] != -1].copy()


# Add Clusters & Content Similarities

In [12]:
# Load the clustering results and the similarity-scored paragraphs, each with
# a fresh 0..n-1 index.
cluster_df = pd.read_csv('clustering_results.csv').reset_index(drop=True)
categorized_df = pd.read_csv('../04_categorized_data/similarity_scored_paragraphs.csv').reset_index(drop=True)

# Inner-join the two tables on their index, i.e. row i of one file is paired
# with row i of the other. Column names present in both frames receive the
# default `_x` / `_y` suffixes.
# NOTE(review): this presumes both files list the same rows in the same
# order -- confirm against the steps that produced them.
merged_df = cluster_df.merge(categorized_df, left_index=True, right_index=True)

In [17]:
# Mean similarity per cluster: first the overall score, then one frame for
# each category-specific score column (similarity_score_1 .. similarity_score_7).
scores_by_cluster = merged_df.groupby('cluster_x')

average_similarity = scores_by_cluster['similarity_score'].mean().reset_index()

average_cat_sim_list = [
    scores_by_cluster[f'similarity_score_{category}'].mean().reset_index()
    for category in range(1, 8)
]


In [19]:
def _attach_cluster_mean(frame, cluster_means, score_col, target_col):
    """Inner-join `cluster_means` onto `frame` and store its score as `target_col`.

    Rows are matched on frame['Topic'] == cluster_means['cluster_x']. The
    joined `score_col` is copied to `target_col`, then `score_col` and the
    join key 'cluster_x' are dropped from the returned frame.
    """
    joined = frame.merge(cluster_means, left_on='Topic', right_on='cluster_x', how='inner')
    joined[target_col] = joined[score_col]
    return joined.drop(columns=[score_col, 'cluster_x'])


# Overall average similarity per topic
topics_df = _attach_cluster_mean(
    topics_df, average_similarity, 'similarity_score', 'avg_content_similarity'
)

# Category-specific average similarities (categories 1..7)
for cat_no in range(1, 8):
    topics_df = _attach_cluster_mean(
        topics_df,
        average_cat_sim_list[cat_no - 1],
        f'similarity_score_{cat_no}',
        f'avg_cat{cat_no}_similarity',
    )

# Add Centroids

In [20]:
# Load the centroid analysis results and split them by ranking type into two
# independent copies.
centroids_df = pd.read_csv('centroid_analysis_results.csv')

ranking_type = centroids_df['ranking_type']
euclidean_df = centroids_df.loc[ranking_type == 'euclidean'].copy()
cosine_df = centroids_df.loc[ranking_type == 'cosine'].copy()

In [21]:
# Euclidean distance: per cluster, the three rows with the smallest
# 'distance_to_centroid'.

# Order rows by cluster, then by increasing distance to the centroid.
euclidean_df_sorted = euclidean_df.sort_values(
    ['cluster', 'distance_to_centroid'], ascending=True
)

# After sorting, the first three rows of each cluster are its smallest distances.
top_3_lowest_distance_per_cluster = (
    euclidean_df_sorted
    .groupby('cluster')
    .head(3)
    .reset_index(drop=True)
)

In [22]:
# Cosine similarity: per cluster, the three rows with the largest
# 'cosine_similarity_to_centroid'.

# Order rows by cluster (ascending), then by cosine similarity (descending).
cosine_sort_keys = ['cluster', 'cosine_similarity_to_centroid']
cosine_df_sorted = cosine_df.sort_values(by=cosine_sort_keys, ascending=[True, False])

# After sorting, the first three rows of each cluster are its highest similarities.
top_3_highest_cosim_per_cluster = (
    cosine_df_sorted
    .groupby('cluster')
    .head(3)
    .reset_index(drop=True)
)

# Prepare final DataFrame

In [23]:
def _fill_top_ranked(topics, ranked_rows, prefix, value_col, value_suffix, n_top=3):
    """Copy each cluster's top-ranked texts and scores into `topics`, in place.

    For every rank r in 1..n_top two columns are (re)created on `topics`:
    `{prefix}_{r}` with the text and `{prefix}_{r}_{value_suffix}` with the
    matching value taken from `value_col`.

    Args:
        topics: DataFrame with a 'Topic' column; modified in place.
        ranked_rows: DataFrame with 'cluster', 'text' and `value_col` columns,
            already ordered best-first within each cluster.
        prefix: column-name prefix, e.g. 'euclidean' or 'cosine'.
        value_col: column of `ranked_rows` holding the score to copy.
        value_suffix: suffix of the score columns, e.g. 'distance'.
        n_top: number of ranks to fill per cluster.
    """
    for rank in range(1, n_top + 1):
        # Text columns default to an empty string.
        topics[f'{prefix}_{rank}'] = ''
        # Score columns default to NaN rather than '' so they keep a numeric
        # dtype even when a cluster has fewer than `n_top` rows.
        topics[f'{prefix}_{rank}_{value_suffix}'] = float('nan')

    for cluster, group in ranked_rows.groupby('cluster'):
        # Rows of `topics` belonging to this cluster; computed once per cluster.
        topic_rows = topics['Topic'] == cluster
        best = group.head(n_top)
        texts = best['text'].tolist()
        values = best[value_col].tolist()

        # Clusters with fewer than `n_top` rows leave the remaining ranks at
        # their defaults.
        for rank, (text, value) in enumerate(zip(texts, values), start=1):
            topics.loc[topic_rows, f'{prefix}_{rank}'] = text
            topics.loc[topic_rows, f'{prefix}_{rank}_{value_suffix}'] = value


# Three rows with the lowest Euclidean distance to the centroid, per topic
_fill_top_ranked(
    topics_df,
    top_3_lowest_distance_per_cluster,
    prefix='euclidean',
    value_col='distance_to_centroid',
    value_suffix='distance',
)

# Three rows with the highest cosine similarity to the centroid, per topic
_fill_top_ranked(
    topics_df,
    top_3_highest_cosim_per_cluster,
    prefix='cosine',
    value_col='cosine_similarity_to_centroid',
    value_suffix='similarity',
)


In [24]:
# Rank topics from highest to lowest average content similarity.
sorted_df = topics_df.sort_values('avg_content_similarity', ascending=False)

# Keep the topic name, its representative documents and every similarity average.
report_columns = ['Name', 'Representative_Docs', 'avg_content_similarity']
report_columns += [f'avg_cat{cat_no}_similarity' for cat_no in range(1, 8)]

selected_df = sorted_df[report_columns]

## Analyze Results

Here we try to find topics with a high similarity score in one of the categories and low similarity scores in all others.

In [25]:
similarity_cols = [f"avg_cat{cat_no}_similarity" for cat_no in range(1, 8)]

threshold_high = 0.9
threshold_low = 0.85

category_scores = topics_df[similarity_cols]

# Per topic: how many category scores lie strictly above the high threshold
# (despite the name, this is a count per row, not a boolean mask).
high_mask = category_scores.gt(threshold_high).sum(axis=1)

# Per topic: how many category scores lie strictly below the low threshold.
low_mask = category_scores.lt(threshold_low).sum(axis=1)

# Keep topics with exactly one score above threshold_high and every other
# category score below threshold_low.
condition = high_mask.eq(1) & low_mask.eq(len(similarity_cols) - 1)

filtered_df = topics_df.loc[condition]
filtered_df.head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,euclidean_1,euclidean_1_distance,euclidean_2,euclidean_2_distance,euclidean_3,...,cosine_3,cosine_3_similarity,avg_content_similarity,avg_cat1_similarity,avg_cat2_similarity,avg_cat3_similarity,avg_cat4_similarity,avg_cat5_similarity,avg_cat6_similarity,avg_cat7_similarity
440,440,146,440_gesundheitskosten_salaire_gesundheitskommi...,"['gesundheitskosten', 'salaire', 'gesundheitsk...",['Angesichts der explodierenden Gesundheitskos...,"Es ist zum Sinnbild geworden, für alles, das s...",0.749098,Die Pandemie zeige die Schwachstellen unseres ...,0.983157,Der Fax verkam während der Pandemie zum Sinnbi...,...,10:07 Pandemie zeigt Lücken bei der Digitalisi...,0.485742,0.806149,0.844976,0.818339,0.829608,0.828727,0.839351,0.901993,0.809923
497,497,134,497_patient_patienten_akut_75,"['patient', 'patienten', 'akut', '75', '87000'...",['Die Auslastung der Intensivstationen in den ...,"4 In Gebieten, in die noch nicht so viel Geld ...",0.770335,Davon würden laut Buholzer alle profitieren: D...,0.776786,Das muss sich jetzt ändern. Das jedenfalls for...,...,"Es spricht vieles dafür, dass der Wert der med...",0.677046,0.805398,0.838464,0.815531,0.829176,0.84311,0.83366,0.900238,0.808341


Here we list the 20 topics with the largest spread (highest minus lowest value) among their seven category similarity scores.

In [26]:
similarity_cols = [f"avg_cat{cat_no}_similarity" for cat_no in range(1, 8)]

# Spread per topic: highest minus lowest category similarity in the row.
category_scores = topics_df[similarity_cols]
topics_df["max_diff"] = category_scores.max(axis=1) - category_scores.min(axis=1)

# Put the topics with the largest spread first.
topics_df = topics_df.sort_values("max_diff", ascending=False)

# Show the 20 topics with the largest spread.
topics_df.head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,euclidean_1,euclidean_1_distance,euclidean_2,euclidean_2_distance,euclidean_3,...,cosine_3_similarity,avg_content_similarity,avg_cat1_similarity,avg_cat2_similarity,avg_cat3_similarity,avg_cat4_similarity,avg_cat5_similarity,avg_cat6_similarity,avg_cat7_similarity,max_diff
303,303,199,303_infizierten_italien_virus_gezählt,"['infizierten', 'italien', 'virus', 'gezählt',...","[""21:57 Italien: 7-Tage-Inzidenz ist gesunken ...",>> Lire:L'Hôpital de La Chaux-de-Fonds aura so...,1.270584,Mit seiner Forschung schuf Richard Ernst auch ...,1.296072,"Depuis le début de l'année, des interventions ...",...,0.70003,0.388634,0.440701,0.38251,0.413754,0.44744,0.417043,0.559691,0.374814,0.184877
233,233,228,233_gestorben_décédée_italiens_brasilianische,"['gestorben', 'décédée', 'italiens', 'brasilia...",['Italiens Fussball trauert um einen seiner eh...,Seit Anfang Jahr findet einmal im Monat in Flü...,1.1099,Brustkrebs Das Kantonsspital Baden und das Spi...,1.129966,THUN Die rosa Schleife ist das internationale ...,...,0.418718,0.276179,0.326903,0.280459,0.296919,0.335842,0.310601,0.448001,0.266953,0.181048
346,346,176,346_traurig_schrecklich_frère_sterben,"['traurig', 'schrecklich', 'frère', 'sterben',...",['L’infirmier en soins intensifs qui conduit l...,Thun Die Spital STS AG lädt zum nächsten Vortr...,1.094006,Herisau Im Rahmen eines öffentlichen Vortrags ...,1.121901,Burgdorf Der nächste Publikumsvortrag im Spita...,...,0.66641,0.022363,0.046084,-0.037347,0.048786,0.073912,0.011334,0.142534,-0.025641,0.179881
410,410,155,410_erschossen_geschossen_angeschossen_angesch...,"['erschossen', 'geschossen', 'angeschossen', '...",['Winterthur Ein 70-jähriger Mann hat sich am ...,Die St.Galler Spitäler kommen nicht zur Ruhe. ...,0.948056,Doch gerade bei der Herzchirurgie ziehen das d...,1.126188,"Es ist aussergewöhnlich, dass die beiden Verbä...",...,0.675895,0.194357,0.209326,0.121521,0.19512,0.261731,0.177715,0.300134,0.139808,0.178614
186,186,274,186_zivilen_schutz_sicherheit_uniformierten,"['zivilen', 'schutz', 'sicherheit', 'uniformie...",['Dass sich Spezialkräfte als medizinisches Pe...,"Mit Entsetzen lese ich in der bz, dassTamedia ...",0.949019,Eine Klinik im US-Bundesstaat Kalifornien muss...,0.963453,Ab Mitte Mai soll Da Vinci in der Viszeralchir...,...,0.464433,0.374627,0.428201,0.376501,0.394929,0.427229,0.404751,0.543572,0.365507,0.178065
222,222,236,222_entlassen_minister_general_präsident,"['entlassen', 'minister', 'general', 'präsiden...",['Zu Jahresbeginn war der Minister in die Krit...,Angehörige und Patienten hätten erstaunlich vi...,1.079002,"beantwortet. Wie mag es jemandem gehen, der na...",1.275854,Die Akutbehandlung im Spital von Patienten in ...,...,0.590145,0.432292,0.48708,0.448848,0.498458,0.485253,0.471853,0.603999,0.429867,0.174132
431,431,150,431_öffentlich_unumgänglich_zugänglichen_sport...,"['öffentlich', 'unumgänglich', 'zugänglichen',...","['Vom 6. August an werden viele soziale, kultu...",Der Infektiologe Fabian Tschumi ist Leitender ...,0.962318,Infektiologe Fabian Tschumi ist Leitender Arzt...,1.111013,"Andrée Friedli, Infektiologin am Kantonsspital...",...,0.654728,-0.307012,-0.282337,-0.378308,-0.299699,-0.28934,-0.316142,-0.206069,-0.362545,0.172238
347,347,176,347_marktführendes_marktbreite_konjunktursensi...,"['marktführendes', 'marktbreite', 'konjunkturs...","['Die Lage am US-Arbeitsmarkt verbessert sich,...",Das Kantonsspital Aarau (KSA) hat der Hirsland...,0.798135,Laut Kennern des Gesundheitswesens wäre es nic...,1.13368,"Seit einer Woche ist bekannt, dass das Kantons...",...,0.907997,0.324993,0.336395,0.250746,0.327829,0.387918,0.303507,0.418682,0.270202,0.167936
370,370,168,370_übergewichtigen_übergewichtige_übergewicht...,"['übergewichtigen', 'übergewichtige', 'übergew...",['Aktuell sind in der Schweiz mehr als 17 Proz...,Ces revêtements autodésinfectants utilisés pou...,1.005808,Die Tagesklinik des Spitals Muri hat den tradi...,1.096899,Auch die anderen Spitäler setzen auf zusätzlic...,...,0.44013,0.077153,0.153066,0.129148,0.147239,0.054801,0.159408,0.22168,0.103512,0.166879
312,312,192,312_getesteten_tests_testen_getestet,"['getesteten', 'tests', 'testen', 'getestet', ...",['Bis anhin hat der Bund Tests von Menschen oh...,"Und dann fing er an, die künstliche Bandscheib...",1.281987,"Und in der Schweiz? Recherchen zeigen, dass im...",1.327687,Die Staatsanwaltschaft des Kantons Bern ermitt...,...,0.547828,-0.169485,-0.136308,-0.189338,-0.171111,-0.096824,-0.151822,-0.025187,-0.191256,0.166069
