In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from top2vec import Top2Vec
from pprint import pprint
import random
## seed every global random number generator used in this notebook
# random.seed() only affects the stdlib generator; NumPy keeps a separate
# global generator, so it is seeded with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
## parameters for the dimensionality reduction (UMAP) and clustering (HDBSCAN) steps

# NOTE for replication: consider n_neighbors=15 (check the UMAP website),
# because outliers are currently not removed.
umap_args = dict(
    n_neighbors=30,
    n_components=5,
    metric='cosine',
    random_state=42,
)

hdbscan_args = dict(
    min_cluster_size=300,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
)

In [2]:
## make the project root the working directory (all later paths are relative to it)
import os

# The root can be overridden with the FRAMECOMPETITION_ROOT environment variable so
# the notebook also runs on other machines; without it the original location is used.
PROJECT_ROOT = os.environ.get(
    "FRAMECOMPETITION_ROOT",
    "C://Users/nicol/Dropbox/PhD/Papers/FrameCompetition",
)
os.chdir(PROJECT_ROOT)

In [3]:
## read the corpus: the "text" column of the CSV becomes a plain Python list
corpus_path = "data/raw/media/bert_crime_clean.csv"
documents = pd.read_csv(corpus_path, encoding="utf-8").loc[:, "text"].tolist()

In [6]:
## fit the Top2Vec model on the corpus
# Disabled options, kept for reference (not passed to Top2Vec):
#   speed='deep-learn'
#   embedding_model='distiluse-base-multilingual-cased'
top2vec_options = {
    'documents': documents,
    'workers': 7,
    'min_count': 100,
    'umap_args': umap_args,
    'hdbscan_args': hdbscan_args,
}
model = Top2Vec(**top2vec_options)


NameError: name 'documents' is not defined

In [28]:
## assess number of topics
# The count is stored in `topic_nums` (the per-topic inspection loop iterates over
# range(topic_nums)); the bare name on the last line displays it as the cell output.
topic_nums = model.get_num_topics()
topic_nums

62

In [None]:
## store the fitted model on disk
model_path = "models/t2v/migration_mindocs300"
model.save(model_path)

In [27]:
## restore the stored model; this rebinds `model` to the version read from disk
saved_model_path = "models/t2v/migration_mindocs300"
model = Top2Vec.load(saved_model_path)

In [9]:
## inspect every topic: its first ten topic words and the five documents returned for it
for topic in range(topic_nums):
    # The hits get their own names: binding them to `documents` would overwrite the
    # corpus list used to fit the model, so re-running the training cell afterwards
    # would fit on these five documents instead of the full corpus.
    top_docs, top_scores, top_doc_ids = model.search_documents_by_topic(topic_num=topic, num_docs=5)
    print(f"Topic: {topic}")
    print(model.topic_words[topic][:10])
    for doc, score, doc_id in zip(top_docs, top_scores, top_doc_ids):
        print(f"Document: {doc_id}, Score: {score}")
        print("-----------")
        pprint(doc)
        print("-----------")

Topic: 0
['erzahlen' 'filme' 'film' 'erzahlt' 'geschichten' 'roman' 'meine'
 'kindheit' 'mein' 'freunde']
Document: 6745, Score: 0.5611928105354309
-----------
('In unserer taz-Videoserie „Zuflucht Berlin“ erzählt jeden Dienstag ein '
 'Flüchtling von seiner Ankunft, seinen Erfahrungen, Erwartungen und Träumen '
 'in Berlin.Wenn Obaid Alyousouf (Foto) von seiner Musik erzählt, wird sein '
 'Blick weit und das Zimmer im Wedding sehr klein. Bis zu acht Stunden hat der '
 '28-Jährige täglich auf seiner Oud geübt, einer arabischen Kurzhalslaute. Ein '
 'besonderes Instrument, das Obaid in Syrien zurücklassen musste. „Es ist '
 'unmöglich, es auf der Flucht mitzunehmen.“ Statt Musik zu machen, wartet '
 '\xadObaid nun. Darauf, registriert zu werden. Seit drei Wochen ist er in der '
 'Stadt, jeden Tag steht er am Lageso und schaut, ob sein Name auf einem '
 'Zettel steht. Fragt Obaid vor Ort, wie lange er warten muss, zucken die '
 'Verantwortlichen mit den Schultern. Keiner weiß es. Also wa

In [29]:
## labels for the 62 original topics: a dict keyed by topic id (0-61), one label per topic
topic_labels = dict(enumerate([
    "Episodisches Framing",                                # 0
    "Historischer Kontext",                                # 1
    "Unionsstreit 1",                                      # 2
    "Bürgerkrieg in Syrien",                               # 3
    "Arbeitsmarkt",                                        # 4
    "Finanzierung der Migration",                          # 5
    "Flüchtlingsunterbringung in Hamburg",                 # 6
    "Humanitäre Katastrophe im Mittelmeer",                # 7
    "Bundesamt für Migration und Flüchtlinge (BAMF)",      # 8
    "Trump, Mexico, Migration",                            # 9
    "EU Verteilung Flüchtlinge",                           # 10
    "Abschiebung 1",                                       # 11
    "Unionsstreit 2",                                      # 12
    "Aktienkurse - Dow Jones",                             # 13
    "Aktienkurse - DAX",                                   # 14
    "AfD Topic",                                           # 15
    "Lage in Griechenland (Ankunft & Lager)",              # 16
    "EU-Türkei-Deal",                                      # 17
    "Kriminalität: Gerichtsverfahren",                     # 18
    "Flüchtlingsunterbringung in Berlin",                  # 19
    "Brandanschläge auf Flüchtlingsheime",                 # 20
    "Grenzkontrollen durch Polizei",                       # 21
    "Verhandlung einer Abschiebung",                       # 22
    "Einwanderungszahlen",                                 # 23
    "Migrationspolitik in Afrika",                         # 24
    "Koalitionsstreit über Transitzonen",                  # 25
    "Bundesrat Entscheidung zu sicheren Herkunftsstaate",  # 26
    "Einwanderung in Sozialsysteme",                       # 27
    "Familiennachzug",                                     # 28
    "Flüchtlingsprotest am Oranienplatz in Berlin",        # 29
    "Abschiebung nach Afghanistan",                        # 30
    "Seenotrettung 1",                                     # 31
    "Schulen",                                             # 32
    "Kirchen",                                             # 33
    "Balkanroute",                                         # 34
    "Jamaikasondierung",                                   # 35
    "Österreich",                                          # 36
    "Umfragen",                                            # 37
    "Fachkräftemangel",                                    # 38
    "Kriminalität: Straftaten",                            # 39
    "Jugendliche Flüchtlinge",                             # 40
    "Marineeinsatz gegen Schleuser im Mittelmeer",         # 41
    "Europäische Aktienmärkte",                            # 42
    "Demonstrationen",                                     # 43
    "Boris Pistorius",                                     # 44
    "Jesiden",                                             # 45
    "Gesundheitsversorgung von Flüchtlingen",              # 46
    "Ankunft in München",                                  # 47
    "Aktien: DAX",                                         # 48
    "Grenzkontrollen Schengen",                            # 49
    "Brexit",                                              # 50
    "Viktor Orban's Asylpolitik",                          # 51
    "Unterkünfte in Schleswig-Holstein",                   # 52
    "Calais",                                              # 53
    "Doppelte Staatsbürgerschaft",                         # 54
    "Seenotrettung 2",                                     # 55
    "Grenzkontrollen in Skandinavien",                     # 56
    "Börse: Anleihen",                                     # 57
    "UN-Migrationspakt",                                   # 58
    "Lage an den spanischen Enklaven Ceuta und Melilla",   # 59
    "Schweiz",                                             # 60
    "Börse: Osteuropa",                                    # 61
]))


## persist the original topic labels as JSON
# JSON object keys are always strings, so the integer topic ids are written as text.
import json
with open('data/processed/topic_labels.json', 'w') as labels_file:
    labels_file.write(json.dumps(topic_labels))

## topic reduction

## merge topic labels: every reduced topic lists the original topic ids it absorbs
reduced_labels = {"Episodic Framing": [0],
                "Historical Context": [1],
                "Coalition Conflict": [2, 12, 25, 35],
                "Syrian Civil War": [3, 45],
                "Labor Market": [4, 38],
                "Budget for Integration": [5],
                "Housing in Germany": [6, 19, 52],
                "Humanitarian Crisis Mediterranean": [7, 31, 55],
                "BAMF": [8],
                "Trump, Mexico, Migration": [9],
                "EU Distribution of Refugees": [10],
                "Deportation": [11, 22, 30],
                "Stock Market": [13, 14, 42, 48, 57, 61],
                "AfD Topic": [15],
                "Refugee Arrival": [16],
                "EU-Turkey Deal": [17],
                "Refugee Crime": [18, 39],
                "Attacks on Refugee Homes": [20],
                "Police Action Against Human Trafficking": [21],
                "Immigration Statistics": [23],
                "Migration Policy in Africa": [24],
                "Decision on Safe Countries of Origin": [26],
                "Welfare Migration": [27],
                "Family Reunification": [28],
                "Refugee Protest Berlin": [29],
                "Refugees in Schools": [32],
                "Church Debate on Immigration": [33],
                "Balkan Route": [34],
                "Austrian Immigration Debate": [36],
                "Polls": [37],
                "Young Refugees": [40],
                "Marine Operation in Mediterranean": [41],
                "Demonstrations": [43],
                "Boris Pistorius": [44],
                "Healthcare for Refugees": [46],
                "Arrival in Munich": [47],
                "Schengen Border Control": [49, 56],
                "Brexit": [50],
                "Viktor Orban's Asylum Policy": [51],
                "Humanitarian Crisis EU Borders": [53, 59],
                "Double Citizenship": [54],
                "UN Migration Pact": [58],
                "Swiss Immigration Debate": [60]}

## sanity check: the groups must cover the original topic ids 0..N-1 exactly once
_grouped_ids = sorted(ot for ots in reduced_labels.values() for ot in ots)
assert _grouped_ids == list(range(len(_grouped_ids))), \
    "every original topic id must appear in exactly one reduced topic"

## reduced topic ids: position of each reduced topic in `reduced_labels` -> original topic ids
# Derived from `reduced_labels` rather than typed out a second time, so the id mapping
# and the label mapping always have the same groups in the same order. The lists are
# copied so that changing one mapping does not silently change the other.
reduced_ids = {rt: list(ots) for rt, ots in enumerate(reduced_labels.values())}

## write reduced labels to json
# `json` is already imported earlier in this cell, so it is not imported again here.
with open('data/processed/reduced_topic_labels.json', 'w') as fp:
    json.dump(reduced_labels, fp)

In [30]:
## document metadata: every column of the corpus file except the raw text
meta_path = "data/raw/media/bert_crime_clean.csv"
meta = pd.read_csv(meta_path, encoding="utf-8").drop(columns='text')

In [31]:
## attach the topic id of every document (first element of the tuple Top2Vec returns)
doc_topic_ids = model.get_documents_topics(model.document_ids)[0]
meta['topic_id'] = doc_topic_ids

In [32]:
## reduced topic id of every document
# One lookup table (original topic id -> reduced topic id) is built and applied in a
# single step. The column is always recomputed from `topic_id`, so this cell can be
# re-run safely.
reduced_id_by_topic = {}
for rt, ots in reduced_ids.items():
    for ot in ots:
        reduced_id_by_topic.setdefault(ot, rt)  # if an id were listed twice, the first group wins

# fail with a readable message if a topic id belongs to no reduced topic
unmapped_ids = sorted(int(t) for t in set(meta['topic_id']) - set(reduced_id_by_topic))
assert not unmapped_ids, f"topic ids without a reduced topic: {unmapped_ids}"

meta['reduced_topic_id'] = meta['topic_id'].map(reduced_id_by_topic)

In [34]:
## attach the label of each document's original topic
meta['topic_label'] = meta['topic_id'].apply(lambda topic_id: topic_labels[topic_id])

In [35]:
## reduced topic label of every document
# A lookup table (original topic id -> reduced label) replaces scanning every group for
# every row and unwrapping a one-element list afterwards.
reduced_label_by_topic = {}
for label, ots in reduced_labels.items():
    for ot in ots:
        reduced_label_by_topic.setdefault(ot, label)  # if an id were listed twice, the first group wins

# fail with a readable message if a topic id belongs to no reduced topic
unlabelled_ids = sorted(int(t) for t in set(meta['topic_id']) - set(reduced_label_by_topic))
assert not unlabelled_ids, f"topic ids without a reduced label: {unlabelled_ids}"

meta['reduced_topic_label'] = meta['topic_id'].map(reduced_label_by_topic)

In [36]:
## share of documents per reduced topic (relative frequencies)
meta['reduced_topic_label'].value_counts(normalize=True)

Coalition Conflict                         0.082692
Stock Market                               0.071349
Episodic Framing                           0.054685
Housing in Germany                         0.054031
Deportation                                0.052818
Humanitarian Crisis Mediterranean          0.048675
Labor Market                               0.041661
Syrian Civil War                           0.040518
Historical Context                         0.040121
Refugee Crime                              0.030295
Budget for Integration                     0.029058
BAMF                                       0.027132
Trump, Mexico, Migration                   0.026736
EU Distribution of Refugees                0.025043
AfD Topic                                  0.020084
Refugee Arrival                            0.019850
EU-Turkey Deal                             0.019150
Attacks on Refugee Homes                   0.016116
Police Action Against Human Trafficking    0.015999
Immigration 

In [37]:
## add topic similarity to document data (takes a few minutes)
# get_documents_topics returns topic numbers and topic scores as paired arrays:
# ranked_scores[i, j] is the score of document i for topic ranked_topics[i, j].
# NOTE(review): with num_topics > 1 Top2Vec appears to order each row by similarity,
# not by topic id - confirm against the installed version. Scattering the scores by
# the returned topic numbers is correct either way and guarantees that column t of
# doc_top_sim is the similarity to topic t, which the per-topic column labels need.
n_topics = model.get_num_topics()  # ask the model rather than hard-coding the count
ranked_topics, ranked_scores = model.get_documents_topics(model.document_ids, num_topics=n_topics)[:2]
ranked_topics = np.asarray(ranked_topics)
ranked_scores = np.asarray(ranked_scores)

doc_top_sim = np.empty_like(ranked_scores)
np.put_along_axis(doc_top_sim, ranked_topics, ranked_scores, axis=1)

In [None]:
## shape check of the document-topic similarity matrix (recorded output: (85691, 62))
doc_top_sim.shape

(85691, 62)

In [None]:
## wrap the similarity matrix in a DataFrame with one labelled column per original topic
ot_columns = ["Association (ot): {}".format(label) for label in topic_labels.values()]
doc_top_sim = pd.DataFrame(doc_top_sim, columns=ot_columns)

In [None]:
## merge topic similarities
# The "V1" column is removed, together with any similarity columns already present
# from an earlier run of this cell, so re-running neither fails nor duplicates columns.
stale_columns = ["V1", *doc_top_sim.columns]
meta = pd.concat([meta.drop(columns=stale_columns, errors="ignore"), doc_top_sim], axis=1)

In [None]:
## generate reduced topic vectors
# A reduced topic vector is the mean of the vectors of the original topics it merges.
# All rows are computed first and the DataFrame is built once: growing a frame with
# pd.concat inside a loop copies it on every iteration and gives every row index 0.
# Rows are indexed by reduced topic id.
rt_rows = [
    np.mean(model.topic_vectors[ots], axis=0)  # ots: original topic ids of one reduced topic
    for ots in reduced_ids.values()
]
rt_vectors = pd.DataFrame(np.vstack(rt_rows), index=list(reduced_ids))

In [None]:
## store the reduced topic vectors as CSV (the index is not written)
rt_vectors_path = "data/processed/embeddings/reduced_topics.csv"
rt_vectors.to_csv(rt_vectors_path, index=False)

In [None]:
## cosine similarity between every document vector and every reduced topic vector
from scipy.spatial import distance
cosine_dist = distance.cdist(model.document_vectors, rt_vectors, metric='cosine')
cos_sim = 1 - cosine_dist  # similarity = 1 - cosine distance

In [None]:
## shape check of the reduced topic vectors
# NOTE(review): the recorded output shows 42 rows, but the reduced topic mapping in this
# notebook defines 43 groups - the output looks stale; re-run to confirm 43 rows.
rt_vectors.shape

(42, 300)

In [None]:
## wrap the similarities in a DataFrame with one labelled column per reduced topic
rt_columns = ["Association (reduced): {}".format(label) for label in reduced_labels]
cos_sim = pd.DataFrame(cos_sim, columns=rt_columns)

ValueError: Shape of passed values is (85691, 42), indices imply (85691, 43)

In [None]:
## merge
# Similarity columns left in `meta` by an earlier run of this cell are dropped first,
# so re-running the cell does not append duplicate columns.
reduced_sims = pd.DataFrame(cos_sim)
meta = pd.concat([meta.drop(columns=reduced_sims.columns, errors="ignore"), reduced_sims], axis=1)

In [None]:
## drop stock market topic
# meta = meta[meta.reduced_topic_label != "Stock Market"]

In [None]:
## export the document table (metadata, topic assignments, similarities) without the index
docs_out_path = "data/processed/media/docs_topics_sims.csv"
meta.to_csv(docs_out_path, index=False)

In [None]:
# docvecs = pd.DataFrame(model.document_vectors[meta.reduced_topic_label != "Stock Market"])

In [None]:
## export the document embeddings
# `docvecs` has to be built here: its only other definition in this notebook is
# commented out. All documents are exported, matching `meta`, whose stock-market
# filter is likewise disabled.
docvecs = pd.DataFrame(model.document_vectors)
docvecs.to_csv("data/processed/embeddings/documents.csv", index=False)