# Detecting Topic Merges and Splits in Dynamic Political Conversations

Cláudia Oliveira 

Supervisor - Prof. Dr. Álvaro Figueira

Faculty of Science, University of Porto

In [4]:
import spacy
import pandas as pd

# Smoke test of the spaCy English pipeline (tokenizer, tagger, parser, NER)
# on a short sample passage.
nlp = spacy.load("en_core_web_sm")

# Sample passage to run through the pipeline
text = (
    "When Sebastian Thrun started working on self-driving cars at "
    "Google in 2007, few people outside of the company took him "
    "seriously. “I can tell you very senior CEOs of major American "
    "car companies would shake my hand and turn away because I wasn’t "
    "worth talking to,” said Thrun, in an interview with Recode earlier "
    "this week."
)
doc = nlp(text)

# Syntax: gather the noun chunks and the lemmas of all verbs, then report them
noun_phrases = [np_chunk.text for np_chunk in doc.noun_chunks]
verb_lemmas = [tok.lemma_ for tok in doc if tok.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

# Named entities: one "<text> <label>" line per entity found in the passage
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun GPE
Recode ORG
earlier this week DATE


In [11]:
# Load the tweet dataset from the project-relative CSV. low_memory=False asks
# pandas not to parse the file in internal chunks (see the read_csv docs),
# which avoids mixed-type inference within a column.
tweets_path = "datasets/tweets.csv"
tweets = pd.read_csv(tweets_path, encoding="utf-8", low_memory=False)

In [None]:
# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Run every tweet through the pipeline in batches.
# Missing texts are replaced with "" first: astype(str) alone would turn NaN
# into the literal string "nan" and feed that word to the NER model. The
# number and order of rows are unchanged, so the i-th doc still corresponds
# to the i-th row of `tweets`.
tweet_texts = tweets["text"].fillna("").astype(str)
docs = list(nlp.pipe(tweet_texts, batch_size=50))

# Collect the distinct entity surface strings across all tweets.
# A comprehension keeps its loop variables local, so the notebook-level `doc`
# from the earlier demo cell is not overwritten.
entidades_unicas = {ent.text for doc in docs for ent in doc.ents}

# Report the total, then only a small sorted (deterministic) preview:
# printing the entire set floods the notebook output when the corpus yields
# hundreds of thousands of entities.
preview_size = 50
print("Número de entidades únicas:", len(entidades_unicas))
print(sorted(entidades_unicas)[:preview_size])

230 000 entities