In [9]:
import re
import pandas as pd
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
def extract_relationships(text):
    """
    Extracts relationships of the type 'X amava Y' ("X loved Y") from the text,
    considering only proper names.

    The text is processed one line at a time: every run of three consecutive
    whitespace-separated words whose middle word is 'amava' (case-insensitive)
    yields a pair, provided both outer words look like proper names. A pair is
    never formed across a line break.

    Args:
        text: Input text; may span several lines and contain irregular spacing.

    Returns:
        A list of (name1, name2) tuples in order of appearance. Duplicates are
        kept. Empty when nothing matches.
    """
    # Capitalised words that may sit next to 'amava' without being a name.
    non_names = {'Que', 'A', 'O', 'Da', 'Do', 'E', 'Tanto', 'Toda'}

    def is_proper_name(word):
        # Starts with an uppercase letter, is purely alphabetic (so a token
        # carrying punctuation is rejected) and is not a known stop word.
        return word[0].isupper() and word.isalpha() and word not in non_names

    relationships = []

    # Split into lines BEFORE normalising whitespace. Collapsing every
    # whitespace run first would also swallow the newlines, merging all lines
    # into one and letting a pair straddle two lines.
    for line in text.splitlines():
        # str.split() with no argument collapses repeated spaces/tabs and
        # never produces empty tokens, so word[0] above is always safe.
        words = line.split()

        # Slide a three-word window over the line: (left, verb, right).
        for left, verb, right in zip(words, words[1:], words[2:]):
            if verb.lower() == 'amava' and is_proper_name(left) and is_proper_name(right):
                relationships.append((left, right))

    return relationships

def create_source_target_table(relationships):
    """
    Builds the edge list as a two-column DataFrame.

    Each (source, target) pair in `relationships` becomes one row, with the
    source under 'From' and the target under 'To'.
    """
    edge_columns = ['From', 'To']
    edge_table = pd.DataFrame(data=relationships, columns=edge_columns)
    return edge_table

def process_quadrilha_lyrics(text=None):
    """
    Processes the lyrics and generates a relationship table between characters.

    Every non-empty line is read as a chain of the form
    "A amava B que amava C que amava ...". The line is split on "que amava"
    (case-insensitive). The first piece must start with "<Name> amava <Name>";
    if it does not, the whole line is skipped. Each following piece contributes
    the first capitalised word that is not a stop word as the next person in
    the chain. A piece without such a word (e.g. "tanto" or
    "toda a quadrilha") adds no edge and leaves the current person unchanged.

    A name here is one uppercase letter followed by one or more lowercase
    letters (Portuguese accented letters included).

    Args:
        text: Lyrics to parse, one chain per line. When None (the default),
            the lyrics embedded in this function are used.

    Returns:
        A tuple (df, relationships): `relationships` is the list of
        (source, target) tuples in order of appearance, duplicates included;
        `df` is the table built from it by create_source_target_table.
    """
    if text is None:
        text = """
        Carlos amava Dora que amava Lia que amava Léa que amava Paulo que amava Juca que amava Dora
        Carlos amava Dora que amava Rita que amava Dito que amava Rita que amava Dito que amava Rita
        Carlos amava Dora que amava Pedro que amava tanto que amava a Filha que amava Carlos que amava Dora que amava toda a quadrilha
        """

    # Capitalised words that must not be taken for a character's name.
    non_names = {'Que', 'A', 'O', 'Da', 'Do', 'E', 'Tanto', 'Toda'}

    # One shared definition of "name" so the three patterns cannot drift apart.
    name_pattern = r'[A-ZÁÉÍÓÚÂÊÔÃÕÇ][a-záéíóúâêôãõç]+'
    chain_separator = re.compile(r'\s+que\s+amava\s+', flags=re.IGNORECASE)
    leading_pair = re.compile(rf'({name_pattern})\s+amava\s+({name_pattern})')
    any_name = re.compile(rf'\b{name_pattern}\b')

    relationships = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Split chain by "que amava": first piece is "X amava Y", the rest
        # are the successive targets.
        head, *tail = chain_separator.split(line)

        # Handle the first part: "X loved Y"
        match = leading_pair.match(head)
        if not match:
            continue

        source, current_person = match.groups()
        relationships.append((source, current_person))

        for part in tail:
            # Only the first valid name of each segment is considered.
            next_person = next(
                (name for name in any_name.findall(part) if name not in non_names),
                None,
            )
            if next_person is not None:
                relationships.append((current_person, next_person))
                current_person = next_person

    df = create_source_target_table(relationships)
    return df, relationships


In [11]:
# Parse the lyrics into an edge table, then keep a single row per (From, To)
# pair and renumber the index from 0.
df, relationships = process_quadrilha_lyrics()
df = df.drop_duplicates().reset_index(drop=True)

In [12]:
# Get all unique names (the whole group)
all_people = set(df['From']).union(set(df['To']))

# Create new rows: Dora -> everyone, except herself
# (the lyrics end with "Dora que amava toda a quadrilha").
# Iterate the names in sorted order: a set of strings has no stable iteration
# order across kernel sessions (string hashes are randomised per process), so
# iterating it directly would shuffle these rows, and every row index derived
# from them, between runs.
new_rows = pd.DataFrame({
    'From': 'Dora',
    'To': [person for person in sorted(all_people) if person != 'Dora']
})

# Add these rows to the original table
df_expanded = pd.concat([df, new_rows], ignore_index=True)

# Remove duplicates (Dora's existing edges are already in df)
df_expanded = df_expanded.drop_duplicates(ignore_index=True)


In [13]:
# Directed graph with one edge per (From, To) row
G = nx.DiGraph()
G.add_edges_from(zip(df_expanded['From'], df_expanded['To']))

# Kamada-Kawai layout gives the starting coordinates for every node
raw_pos = nx.kamada_kawai_layout(G)

# Pull x and y out as separate arrays, in raw_pos iteration order
x_vals = np.array([xy[0] for xy in raw_pos.values()])
y_vals = np.array([xy[1] for xy in raw_pos.values()])

# Min-max rescale each axis; an axis whose values are all equal is left
# untouched so we never divide by zero
x_lo, x_hi = x_vals.min(), x_vals.max()
if x_hi != x_lo:
    x_norm = (x_vals - x_lo) / (x_hi - x_lo)
else:
    x_norm = x_vals

y_lo, y_hi = y_vals.min(), y_vals.max()
if y_hi != y_lo:
    y_norm = (y_vals - y_lo) / (y_hi - y_lo)
else:
    y_norm = y_vals

# node -> (x, y) using the rescaled coordinates
pos = dict(zip(raw_pos.keys(), zip(x_norm, y_norm)))

# Two records per edge: one for the source endpoint (odd ID) and one for the
# target endpoint (even ID), both tagged with the same relationship label
edge_data = []
for edge_idx, src, dst in zip(df_expanded.index, df_expanded['From'], df_expanded['To']):
    label = f"{src} --> {dst}"
    for offset, node in enumerate((src, dst), start=1):
        node_x, node_y = pos[node]
        edge_data.append({
            "ID": edge_idx * 2 + offset,
            "Node": node,
            "Relationship": label,
            "LocationX": node_x,
            "LocationY": node_y,
            "CircleY": node_y
        })

# One row per edge endpoint
final_df = pd.DataFrame(edge_data)


In [14]:
# Show the endpoint table: one row per edge endpoint, carrying the node name,
# the relationship label and the node's normalized layout coordinates.
final_df

Unnamed: 0,ID,Node,Relationship,LocationX,LocationY,CircleY
0,1,Carlos,Carlos --> Dora,0.861149,0.523456,0.523456
1,2,Dora,Carlos --> Dora,0.464729,0.489675,0.489675
2,3,Dora,Dora --> Lia,0.464729,0.489675,0.489675
3,4,Lia,Dora --> Lia,0.754303,1.0,1.0
4,5,Lia,Lia --> Léa,0.754303,1.0,1.0
5,6,Léa,Lia --> Léa,0.3455,0.999496,0.999496
6,7,Léa,Léa --> Paulo,0.3455,0.999496,0.999496
7,8,Paulo,Léa --> Paulo,0.010943,0.805329,0.805329
8,9,Paulo,Paulo --> Juca,0.010943,0.805329,0.805329
9,10,Juca,Paulo --> Juca,0.067029,0.506005,0.506005
