In [1]:
import pandas as pd
import random
import networkx as nx

# Load the raw people table and keep only the rows in which every column
# needed for a biography is present; the index is renumbered after the drop.
df = (
    pd.read_csv("/content/drive/MyDrive/Vezilka/people.csv")
    .dropna(
        subset=[
            "personLabel",
            "genderLabel",
            "birthPlaceLabel",
            "occupationLabel",
            "educationLabel",
            "birthDate",
        ]
    )
    .reset_index(drop=True)
)

In [2]:
# Rewrite both date columns of `df` in place as 'YYYY-MM-DD' strings.
# No `errors=` argument is passed for birthDate, so pandas' default error
# handling applies to values it cannot parse.
df['birthDate'] = pd.to_datetime(df['birthDate']).dt.strftime('%Y-%m-%d')
# deathDate is parsed with errors='coerce'; the biography builders below check
# for missing values and the literal 'NaT' before using this column.
df['deathDate'] = pd.to_datetime(df['deathDate'], errors='coerce').dt.strftime('%Y-%m-%d')

In [3]:
# Macedonian month names indexed by calendar month number (1-12).
# Index 0 holds an empty placeholder so months_mk[date.month] needs no offset.
months_mk = [
    "", "јануари", "февруари", "март", "април", "мај", "јуни",
    "јули", "август", "септември", "октомври", "ноември", "декември"
]

In [4]:
def format_date_mk(date_str):
    """Render a date as "<day> <Macedonian month name> <year>".

    Args:
        date_str: A date string, or any other value ``pd.to_datetime`` accepts.

    Returns:
        "—" when the value is missing (NaN/None/NaT, "NaT", "") or parses to
        NaT; the formatted date when parsing succeeds; otherwise the input
        unchanged, so an unparseable value is still visible in the output.
    """
    if pd.isna(date_str) or date_str in ["NaT", ""]:
        return "—"
    try:
        date = pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        # Best effort: only parsing failures fall back to the raw value.
        # Programming errors (e.g. a missing `months_mk`) and interrupts are
        # no longer swallowed.
        return date_str
    if pd.isna(date):
        # Spellings such as "nan" parse to NaT; treat them as missing too.
        return "—"
    return f"{date.day} {months_mk[date.month]} {date.year}"

In [5]:
def rod_suffix(gender):
    """Return the feminine ending "а" for a female gender label, else "".

    A label counts as female when it is a string whose lowercase form starts
    with "ж". Every other value, including non-strings, yields "".
    """
    is_female = isinstance(gender, str) and gender.lower().startswith("ж")
    return "а" if is_female else ""

def weighted_sample(series):
    """Draw one value from ``series`` with probability proportional to its frequency.

    Args:
        series: pandas Series of observed values.

    Returns:
        One of the distinct values counted by ``series.value_counts()``.

    Raises:
        ValueError: If ``series`` yields no values to count.
    """
    frequencies = series.value_counts(normalize=True)
    if frequencies.empty:
        raise ValueError("weighted_sample() needs at least one value to draw from")
    # Without `weights`, sample() picks uniformly among the distinct values and
    # the relative frequencies computed above would be ignored.
    return frequencies.sample(1, weights=frequencies).index[0]

In [6]:

# Name pools from which generate_macedonian_name() draws a first name
# (by gender) and a surname.
first_names_male = ["Иван", "Горан", "Марко", "Дејан", "Александар"]
first_names_female = ["Марија", "Ана", "Елена", "Софија", "Јелена"]
last_names = ["Петровски", "Иванов", "Марковски", "Димитров", "Александров"]

def _feminine_surname(surname):
    """Return the feminine form of a surname (-ски -> -ска, -ов/-ев -> -ова/-ева)."""
    if surname.endswith("ски"):
        return surname[:-1] + "а"
    if surname.endswith(("ов", "ев")):
        return surname + "а"
    return surname


def generate_macedonian_name(gender):
    """Build a random "<first name> <surname>" matching the given gender.

    Args:
        gender: Gender label. A string whose lowercase form starts with "ж"
            is treated as female. Anything else, including non-string values
            such as None/NaN, is treated as male, mirroring ``rod_suffix``.

    Returns:
        A first name from the gender-specific pool followed by a surname from
        ``last_names``. For female names the surname is put in its feminine
        form so it agrees with the first name.
    """
    # Guard against non-string labels instead of failing on `.lower()`.
    is_female = isinstance(gender, str) and gender.lower().startswith("ж")
    first = random.choice(first_names_female if is_female else first_names_male)
    last = random.choice(last_names)
    if is_female:
        last = _feminine_surname(last)
    return f"{first} {last}"


In [7]:
def generate_biographies(input_df):
    """Return a copy of ``input_df`` with a generated ``biography`` column.

    For every row a template is drawn at random from ``VARIATION_TEMPLATES``
    and filled with the row's label columns; dates are inserted exactly as
    stored in the frame. When the row has a usable ``deathDate`` a closing
    sentence about the death is appended. ``input_df`` itself is not modified.
    """

    def render(person):
        # The template is drawn before any field of the row is read.
        sentence = random.choice(VARIATION_TEMPLATES).format(
            name=person["personLabel"],
            birthDate=person["birthDate"],
            birthPlace=person["birthPlaceLabel"],
            occupation=person["occupationLabel"],
            education=person["educationLabel"],
            rod=rod_suffix(person["genderLabel"]),
        )
        died_on = person.get("deathDate")
        # Skip the death sentence for missing values and the literal 'NaT'.
        if pd.isna(died_on) or died_on == 'NaT':
            return sentence
        sentence += f" Починал{rod_suffix(person['genderLabel'])} на {died_on}."
        return sentence

    biographies = [render(person) for _, person in input_df.iterrows()]
    result = input_df.copy()
    result["biography"] = biographies
    return result

In [8]:
# Knowledge graph: one "Person" node per row, linked to the row's birth place,
# occupation and education nodes by a typed edge.
# NOTE(review): nodes are keyed by their label only, so a label used in more
# than one role shares a single node and its `type` is presumably overwritten
# by the later add_node call - confirm against the networkx documentation.
KG = nx.MultiDiGraph()
for _, row in df.iterrows():
    person = row["personLabel"]
    KG.add_node(person, type="Person", gender=row["genderLabel"])
    # (node label, node type, relation from the person to that node)
    attribute_nodes = (
        (row["birthPlaceLabel"], "BirthPlace", "born_in"),
        (row["occupationLabel"], "Occupation", "works_as"),
        (row["educationLabel"], "Education", "studied_at"),
    )
    # All attribute nodes are registered first, then the edges, in this order.
    for label, node_type, _relation in attribute_nodes:
        KG.add_node(label, type=node_type)
    for label, _node_type, relation in attribute_nodes:
        KG.add_edge(person, label, relation=relation)

In [9]:
def generate_synthetic_people_kg(KG, n=500, birth_dates=None):
    """Create ``n`` synthetic person records by recombining attributes found in ``KG``.

    Args:
        KG: Graph whose ``nodes(data=True)`` yields ``(node, attrs)`` pairs.
            Nodes whose ``type`` attribute is "BirthPlace", "Occupation" or
            "Education" form the pools that values are drawn from.
        n: Number of records to generate.
        birth_dates: Optional iterable of birth-date values to draw from.
            When omitted, the ``birthDate`` column of the module-level ``df``
            is used, as before.

    Returns:
        A DataFrame with one row per synthetic person and the columns
        personLabel, genderLabel, birthPlaceLabel, occupationLabel,
        educationLabel, birthDate and deathDate (always None). For ``n <= 0``
        an empty DataFrame without columns is returned.

    Raises:
        IndexError: If ``n > 0`` and one of the pools is empty (raised by
            ``random.choice``).
    """
    # Bucket the nodes by type in a single pass; order within a pool follows
    # the graph's node order.
    pools = {"BirthPlace": [], "Occupation": [], "Education": []}
    for node, attrs in KG.nodes(data=True):
        node_type = attrs.get("type")
        if node_type in pools:
            pools[node_type].append(node)

    if n <= 0:
        return pd.DataFrame([])

    # Materialise the date pool once instead of re-sampling the column on
    # every iteration. Drawing with `random` keeps all randomness on one RNG.
    date_pool = list(df['birthDate'] if birth_dates is None else birth_dates)
    genders = ["машки", "женски"]

    rows = []
    for _ in range(n):
        gender = random.choice(genders)
        rows.append({
            "personLabel": generate_macedonian_name(gender),
            "genderLabel": gender,
            "birthPlaceLabel": random.choice(pools["BirthPlace"]),
            "occupationLabel": random.choice(pools["Occupation"]),
            "educationLabel": random.choice(pools["Education"]),
            "birthDate": random.choice(date_pool),
            "deathDate": None
        })
    return pd.DataFrame(rows)

In [10]:
# Sentence templates for a short biography; one is picked at random per person.
# The biography builders fill the placeholders via str.format:
# name, birthDate, birthPlace, occupation, education, and rod - the
# gender-dependent ending returned by rod_suffix().
VARIATION_TEMPLATES = [
    "{name} е роден{rod} на {birthDate} во {birthPlace}. По професија е {occupation}. Образованието го стекнал{rod} на {education}.",
    "{name}, роден{rod} во {birthPlace} на {birthDate}, е {occupation}. Образованието го завршил{rod} на {education}.",
    "{name} е {occupation} со образование од {education}. Роден{rod} е на {birthDate} во {birthPlace}.",
    "Роден{rod} на {birthDate} во {birthPlace}, {name} работи како {occupation}. Образованието го стекнал{rod} на {education}.",
    "{name} е {occupation}. Роден{rod} е на {birthDate} во {birthPlace}, образование: {education}.",
]

In [11]:
def post_process_biographies(input_df):
    """Return a copy of ``input_df`` with a ``biography`` column using Macedonian dates.

    Each row gets a template drawn at random from ``VARIATION_TEMPLATES``.
    The birth date, and the death date when one is present, are passed through
    ``format_date_mk`` before being inserted. ``input_df`` is not modified.
    """
    missing_markers = ["NaT", ""]
    texts = []
    for _, person in input_df.iterrows():
        chosen = random.choice(VARIATION_TEMPLATES)
        fields = {
            "name": person["personLabel"],
            "birthDate": format_date_mk(person["birthDate"]),
            "birthPlace": person["birthPlaceLabel"],
            "occupation": person["occupationLabel"],
            "education": person["educationLabel"],
            "rod": rod_suffix(person["genderLabel"]),
        }
        text = chosen.format(**fields)

        # A death sentence is appended only for a real, non-placeholder value.
        death = person.get("deathDate")
        has_death = pd.notna(death) and person["deathDate"] not in missing_markers
        if has_death:
            text += f" Починал{rod_suffix(person['genderLabel'])} на {format_date_mk(person['deathDate'])}."
        texts.append(text)

    result = input_df.copy()
    result["biography"] = texts
    return result

In [12]:
# Biographies for the people loaded from the CSV, tagged with their provenance.
existing_with_bios = generate_biographies(df).assign(source="original")

In [13]:
# 500 synthetic people recombined from the knowledge graph, with biographies,
# tagged with their provenance.
synthetic_df = generate_synthetic_people_kg(KG, n=500)
synthetic_with_bios = generate_biographies(synthetic_df).assign(source="synthetic")

In [14]:
# Stack the original and synthetic people into one frame (original rows first).
# ignore_index=True gives the combined frame a fresh index instead of keeping
# the two source indexes.
final_df = pd.concat(
    [existing_with_bios, synthetic_with_bios],
    ignore_index=True)

In [15]:
# Persist the combined table, biographies included, without the index column.
# NOTE(review): the path is an absolute Google Drive location - presumably
# this notebook only runs in Colab with Drive mounted; confirm.
output_csv = "/content/drive/MyDrive/Vezilka/all_people_with_bios.csv"
final_df.to_csv(output_csv, index=False)

In [16]:
# Plain-text export: one biography per line, UTF-8 encoded.
with open("/content/drive/MyDrive/Vezilka/biographies.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(bio + "\n" for bio in final_df['biography'].tolist())