In [1]:
data = [    {"name": "Heinrich GALINSKY", "birthdate": "29.01.1928", "country": "POLEN", "nickname": "Henri"},    {"name": "Johann GALINSKY", "birthdate": "15.02.1932", "country": "POLEN", "nickname": "Jean"},    {"name": "Josef GALINSKY", "birthdate": "27.12.1933", "country": "POLEN", "nickname": "Joseph"},    {"name": "Karl GALINSKY", "birthdate": "02.11.1937", "country": "POLEN", "nickname": "Charles"},    {"name": "Wolf GALINSKY", "birthdate": "27.05.1930", "country": "POLEN", "nickname": None}]


In [65]:
import re

def parse_document(document):
    parsed_data = []
    main_data = False
    
    for line in document.split('\n'):
        if main_data:
            current_record = re.match(r'([^\d]+)\s+(\d{2}\.\d{2}\.\d{4}|UNBEKANNT|e\s+ABELE)\s+(.+)', line)
            if current_record:
                surname_and_name, dob, nationality = current_record.groups()
                full_name = surname_and_name.strip()  # Store the full name string before parsing
                surname_and_name = surname_and_name.rsplit(' ', 1)
                
                if len(surname_and_name) == 2:
                    surname, name = surname_and_name[0], surname_and_name[1]
                else:
                    surname, name = surname_and_name[0], ''
                
                if name.startswith('('):
                    name = name[1:]
                
                parsed_data.append({
                    'surname': surname.strip().title(),
                    'name': [name.strip().title()],
                    'dob': dob.strip(),
                    'nationality': nationality.strip(),
                    'full_name': full_name  # Add the full name to the parsed_data dictionary
                })
            else:
                # Additional names
                current_name = re.match(r'\w+', line)
                if current_name and parsed_data:
                    parsed_data[-1]['name'].append(current_name.group().strip())
        
        if line.startswith("--------------------------------------------------------------------------------------------"):
            main_data = True

    return parsed_data

with open('raw_data.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
    parsed = parse_document(raw_text)


In [66]:
for count, person in enumerate(parsed):
    if 'ewski' in person['full_name'].lower():
        print(person)

{'surname': 'Bartoszewski', 'name': ['Zenon'], 'dob': '01.10.1927', 'nationality': 'POLEN', 'full_name': 'BARTOSZEWSKI Zenon'}
{'surname': 'Berkowicz-Rynarzewski', 'name': ['Else'], 'dob': '24.01.1914', 'nationality': 'STAATENLOS (POLEN)', 'full_name': 'BERKOWICZ-RYNARZEWSKI Else'}
{'surname': 'Chmielewski', 'name': ['Irene'], 'dob': '02.08.1912', 'nationality': 'POLEN', 'full_name': 'CHMIELEWSKI IRENE'}
{'surname': 'Chwaliszewski', 'name': ['Sabina'], 'dob': '03.06.1919', 'nationality': 'POLEN', 'full_name': 'CHWALISZEWSKI Sabina'}
{'surname': 'Chwaliszewski', 'name': ['Zbigniew'], 'dob': '13.05.1906', 'nationality': 'POLEN', 'full_name': 'CHWALISZEWSKI Zbigniew'}
{'surname': 'Cisinski-Miodoszewski', 'name': ['Rosa'], 'dob': '26.12.1908', 'nationality': 'POLEN', 'full_name': 'CISINSKI-MIODOSZEWSKI Rosa'}
{'surname': 'Dluzniewski Jacques', 'name': ['Haim'], 'dob': '03.05.1932', 'nationality': 'FRANKREICH', 'full_name': 'DLUZNIEWSKI Jacques Haim'}
{'surname': 'Dobrzelewski', 'name': ['W

In [67]:
import re
from pydantic import BaseModel, Field, validator
from typing import List

def is_valid_date(date_string: str) -> bool:
    date_pattern = r'^\d{1,2}\.\d{1,2}\.\d{4}$'
    if re.match(date_pattern, date_string):
        return True
    return False

class Name(BaseModel):
    first_name: str
    last_name: str

class Person(BaseModel):
    surname: str
    name: List[str]
    dob: str
    nationality: str
    full_name: str  # Add the new field to the Person class

    @validator("dob")
    def validate_dob(cls, dob):
        if not is_valid_date(dob):
            raise ValueError("Invalid date format")
        return dob

    @validator("name", pre=True)
    def split_name(cls, name):
        if not isinstance(name, list):
            raise ValueError("Name must be a list")
        return [n.strip() for n in name]

parsed = parse_document(raw_text)
valid_entries = 0
invalid_entries = 0

for person_data in parsed:
    try:
        person = Person(**person_data)
        valid_entries += 1
    except ValueError as e:
        invalid_entries += 1
        print("Invalid entry:", person_data)
        print("Error:", e)

print(f"Valid entries: {valid_entries}")
print(f"Invalid entries: {invalid_entries}")


Valid entries: 25476
Invalid entries: 0


In [68]:
import csv

# Define the header for the CSV file
header = ['Surname', 'Name', 'Date of Birth', 'Nationality', 'Full Name']

# Open the CSV file for writing
with open('WW2_Refugees_Registered_in_Geneva.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)

    # Write the header to the CSV file
    writer.writerow(header)

    # Write the parsed data to the CSV file
    for person_data in parsed:
        row = [
            person_data['surname'],
            ', '.join(person_data['name']),
            person_data['dob'],
            person_data['nationality'],
            person_data['full_name']
        ]
        writer.writerow(row)


In [69]:
import pandas as pd

# Create a DataFrame from the parsed data
data = [
    {
        'Surname': person_data['surname'],
        'Name': ', '.join(person_data['name']),
        'Date of Birth': person_data['dob'],
        'Nationality': person_data['nationality'],
        'Full Name': person_data['full_name']
    }
    for person_data in parsed
]
df = pd.DataFrame(data)

# Display the first 10 rows of the DataFrame
df.head(10)

Unnamed: 0,Surname,Name,Date of Birth,Nationality,Full Name
0,Aardewerk,Jansje-Betty,12.02.1924,NIEDERLANDE,AARDEWERK Jansje-Betty
1,Abas,"Rudolf, ABBE",17.07.1918,NIEDERLANDE,ABAS Rudolf
2,Abbondanza,Raimondo,17.12.1922,ITALIEN,ABBONDANZA Raimondo
3,Abbondati,Piero,23.10.1906,ITALIEN,ABBONDATI Piero
4,Abbuhl,Alexandre,16.04.1900,SCHWEIZ,ABBUHL ALEXANDRE
5,Abbuhl,Rose-Marie-Madeleine,01.03.1905,SCHWEIZ,ABBUHL ROSE-MARIE-MADELEINE
6,Abd El Kader Ben,Haous,05.06.1916,ALGERIEN,ABD EL KADER Ben Haous
7,Abdeslam Ben,Abdeslam,01.01.1918,UNBEKANNT,ABDESLAM Ben Abdeslam
8,Abdul-Hadi,Nuraldin,15.06.1917,PALASTINA,ABDUL-HADI NURALDIN
9,Abdul-Hadi-Walther,Marie-Anne,20.03.1918,PALASTINA,ABDUL-HADI-WALTHER MARIE-ANNE
