In [8]:
import pandas as pd
import re

# Canonicalize free-text names so they can be compared or used as join keys.
def normalize_string(s):
    """Return a lower-cased, punctuation-free, whitespace-collapsed copy of *s*.

    Missing values (anything for which ``pd.isna`` is true, e.g. ``None`` or
    NaN) are returned unchanged so the caller decides how to treat them.

    Only Latin letters ``a-z``, Cyrillic letters ``а-я`` plus ``ё``, digits
    and whitespace are kept; every other character is deleted (not replaced
    by a space).  Runs of whitespace then collapse to a single space and
    leading/trailing whitespace is removed.
    """
    if pd.isna(s):
        return s
    s = s.lower()
    # 'ё' (U+0451) sits outside the contiguous а-я range (U+0430-U+044F), so
    # it has to be listed explicitly; otherwise names such as "Орёл" would
    # silently lose a letter.
    s = re.sub(r'[^a-zа-яё0-9\s]', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

# Load the raw city population table.
df = pd.read_csv('./additional_data/city_population.csv')

# A key needs both a settlement and a region.  normalize_string() passes
# missing values through unchanged, and concatenating NaN with a string raises
# TypeError, so rows lacking either part are dropped before the key is built.
# .copy() gives an independent frame that is safe to add columns to.
df = df.dropna(subset=['settlement', 'region']).copy()

# Key = "<normalized settlement> <normalized region>".  Zipping the two columns
# avoids building a Series per row (as DataFrame.apply(axis=1) does) and also
# works when no rows are left.
df['key'] = [
    normalize_string(settlement) + ' ' + normalize_string(region)
    for settlement, region in zip(df['settlement'], df['region'])
]

# Keep only the columns written to the output; copy so the assignment below
# modifies this frame rather than a view of the wider one.
df = df[['id', 'key', 'population']].copy()

# Unparseable or missing populations become 0 so they never win the
# per-key maximum below against a real value.
df['population'] = pd.to_numeric(df['population'], errors='coerce').fillna(0)

# For every key keep the single row with the largest population, then
# renumber the rows from 0.
df = df.loc[df.groupby('key')['population'].idxmax()].reset_index(drop=True)

# Save the cleaned dataframe to a new CSV file (without the index column).
df.to_csv('cleaned_population.csv', index=False)
