# Data cleaning and merging for the Viz Project

In [None]:
import pandas as pd

In [None]:
# The three Vienna exports are semicolon-separated and their first line is
# skipped, so the second line of each file is used as the header row.
vienna_csv_options = {"sep": ";", "skiprows": 1}
dogs = pd.read_csv('hunde-vie.csv', **vienna_csv_options)
population_density = pd.read_csv('vie-bez-biz-pop-den-2002f.csv', **vienna_csv_options)
population = pd.read_csv('vie-bez-pop-sex-age5-stk-ori-geo4-2002f.csv', **vienna_csv_options)

# The breed tables are plain comma-separated files with the header on line one.
breed_csv_options = {"sep": ",", "skiprows": 0}
dog_breeds = pd.read_csv('dog_breeds.csv', **breed_csv_options)
breeds_mapping = pd.read_csv('dog_mappings_two.csv', **breed_csv_options)


In [None]:
# Remove the columns that the rest of the notebook never reads.
unused_dog_columns = [
    "NUTS1",
    "NUTS2",
    "NUTS3",
    "SUB_DISTRICT_CODE",
    "Postal_CODE",
    "Ref_Date",
]
dogs = dogs.drop(columns=unused_dog_columns)
dogs.head()

In [None]:
# For "/"-separated breed entries keep only the first part, with surrounding
# whitespace removed.
first_listed_breed = dogs["Dog Breed"].str.split("/").str[0]
dogs["Breed_single"] = first_listed_breed.str.strip()
dogs["Breed_single"].unique()

In [None]:
# Total number of dogs ("Anzahl") per district and single breed name.
breed_counts = dogs.groupby(["DISTRICT_CODE", "Breed_single"])["Anzahl"].sum()
dogs_grouped = breed_counts.reset_index()

# Disabled one-off export of the distinct German breed names:
# breeds = dogs_grouped.groupby("Breed_single", as_index = False).sum()
# breeds["Breed_single"].to_csv('dog_breed_german_names.csv', index = False)

In [None]:
# Keep the 2012 rows, discard the auto-named "Unnamed..." columns, then drop
# the identifier columns that are not used afterwards.
rows_2012 = population_density["REF_YEAR"] == 2012
named_columns = ~population_density.columns.str.contains("^Unnamed")
population_density = population_density.loc[rows_2012, named_columns].drop(
    columns=["NUTS", "SUB_DISTRICT_CODE", "REF_YEAR", "REF_DATE"]
)
population_density.head()


In [None]:
# Preview the first rows of the raw population table.
population.head()

In [None]:
# 1.-2. Keep the 2012 rows and discard the SEX column, so the sum below
#       combines the rows that differed only by sex.
df = population.loc[population["REF_YEAR"] == 2012].drop(columns=["SEX"])

# 3. One row per region/date/age group, summing all remaining columns.
age_group_keys = [
    "NUTS",
    "DISTRICT_CODE",
    "SUB_DISTRICT_CODE",
    "REF_YEAR",
    "REF_DATE",
    "AGE5",
]
grouped = df.groupby(age_group_keys).sum().reset_index()

# 4. Row total across the four origin columns.
origin_columns = ["AUT", "EEA", "REU", "TCN"]
grouped["TOTAL"] = grouped[origin_columns].sum(axis=1)

# 5. Weighted average age per region/date
def weighted_avg_age(group):
    """Return the TOTAL-weighted mean of ``AGE5 * 5`` over the rows of *group*.

    *group* must provide the columns ``AGE5`` and ``TOTAL``.
    """
    # NOTE(review): presumably AGE5 is a 5-year age-group index, so AGE5 * 5
    # turns it into years -- confirm which point of the bin that represents.
    weighted_years = group["AGE5"].mul(group["TOTAL"]).mul(5)
    total_people = group["TOTAL"].sum()
    return weighted_years.sum() / total_people

# Apply the weighted average once per region/date and store it as AVG_AGE.
avg_age_keys = ["NUTS", "DISTRICT_CODE", "SUB_DISTRICT_CODE", "REF_YEAR", "REF_DATE"]
avg_age_by_region = grouped.groupby(avg_age_keys).apply(weighted_avg_age)
avg_age = avg_age_by_region.reset_index(name="AVG_AGE")

# 6. AUT share vs. others
def aut_share(group):
    """Return the AUT total divided by the total of AUT, EEA, REU and TCN.

    *group* must provide the columns ``AUT``, ``EEA``, ``REU`` and ``TCN``.
    """
    austrian = group["AUT"].sum()
    # Sum each non-AUT column first, then add the three column totals.
    non_austrian = group[["EEA", "REU", "TCN"]].sum().sum()
    everyone = austrian + non_austrian
    return austrian / everyone

# Apply the AUT share once per region/date and store it as AUT_RATIO.
aut_ratio_keys = ["NUTS", "DISTRICT_CODE", "SUB_DISTRICT_CODE", "REF_YEAR", "REF_DATE"]
aut_share_by_region = grouped.groupby(aut_ratio_keys).apply(aut_share)
aut_ratio = aut_share_by_region.reset_index(name="AUT_RATIO")

# 7. Merge results
# Join the two per-region results on the full region/date key, then keep only
# DISTRICT_CODE as identifier next to AVG_AGE and AUT_RATIO.
reshape_keys = ["NUTS", "DISTRICT_CODE", "SUB_DISTRICT_CODE", "REF_YEAR", "REF_DATE"]
population_reshape = (
    avg_age
    .merge(aut_ratio, on=reshape_keys)
    .drop(columns=["NUTS", "SUB_DISTRICT_CODE", "REF_YEAR", "REF_DATE"])
)
population_reshape


In [None]:
# Both merges are called without `on`, so pandas joins on whatever column
# names the two frames share (inner join by default).
# NOTE(review): presumably DISTRICT_CODE is the only shared column -- confirm.
dogs_with_density = dogs_grouped.merge(population_density)
dogs_pop = dogs_with_density.merge(population_reshape)

In [None]:
# Attach the breed characteristics to the name mapping by matching the
# mapping's "English name" against the characteristics table's "Breed Name".
breed_join_keys = {"left_on": ["English name"], "right_on": ["Breed Name"]}

# Left join: one row per mapping entry, used by the rest of the notebook.
full_breeds = breeds_mapping.merge(dog_breeds, how='left', **breed_join_keys)

# Outer join: additionally keeps breeds that appear in only one of the tables;
# only referenced by the disabled export below.
full_breeds2 = breeds_mapping.merge(dog_breeds, how='outer', **breed_join_keys)
# full_breeds2.to_csv('breed_info.csv', index = False)

In [None]:
# Left join keeps every district/breed row; rows whose Breed_single has no
# matching "German Breed" get NaN in all breed columns.
dogs_pop_breed = dogs_pop.merge(
    full_breeds,
    how='left',
    left_on="Breed_single",
    right_on="German Breed",
)

In [None]:

# These columns are read as text with "," as decimal mark: swap it for "."
# and convert to float.
for text_column in ('POP_DENSITY', 'POP_VALUE', 'AREA'):
    dogs_pop_breed[text_column] = (
        dogs_pop_breed[text_column].str.replace(',', '.').astype('float64')
    )

# Round the derived population figures to a fixed number of decimals.
for rounded_column, decimals in (('AVG_AGE', 1), ('AUT_RATIO', 3)):
    dogs_pop_breed[rounded_column] = dogs_pop_breed[rounded_column].round(decimals)

In [None]:
# Dogs per inhabitant for each row (Anzahl / POP_VALUE), rounded to 2 decimals
# and stored as DOG_DENSITY on a new frame.
dogs_per_inhabitant = dogs_pop_breed['Anzahl'] / dogs_pop_breed['POP_VALUE']
dogs_pop_breed_der = dogs_pop_breed.assign(DOG_DENSITY=dogs_per_inhabitant.round(2))

In [None]:
# Show the available column names before they are renamed.
print(dogs_pop_breed_der.columns)

In [None]:
# Switch to the snake_case column names used in the exported data set.
dogs_pop_breed_der = dogs_pop_breed_der.rename(columns={
    "DISTRICT_CODE": "district_code",
    "English name": "dog_breed",
    "Dog Breed Group": "dog_breed_group",
    "Anzahl": "dog_count",
    "POP_VALUE": "population",
    "POP_DENSITY": "population_density",
    "AREA": "area_km2",
    "AVG_AGE": "avg_age",
    "Dog Size": "dog_size",
    "Adaptability": "adaptability",
    "All Around Friendliness": "friendliness",
    "Health And Grooming Needs": "health_needs",
    "Trainability": "trainability",
    "Exercise Needs": "exercise_needs",
    "DOG_DENSITY": "dog_density"
})

# Collapse rows that share a district and English breed name (several German
# names can map to one English name): counts are added up, every other column
# takes the first value of the group.
# Rows whose dog_breed is NaN (no English name found) are left out, because
# groupby excludes NaN keys by default.
dogs_pop_breed_der = dogs_pop_breed_der.groupby(["district_code", "dog_breed"]).agg({
    "dog_breed_group": "first",
    "dog_count": "sum",
    "population": "first",
    "population_density": "first",
    "area_km2": "first",
    "avg_age": "first",
    "dog_size": "first",
    "adaptability": "first",
    "friendliness": "first",
    "health_needs": "first",
    "trainability": "first",
    "exercise_needs": "first",
}).reset_index()

# Derive the density from the aggregated count instead of summing the per-row
# values: those were already rounded to 2 decimals, so adding them up
# accumulates rounding error (small ratios each round to 0.00).
# NOTE(review): assumes population is constant within a district -- confirm.
dogs_pop_breed_der["dog_density"] = (
    dogs_pop_breed_der["dog_count"] / dogs_pop_breed_der["population"]
).round(2)


In [None]:
# Columns of the exported data set, in output order.
output_columns = [
    "district_code",
    "dog_breed",
    "dog_breed_group",
    "dog_count",
    "population",
    "population_density",
    "area_km2",
    "avg_age",
    "dog_size",
    "adaptability",
    "friendliness",
    "health_needs",
    "trainability",
    "exercise_needs",
    "dog_density",
]
dogs_clean = dogs_pop_breed_der[output_columns]


In [None]:
# Display the final table.
dogs_clean

In [None]:
# Write the final table to CSV without the index column.
dogs_clean.to_csv('dogs_in_vienna.csv', index = False)

In [None]:
# Breeds that have an English name but no breed-group information, ranked by
# their total dog count over all districts.
# The column is selected explicitly: `.sum("dog_count")` would pass the name
# as the `numeric_only` argument and sum every numeric column instead.
missing_group = dogs_clean[dogs_clean["dog_breed_group"].isnull()]
(
    missing_group
    .groupby("dog_breed")[["dog_count"]]
    .sum()
    .sort_values(by="dog_count", ascending=False)
)