In [None]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (the `salary` label must agree with the `salary K$` value).

In [None]:
# Load the raw table. where() turns every "?" cell into NaN, and dropna() then
# discards each row holding at least one NaN (including NaNs already in the file).
adult_raw = pd.read_csv("../data/adult.csv")
df = adult_raw.where(adult_raw != "?").dropna()

# A row is inconsistent when its "salary" label disagrees with its "salary K$" value.
low_label_high_value = (df["salary"] == "<=50K") & (df["salary K$"] > 50)
high_label_low_value = (df["salary"] == ">50K") & (df["salary K$"] <= 50)
mask = low_label_high_value | high_label_low_value

if not mask.any():
    print("Data is clean and correct.")
else:
    print(f"ERROR: Inconsistent data in 'salary' and 'salary K$' columns.\n{df[mask]}")


# Task 1
Print the count of men and women in the dataset.

In [None]:
# Number of rows per value of "sex", and the same numbers as a percentage of all rows.
count_by_sex = df["sex"].value_counts()
percent_by_sex = count_by_sex.div(len(df)).mul(100)

for heading, values in (
    ("Count of men and women:\n", count_by_sex),
    ("Percentage of men and women:\n", percent_by_sex),
):
    print(heading, values)


# Task 2
Find the average age of men in the dataset.

In [None]:
# Pick the ages of the rows whose "sex" is "Male", then average them.
male_ages = df["age"][df["sex"].eq("Male")]
average_age_men = male_ages.mean()
print(f"Average age of men: {average_age_men:.2f}")


# Task 3
Get the percentage of people from Poland (native-country)

In [None]:
# The mean of a boolean Series is the fraction of True values; scale it to percent.
is_from_poland = df["native-country"].eq("Poland")
percent_from_poland = is_from_poland.mean() * 100
print(f"Percentage of people from Poland: {percent_from_poland:.2f}")


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [None]:
# Age mean and standard deviation per salary bracket.
# Each bracket's ages are selected once and reused for both statistics.
# The format spec is ":.2f" (no space flag) so the output has a single space after
# the colon, matching the other cells of this notebook.
ages_above_50k = df.loc[df["salary"] == ">50K", "age"]
mean_age_above_50k = ages_above_50k.mean()
std_age_above_50k = ages_above_50k.std()  # pandas default: sample std (ddof=1)
print(f"Mean age of people earning >50K: {mean_age_above_50k:.2f}")
print(f"Standard deviation of age of people earning >50K: {std_age_above_50k:.2f}")

ages_below_50k = df.loc[df["salary"] == "<=50K", "age"]
mean_age_below_50k = ages_below_50k.mean()
std_age_below_50k = ages_below_50k.std()
print(f"Mean age of people earning <=50K: {mean_age_below_50k:.2f}")
print(f"Standard deviation of age of people earning <=50K: {std_age_below_50k:.2f}")


# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless have a salary > 50K.

In [None]:
# Education values that this task counts as "higher education".
high_education = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate",
]

# Rows labelled ">50K" whose education is not in the list above.
earns_above_50k = df["salary"] == ">50K"
has_high_education = df["education"].isin(high_education)
mask = earns_above_50k & ~has_high_education
total_number = df.loc[mask, "salary"].count()

if not mask.any():
    print("All people with >50K salary have higher education.")
else:
    print(f"There are {total_number} people with >50K salary, but without higher education.")

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# Group the "age" column by each row's "education" value and summarise every group
# with describe().
education_age_stats = df["age"].groupby(df["education"]).describe()
print(education_age_stats)


# Task 7
Compare the salaries of married and non-married men. Who earns more (>50K vs. <=50K)?
Married men are those whose `marital-status` starts with "Married"; all others are considered non-married.

In [None]:
# For each salary bracket, report which group of men (married / non-married) is the
# larger one and the share each group holds within that bracket.
for salary in [">50K", "<=50K"]:
    income_df = df[(df["sex"] == "Male") & (df["salary"] == salary)]
    total_men = len(income_df)

    # Guard against an empty bracket, which would otherwise divide by zero below.
    if total_men == 0:
        print(f"No men with a salary of {salary} found.")
        continue

    married_men = int(income_df["marital-status"].str.startswith("Married").sum())
    non_married_men = total_men - married_men

    # On a tie the non-married group is named, as before.
    if married_men > non_married_men:
        message, leading_count = "Married men", married_men
    else:
        message, leading_count = "Non-married men", non_married_men

    # The first percentage always belongs to the group named in the message, so the
    # sentence "<group> ... more often: X% vs Y%" reads correctly in both cases.
    percentage = leading_count / total_men
    print(f"{message} earn a salary of {salary} more often: {percentage:.2%} vs {1-percentage:.2%}.")


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [None]:
# Largest "hours-per-week" value, and how many rows carry exactly that value.
weekly_hours = df["hours-per-week"]
max_hours_per_week = weekly_hours.max()
num_people_max_hours = int((weekly_hours == max_hours_per_week).sum())

print(f"The max hours per week worked by some person is {max_hours_per_week} hours.")
print(f"{num_people_max_hours} people work the same amount of hours per week.")


# Task 9
Analyze the correlations between the fields in the dataset. Identify which fields are connected and print or highlight their connections.

In [None]:
# Keep only the integer and float columns.
df_numeric = df.select_dtypes(include=["int", "float"])

# NOTE(review): the first numeric column is left out of the correlation; presumably
# it is a row-index column carried over from the CSV. Confirm against the data.
correlation_matrix = df_numeric.iloc[:, 1:].corr()
correlation_matrix
