In [None]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (the `salary` and `salary K$` columns must agree with each other).

In [None]:
# Load the raw dataset from the CSV file.
filename = "../data/adult.csv"
data = pd.read_csv(filename)

# Data cleaning: drop every row in which at least one column holds the "?" placeholder.
data = data[~data.isin(["?"]).any(axis=1)]

# Correctness check: the categorical `salary` label must agree with the numeric
# `salary K$` amount. Other cells of this notebook spell the labels without
# spaces (">50K", "<=50K"), so whitespace is stripped before comparing; this way
# both the "<=50K" and "<= 50K" spellings are recognised instead of silently
# matching nothing.
salary_label = data["salary"].str.replace(" ", "", regex=False)
salary_amount = data["salary K$"]

# One frame of offending rows per label/amount contradiction.
errors = [
    data.loc[(salary_label == "<=50K") & (salary_amount > 50)],
    data.loc[(salary_label == ">50K") & (salary_amount <= 50)],
    data.loc[(salary_label == "<50K") & (salary_amount >= 50)],
    data.loc[(salary_label == ">=50K") & (salary_amount < 50)],
]

# Show the inconsistent rows as the cell output; an empty frame means the two
# salary columns are consistent.
all_errors = pd.concat(errors)
all_errors

# Task 1
Print the count of men and women in the dataset.

In [None]:
# Count the rows for each sex the same way: sum a boolean mask.
# (The previous `.count()[0]` relied on positional indexing of a label-indexed
# Series, which pandas has deprecated, and counted only the non-null values of
# the first column rather than the rows.)
male = int((data["sex"] == "Male").sum())
female = int((data["sex"] == "Female").sum())
print(f"Male: {male}, Female: {female}")

# Task 2
Find the average age of men in the dataset.

In [None]:
# Keep only the rows whose `sex` is "Male"; this frame is reused by a later cell.
is_male = data["sex"].eq("Male")
male_data = data.loc[is_male]

# Mean of the `age` column over those rows, shown as the cell output.
male_data.loc[:, "age"].mean()

# Task 3
Get the percentage of people from Poland (native-country)

In [None]:
# Share of rows whose `native-country` is "Poland", relative to all rows that
# have a non-null `native-country`, printed with two decimals.
country_data = data["native-country"]
from_poland = country_data.eq("Poland")
poland_people = country_data.loc[from_poland]

known_country_total = country_data.notna().sum()
poland_percentage = len(poland_people) / known_country_total * 100
print(f"{poland_percentage:.2f}")

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [None]:
# Split the rows by salary label; `people_more_50K` is reused by a later cell.
salary_labels = data["salary"]
people_more_50K = data.loc[salary_labels.eq(">50K")]
people_less_equal_50K = data.loc[salary_labels.eq("<=50K")]

# Age mean and standard deviation for the ">50K" group.
ages_more_50K = people_more_50K["age"]
people_more_50K_mean, people_more_50K_std = ages_more_50K.mean(), ages_more_50K.std()
print(f"People who earn more than 50K mean: {people_more_50K_mean}")
print(f"People who earn more than 50K standard deviation: {people_more_50K_std}")

# Same two statistics for the "<=50K" group.
ages_less_equal_50K = people_less_equal_50K["age"]
people_less_equal_50K_mean, people_less_equal_50K_std = (
    ages_less_equal_50K.mean(),
    ages_less_equal_50K.std(),
)
print(f"People who earn less or equal than 50K mean: {people_less_equal_50K_mean}")
print(f"People who earn less or equal than 50K standard deviation: {people_less_equal_50K_std}")



# Task 5
Check whether there are any people without higher education (higher education means one of: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless have a > 50K salary.

In [None]:
# Education levels that the task counts as "higher education".
higher_education_levels = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate",
]

# Among the >50K earners, keep those whose education is NOT one of those levels.
has_higher_education = people_more_50K["education"].isin(higher_education_levels)
people_without_education = people_more_50K.loc[~has_higher_education]
people_without_education

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# Group the rows by education level, then summarise the `age` column per group
# (the describe() table is the cell output).
education = data.groupby("education")
age_by_education = education["age"]
age_by_education.describe()

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married"; all others are considered non-married.

In [None]:
# Per the task definition, a man counts as married when his `marital-status`
# starts with "Married" -- any such status, not only "Married-civ-spouse".
# `na=False` treats a missing status as non-married so the mask stays boolean.
married_mask = male_data["marital-status"].str.startswith("Married", na=False)
married_men = male_data[married_mask]
non_married_men = male_data[~married_mask]

# Share (in percent) of each salary label within each group.
married_salary = married_men["salary"].value_counts(normalize=True) * 100
non_married_salary = non_married_men["salary"].value_counts(normalize=True) * 100

# The label already states the unit, so no "%" is appended after the Series.
print(f"Married (%): {married_salary}")
print(f"Non-married (%): {non_married_salary}")

# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [None]:
# Largest value in the `hours-per-week` column.
hours_per_week = data["hours-per-week"]
max_hours = hours_per_week.max()

# Number of rows that report exactly that maximum (shown as the cell output).
works_max_hours = hours_per_week.eq(max_hours)
hours_per_week.loc[works_max_hours].count()

# Task 9
Analyze the correlations between the fields in the dataset. Identify which fields are connected and print or highlight their connections.

In [None]:
# Correlation is only defined for numeric columns. Recent pandas versions no
# longer drop non-numeric columns silently in DataFrame.corr(), so calling it
# on the full frame fails on the string columns; select the numeric columns
# explicitly first.
numeric_data = data.select_dtypes(include="number")
corr = numeric_data.corr()

# Keep only coefficients whose magnitude exceeds the threshold; the rest become
# NaN. The diagonal (each column with itself) is always 1.0 and can be ignored.
CORR_THRESHOLD = 0.15
strong_corr = corr[corr.abs() > CORR_THRESHOLD]
print("Correlations:\n", strong_corr)