In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (`salary` & `salary K$`).

In [None]:
# Load the raw CSV and clean it in one chained pipeline:
#   1. turn "?" placeholders into missing values,
#   2. turn non-positive "salary K$" amounts into missing values,
#   3. drop every row that has a missing value in any column,
#   4. rebuild the "salary" label from "salary K$" so the two columns agree.
data_set = (
    pd.read_csv("../data/adult.csv")
    .replace("?", np.nan)
    .assign(
        **{
            "salary K$": lambda frame: np.where(
                frame["salary K$"] <= 0, np.nan, frame["salary K$"]
            )
        }
    )
    .dropna()
    .assign(
        salary=lambda frame: np.where(frame["salary K$"] <= 50, "<=50K", ">50K")
    )
)

data_set.head()

# Task 1
Print the count of men and women in the dataset.

In [None]:
# Number of rows for each value of the `sex` column.
sex_column = data_set["sex"]
sex_column.value_counts()

# Task 2
Find the average age of men in the dataset.

In [None]:
# Pick the ages of the male rows with a single .loc lookup, then average them.
is_male = data_set["sex"] == "Male"
data_set.loc[is_male, "age"].mean()

# Task 3
Get the percentage of people from Poland (native-country)

In [None]:
# Share of rows whose native country is Poland, expressed in percent.
poland_count = data_set["native-country"].eq("Poland").sum()
poland_count / len(data_set) * 100

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [None]:
# Mean age of the people labelled as earning more than 50K.
earns_over_50k = data_set["salary"] == ">50K"
data_set.loc[earns_over_50k, "age"].mean()

In [None]:
# Standard deviation of age for the people labelled as earning more than 50K.
earns_over_50k = data_set["salary"] == ">50K"
data_set.loc[earns_over_50k, "age"].std()

In [None]:
# Mean age of the people labelled as earning 50K or less.
earns_up_to_50k = data_set["salary"] == "<=50K"
data_set.loc[earns_up_to_50k, "age"].mean()

In [None]:
# Standard deviation of age for the people labelled as earning 50K or less.
earns_up_to_50k = data_set["salary"] == "<=50K"
data_set.loc[earns_up_to_50k, "age"].std()

# Task 5
Check whether there are any people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) but with a salary > 50K.

In [None]:
# Rows of people whose education is not one of the higher-education levels
# listed in the task but whose salary label is still ">50K".
higher_education = ("Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate")
lacks_higher_education = ~data_set["education"].isin(higher_education)
earns_over_50k = data_set["salary"] == ">50K"

data_set[lacks_higher_education & earns_over_50k]

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# Descriptive statistics of age, computed separately for each education level.
age_by_education = data_set.groupby("education")["age"]
age_by_education.describe()

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married"; all other men are considered non-married.

In [None]:
# Build the two row masks once and reuse them for both groups of men.
male_rows = data_set["sex"] == "Male"
married_rows = data_set["marital-status"].str.startswith("Married")

married_men = data_set[male_rows & married_rows]
not_married_men = data_set[male_rows & ~married_rows]

# Head-count per salary label inside each group.
married_men_more = married_men["salary"].eq(">50K").sum()
married_men_less = married_men["salary"].eq("<=50K").sum()
not_married_men_more = not_married_men["salary"].eq(">50K").sum()
not_married_men_less = not_married_men["salary"].eq("<=50K").sum()

# Keyword order fixes the row order of the displayed Series.
compare_dict = dict(
    married_men_less=married_men_less,
    married_men_more=married_men_more,
    not_married_men_less=not_married_men_less,
    not_married_men_more=not_married_men_more,
)

pd.Series(compare_dict)

# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [None]:
# The task asks two things: the maximum weekly hours in the dataset and how
# many people work exactly that many hours. Compute the maximum once, reuse it
# for the count, and display both answers together.
max_hours = data_set["hours-per-week"].max()
people_at_max_hours = (data_set["hours-per-week"] == max_hours).sum()

pd.Series(
    {
        "max hours-per-week": max_hours,
        "people working max hours": people_at_max_hours,
    }
)

# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected and highlight their connection.

In [None]:
# Pairwise correlation between age, weekly working hours and salary (in K$).
correlation_matrix = data_set[["age", "hours-per-week", "salary K$"]].corr()

# Draw on an explicit Axes instead of relying on pyplot's implicit current
# figure, so the title is attached to the same Axes as the heatmap.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()