In [None]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (`salary` & `salary K$`).

In [None]:
# Load the raw data (the first CSV column is the row index), turn the "?"
# placeholder into NaN and drop every row that has a missing value.
adult = (
    pd.read_csv("adult.csv", index_col=[0])
    .replace("?", np.nan)
    .dropna()
)

# Keep only rows whose salary label is one of the two known categories.
adult = adult[adult["salary"].isin(["<=50K", ">50K"])]

# Parse the numeric salary so that later cells work with numbers, not text.
# Unparseable entries become NaN and are removed. `assign` builds a new frame,
# which avoids writing into a filtered view of the previous one.
adult = adult.assign(**{"salary K$": pd.to_numeric(adult["salary K$"], errors="coerce")})
adult = adult[adult["salary K$"].notna()]

# Correctness check: the ">50K" label must agree with the numeric amount.
label_matches_amount = (adult["salary K$"] > 50) == (adult["salary"] == ">50K")
adult = adult[label_matches_amount]

adult

# Task 1
Print the count of men and women in the dataset.

In [1]:
# Number of rows for each distinct value of the `sex` column
sex_column = adult.loc[:, "sex"]
sex_counts = sex_column.value_counts()
sex_counts

# Task 2
Find the average age of men in the dataset.

In [None]:
# Select the rows where `sex` is "Male", then average their `age`
male_mask = adult["sex"].eq("Male")
male = adult.loc[male_mask]

average_age_of_men = male.loc[:, "age"].mean()
average_age_of_men

# Task 3
Get the percentage of people from Poland (native-country)

In [None]:
# Rows whose native country is Poland, as a percentage of all rows
from_poland = adult["native-country"].eq("Poland")
poland_count = from_poland.sum()
total_people = len(adult)

percentage_from_poland = (poland_count / total_people) * 100
percentage_from_poland

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [2]:
# Age statistics for people earning more than 50K
high_income_earners = adult[adult["salary"] == ">50K"]
mean_age_high_income = high_income_earners["age"].mean()
std_age_high_income = high_income_earners["age"].std()

# Age statistics for people earning 50K or less
low_income_earners = adult[adult["salary"] == "<=50K"]
mean_age_low_income = low_income_earners["age"].mean()
std_age_low_income = low_income_earners["age"].std()

# Deprecated aliases: the original names were inverted ("smaller" held the
# >50K group and "bigger" the <=50K group). They keep their original values
# so any cell still referring to them behaves as before.
smaller_salary, bigger_salary = high_income_earners, low_income_earners
mean_age_smaller_salary, std_age_smaller_salary = mean_age_high_income, std_age_high_income
mean_age_bigger_salary, std_age_bigger_salary = mean_age_low_income, std_age_low_income

# Displayed order: (>50K mean, >50K std, <=50K mean, <=50K std)
mean_age_high_income, std_age_high_income, mean_age_low_income, std_age_low_income

# Task 5
Check whether there are any people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) but with a > 50K salary.

In [3]:
# Education levels that count as "higher education" for this task
higher_education = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate",
]

# Two row masks over the full frame: salary above 50K, and education outside the list
earns_over_50k = adult["salary"].eq(">50K")
lacks_higher_education = ~adult["education"].isin(higher_education)

high_income = adult.loc[earns_over_50k]
people = adult.loc[earns_over_50k & lacks_higher_education]
people

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# Group the ages by education level and summarise each group with `describe`
ages_by_education = adult["age"].groupby(adult["education"])
education_statistics = ages_by_education.describe()
education_statistics

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married". All others are not.

In [5]:
is_male = adult["sex"] == "Male"
# na=False: a missing marital status counts as "not married" instead of
# putting NaN into the mask, which `~` below could not negate.
is_married = adult["marital-status"].str.startswith("Married", na=False)

married_men = adult[is_married & is_male]
married_men_high_income = (married_men["salary"] == ">50K").sum()
married_men_low_income = (married_men["salary"] == "<=50K").sum()

non_married_men = adult[~is_married & is_male]
non_married_men_high_income = (non_married_men["salary"] == ">50K").sum()
non_married_men_low_income = (non_married_men["salary"] == "<=50K").sum()

if married_men.empty or non_married_men.empty:
    print("Cannot compare: at least one of the two groups is empty.")
else:
    # The groups differ in size, so compare the share of >50K earners within
    # each group rather than the raw head counts.
    married_men_high_share = married_men_high_income / len(married_men)
    non_married_men_high_share = non_married_men_high_income / len(non_married_men)

    print(
        f"Married men: {married_men_high_income} earn >50K and "
        f"{married_men_low_income} earn <=50K ({married_men_high_share:.1%} earn >50K)."
    )
    print(
        f"Non-married men: {non_married_men_high_income} earn >50K and "
        f"{non_married_men_low_income} earn <=50K ({non_married_men_high_share:.1%} earn >50K)."
    )

    if married_men_high_share > non_married_men_high_share:
        print("Married men earn more: a larger share of them is in the >50K category.")
    elif married_men_high_share < non_married_men_high_share:
        print("Non-married men earn more: a larger share of them is in the >50K category.")
    else:
        print("Married and non-married men have the same share of >50K earners.")

# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [6]:
# Largest weekly working time in the data, and how many rows reach it
weekly_hours = adult["hours-per-week"]
max_hours_per_week = weekly_hours.max()
people_with_max_hours = weekly_hours.eq(max_hours_per_week).sum()
max_hours_per_week, people_with_max_hours

# Task 9
Analyze the correlation between the fields of the dataset. Identify which fields are connected and highlight their connections.

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

# Columns that take part in the correlation analysis.
numeric_columns = ["age", "hours-per-week", "salary K$"]

# Select the columns explicitly (a misspelled name raises KeyError instead of
# silently producing an all-NaN column) and coerce them to numbers, so that
# `corr` does not depend on how a text-typed column is treated by the
# installed pandas version.
df = adult[numeric_columns].apply(pd.to_numeric, errors="coerce")
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
# Anchor the colour scale to the full correlation range [-1, 1] so the
# colours are comparable between runs and zero sits at the neutral midpoint.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title("Correlation Matrix Heatmap")
plt.show()