In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (the `salary` label must agree with the `salary K$` value).

In [2]:
import os

adult_df = pd.read_csv(os.path.join("..", "data", "adult.csv"))

# Treat the "?" placeholder as a missing value and drop every row that has one.
# A new frame is produced instead of mutating in place, so the cell can be re-run.
adult_df = adult_df.replace("?", pd.NA).dropna()

# Correctness check: the categorical `salary` label must agree with the numeric
# `salary K$` value. A row is kept only when both describe the same bracket.
salary_over_50k = adult_df["salary K$"] > 50
salary_below_50k = adult_df["salary K$"] <= 50
labelled_over_50k = adult_df["salary"] == ">50K"
labelled_below_50k = adult_df["salary"] == "<=50K"

adults_over_50k = adult_df[labelled_over_50k & salary_over_50k]
adults_below_50k = adult_df[labelled_below_50k & salary_below_50k]

# Report how many rows failed the check so the cleaning step is visible.
inconsistent_rows = len(adult_df) - len(adults_over_50k) - len(adults_below_50k)
print(f"Rows dropped because salary and salary K$ disagree: {inconsistent_rows}")

# The two slices are disjoint, so stacking them rebuilds the validated dataset.
# sort_index puts the rows back in their order within adult_df, and
# reset_index gives the result a fresh 0..n-1 index.
adults_df = (
    pd.concat([adults_over_50k, adults_below_50k])
    .sort_index()
    .reset_index(drop=True)
)
adults_df.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25


# Task 1
Print the count of men and women in the dataset.

In [3]:
# Count rows per sex; summing a boolean mask counts its True values.
male_amount = (adults_df["sex"] == "Male").sum()
female_amount = (adults_df["sex"] == "Female").sum()

# The task asks for the counts to be printed, not just computed.
print(f"Men: {male_amount}, women: {female_amount}")

# Task 2
Find the average age of men in the dataset.

In [4]:
# Pick the ages of the male rows in a single .loc step, then average them.
is_male = adults_df["sex"] == "Male"
average_men_age = adults_df.loc[is_male, "age"].mean()

# Task 3
Get the percentage of people from Poland (native-country)

In [5]:
# Boolean mask of the rows whose native country is Poland.
from_poland = adults_df["native-country"] == "Poland"
total_people = len(adults_df)

# Number of matching rows, then their share of the whole dataset in percent.
amount_of_poles = from_poland.sum()
percentage = (amount_of_poles / total_people) * 100


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [6]:
# Age columns of the two salary groups, named once and reused below.
ages_over_50k = adults_over_50k["age"]
ages_below_50k = adults_below_50k["age"]

# Mean and standard deviation of age for people earning > 50K.
avg_age_over, std_age_over = ages_over_50k.mean(), ages_over_50k.std()

# The same two statistics for people earning <= 50K.
avg_age_below, std_age_below = ages_below_50k.mean(), ages_below_50k.std()


# Task 5
Check whether there are any people without higher education (higher education means `education` is one of: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) but with a salary > 50K.

In [7]:
# Education values that count as higher education for this task.
degrees = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

# High earners whose education is NOT one of the degrees above.
has_degree = adults_over_50k["education"].isin(degrees)
hustlers = adults_over_50k[~has_degree]

# The task is a yes/no check, so answer it explicitly and show the breakdown.
has_hustlers = not hustlers.empty
print(f"People earning > 50K without higher education exist: {has_hustlers} ({len(hustlers)} rows)")
hustlers["education"].value_counts()

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [8]:
# Group the age column by education level, then summarise each group.
age_by_education = adults_df.groupby("education")["age"]
education_age_stats = age_by_education.describe()

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married". All others are non-married.

In [9]:
# Row masks: male rows, and rows whose marital status starts with "Married".
men = adults_df["sex"] == "Male"
married = adults_df["marital-status"].str.startswith("Married")

married_men = adults_df[men & married]
non_married_men = adults_df[men & ~married]

# Summary statistics (count, mean, std, quartiles) of the numeric salary.
# NOTE: despite the `avg_` prefix these hold full describe() tables; the names
# are kept so any later cell that uses them still works.
avg_married_salary = married_men["salary K$"].describe()
avg_non_married_salary = non_married_men["salary K$"].describe()

# The task asks about the >50K vs <=50K brackets, so compare the share of each
# bracket within both groups. fillna(0) covers a bracket missing from a group.
salary_bracket_shares = pd.DataFrame(
    {
        "married": married_men["salary"].value_counts(normalize=True),
        "non-married": non_married_men["salary"].value_counts(normalize=True),
    }
).fillna(0)
salary_bracket_shares


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [10]:
# Number of people for every distinct hours-per-week value, as a two-column
# frame: "hours-per-week" and "count".
people_per_hours = adults_df.groupby("hours-per-week").size()
hour_groups = people_per_hours.rename("count").reset_index()

# Row with the largest hours-per-week value; its "count" is how many people
# work that many hours.
longest_week_label = hour_groups["hours-per-week"].idxmax()
max_hour_amount = hour_groups.loc[longest_week_label]

# Row with the largest "count", i.e. the most common hours-per-week value.
most_common_label = hour_groups["count"].idxmax()
max_hour_group = hour_groups.loc[most_common_label]


# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected and highlight their connection.

In [26]:
# One-hot encode the categorical columns so they can take part in a correlation.
adults_encoded = pd.get_dummies(adults_df)


def get_adults_encoded_mask(regex_filter: str) -> pd.DataFrame:
    """Return the columns of ``adults_encoded`` whose names match ``regex_filter``."""
    return adults_encoded.filter(regex=regex_filter)


def get_concat_salary_mask(columns: pd.Series | pd.DataFrame) -> pd.DataFrame:
    """Return ``columns`` with the ``salary K$`` column of ``adults_encoded`` appended."""
    return pd.concat([columns, adults_encoded["salary K$"]], axis=1)


def get_salary_correlation(frame: pd.DataFrame) -> pd.Series:
    """Return the correlation of every column in ``frame`` with ``salary K$``.

    ``frame`` must contain a ``salary K$`` column. Its self-correlation is
    dropped from the result.
    """
    return frame.corr()["salary K$"].drop("salary K$")


workclass_columns = get_adults_encoded_mask("^workclass_")
education_columns = get_adults_encoded_mask("^education_")

workclass_salary = get_concat_salary_mask(workclass_columns)
education_salary = get_concat_salary_mask(education_columns)

age_salary = get_concat_salary_mask(adults_encoded["age"])

# Rank every workclass, education and age feature by its correlation with the
# numeric salary: strongest positive first, strongest negative last.
salary_correlations = pd.concat(
    [
        get_salary_correlation(workclass_salary),
        get_salary_correlation(education_salary),
        get_salary_correlation(age_salary),
    ]
).sort_values(ascending=False)
print(salary_correlations.to_string())

age_salary.corr()

Unnamed: 0,age,salary K$
age,1.0,0.208203
salary K$,0.208203,1.0
