In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [2]:
df = pd.read_csv("../data/adult.csv")
df = df[df != "?"].dropna()
mask = ((df["salary"] == "<=50K") & (df["salary K$"] > 50)) | ((df["salary"] == ">50K") & (df["salary K$"] <= 50))

if mask.any():
    print(f"ERROR: Inconsistent data in 'salary' and 'salary K$' columns.\n{df[mask]}")
else:
    print("Data is clean and correct.")


Data is clean and correct.


# Task 1
Print the count of men and women in the dataset.

In [3]:
count_by_sex = df["sex"].value_counts()
percent_by_sex = count_by_sex / df.shape[0] * 100
print("Count of men and women:\n", count_by_sex)
print("Percentage of men and women:\n", percent_by_sex)


Count of men and women:
 sex
Male      20380
Female     9782
Name: count, dtype: int64
Percentage of men and women:
 sex
Male      67.568464
Female    32.431536
Name: count, dtype: float64


# Task 2
Find the average age of men in dataset

In [4]:
average_age_men = df.loc[df["sex"] == "Male", "age"].mean()
print(f"Average age of men: {average_age_men:.2f}")


Average age of men: 39.18


# Task 3
Get the percentage of people from Poland (native-country)

In [5]:
percent_from_poland = (df["native-country"] == "Poland").mean() * 100
print(f"Percentage of people from Poland: {percent_from_poland:.2f}")


Percentage of people from Poland: 0.19


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [6]:
mean_age_above_50k = df.loc[df["salary"] == ">50K", "age"].mean()
std_age_above_50k = df.loc[df["salary"] == ">50K", "age"].std()
print(f"Mean age of people earning >50K: {mean_age_above_50k: .2f}")
print(f"Standard deviation of age of people earning >50K: {std_age_above_50k: .2f}")

mean_age_below_50k = df.loc[df["salary"] == "<=50K", "age"].mean()
std_age_below_50k = df.loc[df["salary"] == "<=50K", "age"].std()
print(f"Mean age of people earning <=50K: {mean_age_below_50k: .2f}")
print(f"Standard deviation of age of people earning <=50K: {std_age_below_50k: .2f}")


Mean age of people earning >50K:  43.96
Standard deviation of age of people earning >50K:  10.27
Mean age of people earning <=50K:  36.61
Standard deviation of age of people earning <=50K:  13.46


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [7]:
high_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
mask = (df["salary"] == ">50K") & (~df["education"].isin(high_education))
total_number = df[mask].count()["salary"]

if mask.any():
    print(f"There are {total_number} people with >50K salary, but without higher education.")
else:
    print("All people with >50K salary have higher education.")

There are 3178 people with >50K salary, but without higher education.


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [8]:
education_age_stats = df.groupby("education")["age"].describe()
print(education_age_stats)


               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [9]:
for salary in [">50K", "<=50K"]:
    income_df = df[(df["sex"] == "Male") & (df["salary"] == salary)]
    married_men = income_df[income_df["marital-status"].str.startswith("Married")]["marital-status"].count()
    non_married_men = income_df["marital-status"].count() - married_men
    message = "Married men" if married_men > non_married_men else "Non-married men"
    percentage = married_men / income_df["marital-status"].count()
    print(f"{message} earn a salary of {salary} more often: {percentage:.2%} vs {1-percentage:.2%}.")


Married men earn a salary of >50K more often: 89.48% vs 10.52%.
Married men earn a salary of <=50K more often: 50.43% vs 49.57%.


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [10]:
max_hours_per_week = df["hours-per-week"].max()
num_people_max_hours = df[df["hours-per-week"] == max_hours_per_week].shape[0]

print(f"The max hours per week worked by some person is {max_hours_per_week} hours.")
print(f"{num_people_max_hours} people work the same amount of hours per week.")


The max hours per week worked by some person is 99 hours.
78 people work the same amount of hours per week.


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [11]:
df_numeric = df.select_dtypes(include=["int", "float"])
df_numeric.iloc[:, 1:].corr()


Unnamed: 0,age,hours-per-week,salary K$
age,1.0,0.101599,0.208203
hours-per-week,0.101599,1.0,0.196378
salary K$,0.208203,0.196378,1.0
