In [2]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [3]:
file_path = r"..\data\adult.csv"
data = pd.read_csv(file_path)

data.replace("?", np.nan, inplace=True)
data.dropna(inplace=True)

salary_is_correct = len(data[~data.salary.isin(["<=50K", ">50K"])]) == 0
salary_k_is_correct = len(data[data["salary K$"] <= 0]) == 0

salary_is_correct, salary_k_is_correct


(True, True)

# Task 1
Print the count of men and women in the dataset.

In [4]:
data["sex"].value_counts()

sex
Male      20380
Female     9782
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [5]:
data[data["sex"] == "Male"]["age"].mean()

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [6]:
polish_count = len((data[data["native-country"] == "Poland"]))
total_count = len(data)

polish_percentage = polish_count / total_count * 100
print(f"Percentage of polish citizens : {round(polish_percentage, 2)} %")

Percentage of polish citizens : 0.19 %


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [7]:
gt_50k_data = data[data["salary"] == ">50K"]
mean_age_gt_50k = gt_50k_data["age"].mean()
std_age_gt_50k = gt_50k_data["age"].std()

print(f"50k earners mean age: {mean_age_gt_50k}")
print(f"50k earners std age: {std_age_gt_50k}")

le_50k_data = data[data["salary"] == "<=50K"]
mean_age_le_50k = le_50k_data["age"].mean()
str_age_le_50k = le_50k_data["age"].std()

print(f"less than 50k earners mean age: {mean_age_le_50k}")
print(f"less than 50k earners std age: {str_age_le_50k}")


50k earners mean age: 43.95911028236548
50k earners std age: 10.269632835673852
less than 50k earners mean age: 36.60806038668668
less than 50k earners std age: 13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [8]:
higher_education = ["Bachelors", "Masters", "Assoc-acdm", "Assoc-voc", "Doctorate"]
high_education_mask = data["education"].isin(higher_education)
high_salary_mask = data["salary"] == ">50K"

len(data[~high_education_mask & high_salary_mask])


3584

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [9]:
data.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [10]:
valid_gt_50k = data[data["salary"] == ">50K"]["salary K$"] > 50
valid_le_50k = data[data["salary"] == "<=50K"]["salary K$"] <= 50

invalid_gt_50k_count = len(valid_gt_50k[valid_gt_50k == False])
invalid_le_50k_count = len(valid_le_50k[valid_le_50k == False])

if invalid_gt_50k_count != 0 or invalid_le_50k_count != 0:
    raise ValueError("Wrong salary data")

high_income_married_men_count = len(data[(data["sex"] == "Male") & (data["salary"] == ">50K") & (data["marital-status"].str.startswith("Married"))])
high_income_unmarried_men_count = len(data[(data["sex"] == "Male") & (data["salary"] == ">50K") & (~data["marital-status"].str.startswith("Married"))])

low_income_married_men_count = len(data[(data["sex"] == "Male") & (data["salary"] == "<=50K") & (data["marital-status"].str.startswith("Married"))])
low_income_unmarried_men_count = len(data[(data["sex"] == "Male") & (data["salary"] == "<=50K") & (~data["marital-status"].str.startswith("Married"))])

print(
    f"Number of high-income married men: {high_income_married_men_count},"
    f"Number of high-income unmarried men: {high_income_unmarried_men_count},"
    f"Number of low-income married men: {low_income_married_men_count},"
    f"Number of low-income unmarried men: {low_income_unmarried_men_count}"
)


Number of high-income married men: 5723,Number of high-income unmarried men: 673,Number of low-income married men: 7052,Number of low-income unmarried men: 6932


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [11]:
max_work_hours = data["hours-per-week"].max()
print(f"Max work hours: {max_work_hours}")
max_work_hours_people_count = len(data[data["hours-per-week"] == max_work_hours])
print(f"Number of workers working for {max_work_hours} hours a week is {max_work_hours_people_count}")

Max work hours: 99
Number of workers working for 99 hours a week is 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [12]:
data_encoded = pd.get_dummies(data)
correlation_matrix = data_encoded.corr()

strong_correlations = (correlation_matrix.abs() > 0.5) & (correlation_matrix.abs() < 1)

strong_correlation_pairs = [(col1, col2) for col1 in strong_correlations.columns for col2 in strong_correlations.columns if strong_correlations.loc[col1, col2]]

print("Strong Correlations:")
for col1, col2 in strong_correlation_pairs:
    print(f"{col1} - {col2}: {correlation_matrix.loc[col1, col2]}")

Strong Correlations:
age - marital-status_Never-married: -0.5240030196960974
salary K$ - salary_<=50K: -0.8538940815506783
salary K$ - salary_>50K: 0.8538940815506784
workclass_Private - workclass_Self-emp-not-inc: -0.5055879319206624
workclass_Self-emp-not-inc - workclass_Private: -0.5055879319206624
marital-status_Married-civ-spouse - marital-status_Never-married: -0.6448616970319305
marital-status_Married-civ-spouse - relationship_Husband: 0.8965024645436726
marital-status_Married-civ-spouse - relationship_Not-in-family: -0.5464005298431114
marital-status_Never-married - age: -0.5240030196960974
marital-status_Never-married - marital-status_Married-civ-spouse: -0.6448616970319305
marital-status_Never-married - relationship_Husband: -0.57890342430291
relationship_Husband - marital-status_Married-civ-spouse: 0.8965024645436726
relationship_Husband - marital-status_Never-married: -0.57890342430291
relationship_Husband - sex_Female: -0.5812208858275799
relationship_Husband - sex_Male: 0