In [152]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [153]:
people_df = pd.read_csv("../data/adult.csv", index_col=0)
people_df.index.name = "Person id"
people_df = people_df.replace("?", pd.NA).dropna()
people_df[["salary", "salary K$"]].where(people_df["salary K$"] == 50).where(people_df["salary"] != "50K").dropna()
mask1 = (people_df["salary"] == ">50K") & (people_df["salary K$"] > 50)
mask2 = (people_df["salary"] == "<=50K") & (people_df["salary K$"] <= 50)
all_mask = mask1 | mask2
people_df.loc[~all_mask & (people_df["salary"] == ">50K"), "salary"] = "<=50K"
people_df.loc[~all_mask & (people_df["salary"] == "<=50K"), "salary"] = ">50K"
all_mask

Person id
0        True
1        True
2        True
3        True
4        True
         ... 
32556    True
32557    True
32558    True
32559    True
32560    True
Length: 30162, dtype: bool

# Task 1
Print the count of men and women in the dataset.

In [154]:
people_df["sex"].value_counts()

sex
Male      20380
Female     9782
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [155]:
people_df[people_df["sex"] == "Male"]["age"].mean()

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [156]:
lives_in_poland = (people_df["native-country"] == "Poland").sum()
all_people = len(people_df)
(lives_in_poland / all_people) * 100

0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [157]:
mean_age_poorer_people = people_df[people_df["salary"] == "<=50K"]["age"].mean()
std_by_age_poorer_people = people_df[people_df["salary"] == "<=50K"]["age"].std()
mean_age_richer_people = people_df[people_df["salary"] == ">50K"]["age"].mean()
std_by_age_richer_people = people_df[people_df["salary"] == ">50K"]["age"].std()
mean_age_poorer_people, std_by_age_poorer_people, mean_age_richer_people, std_by_age_richer_people

(36.60806038668668, 13.464631257161633, 43.95911028236548, 10.269632835673852)

# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [158]:
mask = ~people_df["education"].isin(["Bachelors", "Masters", "Prof-school", "Assoc-acdm", "Assoc-voc", "Doctorate"])
people_df[mask]

Unnamed: 0_level_0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
Person id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
6,49,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,16,Jamaica,<=50K,49
7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
...,...,...,...,...,...,...,...,...,...,...,...,...
32555,22,Private,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,40,United-States,<=50K,32
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K,40
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K,38


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [159]:
people_df.groupby(people_df["education"])["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [160]:
mans = people_df[(people_df["sex"] == "Male")]
maried_mans = mans[mans["marital-status"].str.startswith("Married")]
not_maried_mans = mans[~mans["marital-status"].str.startswith("Married")]
avg_salary_maried_mans = maried_mans["salary K$"].mean()
avg_salary_not_maried_mans = not_maried_mans["salary K$"].mean()
percentage_difference = ((avg_salary_maried_mans - avg_salary_not_maried_mans) / avg_salary_not_maried_mans) * 100
f"Married mans earns more money on {percentage_difference}%"



'Married mans earns more money on 130.68869053389844%'

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [161]:
work_max_hours_a_week = people_df["hours-per-week"].max()
count_hardworking_people = len(people_df[people_df["hours-per-week"] == work_max_hours_a_week])
f"{count_hardworking_people} peoples works {work_max_hours_a_week} hours per week"

'78 peoples works 99 hours per week'

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [162]:
people_df["salary"].replace("<=50K", 0, inplace=True)
people_df["salary"].replace(">50K", 1, inplace=True)

people_df["sex"].replace("Male", 1, inplace=True)
people_df["sex"].replace("Female", 0, inplace=True)

people_df["race"].replace({
    "White": 1,
    "Black": 2,
    "Asian-Pac-Islander": 3,
    "Amer-Indian-Eskimo": 4,
    "Other": 0
}, inplace=True)

country_percentages = (people_df.groupby("native-country").size() / len(people_df)) * 100
people_df["native-country"] = people_df["native-country"].map(country_percentages)

have_higher_edu = ~people_df["education"].isin(["Bachelors", "Masters", "Prof-school", "Assoc-acdm", "Assoc-voc", "Doctorate"])
people_df["education"] = have_higher_edu.astype(int)

maried = people_df[people_df["marital-status"].str.startswith("Married")]
people_df["marital-status"] = have_higher_edu.astype(int)

people_df["workclass"].replace({
    "Private": 1,
    "Self-emp-not-inc": 2,
    "Self-emp-inc": 3,
    "Federal-gov": 4,
    "Local-gov": 5,
    "State-gov": 6,
    "Without-pay": 0,
}, inplace=True)

df_corr = people_df[["age", "workclass", "education", "sex", "marital-status", "race", "salary", "native-country", "hours-per-week", "salary K$"]].corr()

df_corr


Unnamed: 0,age,workclass,education,sex,marital-status,race,salary,native-country,hours-per-week,salary K$
age,1.0,0.134265,-0.07876,0.081993,-0.07876,-0.014013,0.241998,0.015795,0.101599,0.208203
workclass,0.134265,1.0,-0.15238,-0.005175,-0.15238,0.039882,0.087176,0.050982,0.029121,0.072027
education,-0.07876,-0.15238,1.0,-0.024537,1.0,0.011776,-0.304504,-0.010728,-0.138793,-0.254759
sex,0.081993,-0.005175,-0.024537,1.0,-0.024537,-0.07071,0.216699,-0.00269,0.231268,0.182642
marital-status,-0.07876,-0.15238,1.0,-0.024537,1.0,0.011776,-0.304504,-0.010728,-0.138793,-0.254759
race,-0.014013,0.039882,0.011776,-0.07071,0.011776,1.0,-0.05262,-0.201452,-0.038908,-0.044136
salary,0.241998,0.087176,-0.304504,0.216699,-0.304504,-0.05262,1.0,0.039793,0.22948,0.853894
native-country,0.015795,0.050982,-0.010728,-0.00269,-0.010728,-0.201452,0.039793,1.0,0.010669,0.033647
hours-per-week,0.101599,0.029121,-0.138793,0.231268,-0.138793,-0.038908,0.22948,0.010669,1.0,0.196378
salary K$,0.208203,0.072027,-0.254759,0.182642,-0.254759,-0.044136,0.853894,0.033647,0.196378,1.0
