In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [105]:
df = pd.read_csv("../data/adult.csv")
clean_df = df.replace("?", np.nan).dropna()


less_than_fifty_k = clean_df[clean_df["salary K$"] <= 50].shape[0]
less_than_fifty_k_records = clean_df[clean_df["salary"] == "<=50K"].shape[0]

if less_than_fifty_k == less_than_fifty_k_records:
    print("Amount of <=50K records is correct")

more_than_fifty_k = clean_df[clean_df["salary K$"] > 50].shape[0]
more_than_fifty_k_records = clean_df[clean_df["salary"] == ">50K"].shape[0]

if more_than_fifty_k == more_than_fifty_k_records:
    print("Amount of >50K records is correct")

clean_df.head()

Amount of <=50K records is correct
Amount of >50K records is correct


Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25


# Task 1
Print the count of men and women in the dataset.

In [3]:
clean_df["sex"].value_counts()

sex
Male      20380
Female     9782
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [4]:
clean_df[clean_df["sex"] == "Male"]["age"].mean()

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [5]:
polish = clean_df[clean_df["native-country"] == "Poland"].shape[0]
percentage_of_polish = polish / len(clean_df) * 100
percentage_of_polish


0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [6]:
people_more_than_fifty = clean_df[clean_df["salary"] == ">50K"]
people_more_age = people_more_than_fifty["age"]
more_than_fifty_mean = people_more_age.mean()
more_than_fifty_std = people_more_age.std()

print(f"> 50K mean: {more_than_fifty_mean}, std: {more_than_fifty_std}")


people_less_than_fifty = clean_df[clean_df["salary"] == "<=50K"]
people_less_age = people_less_than_fifty["age"]
less_than_fifty_mean = people_less_age.mean()
less_than_fifty_std = people_less_age.std()

print(f"<= 50K mean: {less_than_fifty_mean}, std: {less_than_fifty_std}")

> 50K mean: 43.95911028236548, std: 10.269632835673852
<= 50K mean: 36.60806038668668, std: 13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [7]:
high_ed = pd.Series(["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"])
people_without_high_ed = clean_df[~clean_df["education"].isin(high_ed)] 
people_with_high_salary = people_without_high_ed[people_without_high_ed["salary"] == ">50K"]

people_with_high_salary

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
55,55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
67,67,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,>50K,225
68,68,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,>50K,194
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32462,32462,48,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,50,United-States,>50K,343
32518,32518,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,116
32519,32519,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,>50K,239
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [8]:
clean_df.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [16]:
married_salaries = clean_df[
    (
        clean_df["marital-status"]
        .str.startswith("Married")
    ) &
    (
        clean_df["sex"] == "Male"
    )
][["salary", 'salary K$']]
married_salaries_mean = married_salaries["salary K$"].mean()

non_married_salaries = clean_df[
    (
        ~clean_df["marital-status"]
        .str.startswith("Married")
    ) &
    (
        clean_df["sex"] == "Male"
    )
][["salary", 'salary K$']]
non_married_salaries_mean = non_married_salaries["salary K$"].mean()

pd.Series(
    [married_salaries_mean, non_married_salaries_mean],
    index=["married_mean", "non_married_mean"]
)

married_mean        107.494560
non_married_mean     46.597239
dtype: float64

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [30]:
max_hours_per_week = clean_df["hours-per-week"].max()
count_people = clean_df[
    clean_df["hours-per-week"] == max_hours_per_week
].shape[0]

print(f"{count_people} people work {max_hours_per_week} hrs/week")

78 people work 99 hrs/week


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [120]:
clean_df["workclass_id"] = clean_df.groupby("workclass").ngroup()

clean_df["education_id"] = clean_df.groupby("education").ngroup()

clean_df["marital_status_id"] = clean_df.groupby("marital-status").ngroup()

clean_df["occupation_id"] = clean_df.groupby("occupation").ngroup()

clean_df["race_id"] = clean_df.groupby("race").ngroup()

clean_df["is_male"] = np.where(clean_df["sex"] == "Male", 1, 0)

clean_df["native_country_id"] = clean_df.groupby("native-country").ngroup()

res = clean_df.corr(numeric_only=True).drop(
    index="Unnamed: 0", columns="Unnamed: 0", axis=1
)

print(
    "There are lots of tendencies:\n"
    "1) The older a person is, the more hours a week he works (correlation coefficient 10%).\n"
    "2) The older a person is, the higher his annual salary is (correlation coefficient 20%).\n"
    "3) A person's age greatly depends on their marital status (correlation coefficient -28%).\n"
    "4) There is a dependence that the more time a person works in a week, "
    "the higher his salary (correlation coefficient 20%).\n"
    "5) Men spend more time at work per week than women (correlation coefficient 23%).\n"
    "6) Men's wages are higher than women's (correlation coefficient 18%).\n"
    "7) Race depends on a person's native country (obviously :))"
)
res


There are lots of tendencies:
1) The older a person is, the more hours a week he works (correlation coefficient 10%).
2) The older a person is, the higher his annual salary is (correlation coefficient 20%).
3) A person's age greatly depends on their marital status (correlation coefficient 41%).
4) There is a dependence that the more time a person works in a week, the higher his salary (correlation coefficient 20%).
5) Men spend more time at work per week than women (correlation coefficient 23%).
6) Men's wages are higher than women's (correlation coefficient 18%).
7) Race depends on a person's native country (obviously :))


Unnamed: 0,age,hours-per-week,salary K$,workclass_id,education_id,marital_status_id,occupation_id,race_id,is_male,native_country_id
age,1.0,0.101599,0.208203,0.08054,-0.001111,-0.276373,-0.005682,0.023374,0.081993,-0.001905
hours-per-week,0.101599,1.0,0.196378,0.050724,0.059887,-0.189003,0.018365,0.048532,0.231268,0.008408
salary K$,0.208203,0.196378,1.0,0.009948,0.067737,-0.165185,0.035984,0.060497,0.182642,0.019605
workclass_id,0.08054,0.050724,0.009948,1.0,0.017855,-0.034241,0.015572,0.044731,0.074973,0.007668
education_id,-0.001111,0.059887,0.067737,0.017855,1.0,-0.040664,-0.038212,0.011154,-0.027888,0.07879
marital_status_id,-0.276373,-0.189003,-0.165185,-0.034241,-0.040664,1.0,0.022655,-0.068627,-0.119813,-0.025902
occupation_id,-0.005682,0.018365,0.035984,0.015572,-0.038212,0.022655,1.0,0.000717,0.062313,-0.003483
race_id,0.023374,0.048532,0.060497,0.044731,0.011154,-0.068627,0.000717,1.0,0.089186,0.124514
is_male,0.081993,0.231268,0.182642,0.074973,-0.027888,-0.119813,0.062313,0.089186,1.0,0.000618
native_country_id,-0.001905,0.008408,0.019605,0.007668,0.07879,-0.025902,-0.003483,0.124514,0.000618,1.0
