In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [2]:
df = pd.read_csv("../data/adult.csv")
df = df.replace("?", np.nan)
df = df.dropna()
df = df[df["salary"].isin(["<=50K", ">50K"])]
df["salary K$"] = pd.to_numeric(df["salary K$"], errors="coerce")
df = df.dropna(subset=["salary K$"])
df

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K,36
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173
32558,32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K,40
32559,32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K,38


# Task 1
Print the count of men and women in the dataset.

In [4]:
gender_counts = df["sex"].value_counts()
print("Кількість чоловіків і жінок у наборі даних:")
print(gender_counts)

Кількість чоловіків і жінок у наборі даних:
sex
Male      20380
Female     9782
Name: count, dtype: int64


# Task 2
Find the average age of men in dataset

In [7]:
average_age_men = df[df["sex"] == "Male"]["age"].mean()
print("Середній вік чоловіків у наборі даних:", round(average_age_men, 2))

Середній вік чоловіків у наборі даних: 39.18


# Task 3
Get the percentage of people from Poland (native-country)

In [9]:
total_people = len(df)
poland_count = len(df[df["native-country"] == "Poland"])
poland_percentage = (poland_count / total_people) * 100
print(f"Відсоток людей з Польщі: {poland_percentage:.2f}%")

Відсоток людей з Польщі: 0.19%


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [10]:
high_income = df[df["salary"] == ">50K"]["age"]
mean_high_income = high_income.mean()
std_high_income = high_income.std()

low_income = df[df["salary"] == "<=50K"]["age"]
mean_low_income = low_income.mean()
std_low_income = low_income.std()

print(f"Середній вік для тих, хто заробляє >50K: {mean_high_income:.2f}")
print(f"Стандартне відхилення віку для тих, хто заробляє >50K: {std_high_income:.2f}")
print(f"Середній вік для тих, хто заробляє <=50K: {mean_low_income:.2f}")
print(f"Стандартне відхилення віку для тих, хто заробляє <=50K: {std_low_income:.2f}")

Середній вік для тих, хто заробляє >50K: 43.96
Стандартне відхилення віку для тих, хто заробляє >50K: 10.27
Середній вік для тих, хто заробляє <=50K: 36.61
Стандартне відхилення віку для тих, хто заробляє <=50K: 13.46


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [12]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

non_higher_educated_high_income = df[(df["salary"] == ">50K") & (~df["education"].isin(higher_education))]

print("Люди без вищої освіти, але з зарплатою понад 50K:")
non_higher_educated_high_income

Люди без вищої освіти, але з зарплатою понад 50K:


Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
55,55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
67,67,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,>50K,225
68,68,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,>50K,194
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32462,32462,48,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,50,United-States,>50K,343
32518,32518,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,116
32519,32519,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,>50K,239
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [13]:
age_statistics_by_education = df.groupby("education")["age"].describe()

print("Статистика віку для кожного типу освіти:")
print(age_statistics_by_education)

Статистика віку для кожного типу освіти:
               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad   

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [16]:
married_people = df[df["marital-status"].str.startswith("Married")]
non_married_people = df[~df["marital-status"].str.startswith("Married")]

married_high_income = married_people[married_people["salary"] == ">50K"].shape[0] / married_people.shape[0] * 100
non_married_high_income = non_married_people[non_married_people["salary"] == ">50K"].shape[0] / non_married_people.shape[0] * 100

print("Відсоток одружених людей з доходом більше 50K:", round(married_high_income, 2), "%")
print("Відсоток неодружених людей з доходом більше 50K:", round(non_married_high_income, 2), "%")

Відсоток одружених людей з доходом більше 50K: 44.55 %
Відсоток неодружених людей з доходом більше 50K: 6.8 %


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [17]:
max_hours = df["hours-per-week"].max()

people_with_max_hours = df[df["hours-per-week"] == max_hours].shape[0]

print("Максимальна кількість годин на тиждень:", max_hours)
print("Кількість людей, які працюють стільки годин:", people_with_max_hours)

Максимальна кількість годин на тиждень: 99
Кількість людей, які працюють стільки годин: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [20]:
correlation_matrix = df.select_dtypes(include=["number"]).corr()
print(correlation_matrix)

                Unnamed: 0       age  hours-per-week  salary K$
Unnamed: 0        1.000000 -0.001126       -0.001890   0.000129
age              -0.001126  1.000000        0.101599   0.208203
hours-per-week   -0.001890  0.101599        1.000000   0.196378
salary K$         0.000129  0.208203        0.196378   1.000000
