In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [21]:
df = pd.read_csv("../data/adult.csv")

# Видалення рядків з питанковими значеннями
rows_with_question_mark = df[df.apply(lambda row: any(pd.isnull(row) | (row == "?")), axis=1)]
df = df.drop(rows_with_question_mark.index)

salary_under_50K = df[(df["salary"] == "<=50K") & (df["salary K$"] > 50)]
salary_above_50K = df[(df["salary"] == ">50K") & (df["salary K$"] <= 50)]
print(f"Total amount of incorrect data for salary and salary $K columns are {len(salary_under_50K) + len(salary_above_50K)}")
df

Total amount of incorrect data for salary and salary $K columns are 0


Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K,36
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173
32558,32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K,40
32559,32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K,38


# Task 1
Print the count of men and women in the dataset.

In [5]:
gender_counts = df["sex"].value_counts()
print("Count of Men:", gender_counts.get('Male', 0))
print("Count of Women:", gender_counts.get('Female', 0))

Count of Men: 20380
Count of Women: 9782


# Task 2
Find the average age of men in dataset

In [7]:
print("Average Age of Men:", df.loc[df["sex"] == "Male", "age"].mean())

Average Age of Men: 39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [8]:
total_people = len(df)
poland_count = len(df[df["native-country"] == "Poland"])

percentage_poland = (poland_count / total_people) * 100

print("Percentage of People from Poland:", percentage_poland, "%")

Percentage of People from Poland: 0.18566408063125786 %


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [9]:
high_salary_age_mean = df[df["salary"] == ">50K"]["age"].mean()
high_salary_age_std = df[df["salary"] == ">50K"]["age"].std()

low_salary_age_mean = df[df["salary"] == "<=50K"]["age"].mean()
low_salary_age_std = df[df["salary"] == "<=50K"]["age"].std()

print("Mean Age for High salary:", high_salary_age_mean)
print("Standard Deviation of Age for High salary:", high_salary_age_std)
print("\nMean Age for Low salary:", low_salary_age_mean)
print("Standard Deviation of Age for Low salary:", low_salary_age_std)

Mean Age for High salary: 43.95911028236548
Standard Deviation of Age for High salary: 10.269632835673852

Mean Age for Low salary: 36.60806038668668
Standard Deviation of Age for Low salary: 13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [10]:
higher_education_levels = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

people_with_high_salary_no_higher_edu = df[(df["salary"] == ">50K") & (~df["education"].isin(higher_education_levels))]

print("People with > 50K salary but without higher education:")
print(people_with_high_salary_no_higher_edu[["education", "salary"]])

People with > 50K salary but without higher education:
          education salary
7           HS-grad   >50K
10     Some-college   >50K
55     Some-college   >50K
67          HS-grad   >50K
68     Some-college   >50K
...             ...    ...
32462       HS-grad   >50K
32518       HS-grad   >50K
32519  Some-college   >50K
32557       HS-grad   >50K
32560       HS-grad   >50K

[3178 rows x 2 columns]


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [12]:
education_age_stats = df.groupby("education")["age"].describe()

print(education_age_stats)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [43]:
married_men = df[(df["sex"] == "Male") & (df["marital-status"].str.startswith("Married"))]
non_married_men = df[(df["sex"] == "Male") & (~df["marital-status"].str.startswith("Married"))]

married_high_salary_percentage = (married_men["salary"] == ">50K").mean() * 100
non_married_high_salary_percentage = (non_married_men["salary"] == ">50K").mean() * 100

print("Married men who earn >50K: {:.2f}%".format(married_high_salary_percentage))
print("Non married men who earn >50K: {:.2f}%".format(non_married_high_salary_percentage))
if married_high_salary_percentage > non_married_high_salary_percentage:
    print("Married men earn more")
else:
    print("Non-married men earn more")

Married men who earn >50K: 44.80%
Non married men who earn >50K: 8.85%
Married men earn more


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [48]:
max_hours_per_week = df['hours-per-week'].max()

# Count how many people work the same amount of hours per week
people_with_max_hours = df[df['hours-per-week'] == max_hours_per_week]


same_hours_count = df["hours-per-week"].value_counts()
count_people_with_same_hours = len(same_hours_count > 1)

# Print the results
print("Maximum hours per week worked:", max_hours_per_week)
print("Number of people who work the same amount of hours per week:", count_people_with_same_hours)

Maximum hours per week worked: 99
Number of people who work the same amount of hours per week: 94


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [58]:
numeric_columns = df.select_dtypes(include=[np.number]).drop(columns=["Unnamed: 0"])
correlation_matrix = numeric_columns.corr()
print(correlation_matrix)
print("--------------------------------------------------")
print("Age and Salary: There is a positive correlation between age and salary (0.208203).")
print("This suggests that, on average, as a person's age increases, their salary tends to increase as well.")
print("This connection could be related to career progression and experience.")
print("--------------------------------------------------")
print("Hours per Week and Salary:")
print("There is a positive correlation between hours per week worked and salary (0.196378).")
print("This indicates that individuals who work more hours per week tend to have higher salaries on average.")
print("This connection might reflect higher earnings for individuals with more working hours.")
print("--------------------------------------------------")
print("Age and Hours per Week:")
print("There is a relatively weak positive correlation between age and hours per week worked (0.101599).")
print("This connection suggests that, on average, older individuals tend to work slightly more hours per week, but the correlation is not very strong.")
print("--------------------------------------------------")


                     age  hours-per-week  salary K$
age             1.000000        0.101599   0.208203
hours-per-week  0.101599        1.000000   0.196378
salary K$       0.208203        0.196378   1.000000
--------------------------------------------------
Age and Salary: There is a positive correlation between age and salary (0.208203).
This suggests that, on average, as a person's age increases, their salary tends to increase as well.
This connection could be related to career progression and experience.
--------------------------------------------------
Hours per Week and Salary:
There is a positive correlation between hours per week worked and salary (0.196378).
This indicates that individuals who work more hours per week tend to have higher salaries on average.
This connection might reflect higher earnings for individuals with more working hours.
--------------------------------------------------
Age and Hours per Week:
There is a relatively weak positive correlation between age