In [36]:
import os

import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (the `salary` and `salary K$` columns must agree).

In [37]:
# Location of the raw CSV. Set the ADULT_CSV_PATH environment variable to run
# the notebook on another machine; the fallback is the original local path.
ADULT_CSV_PATH = os.environ.get(
    "ADULT_CSV_PATH",
    "D:/Study/git-repos/py-adult-data-analysis/data/adult.csv",
)

df = pd.read_csv(ADULT_CSV_PATH)

# Treat "?" as a missing value and drop every row that contains one.
df_cleaned = df.replace("?", np.nan).dropna()

# "salary K$" must be numeric: unparsable entries become NaN and are dropped.
# .assign builds a new frame instead of mutating the intermediate one in place.
df_cleaned = df_cleaned.assign(
    **{"salary K$": pd.to_numeric(df_cleaned["salary K$"], errors="coerce")}
).dropna(subset=["salary K$"])

# Cross-check the two salary columns: the ">50K" label must go with an amount
# above 50 and the "<=50K" label with an amount of at most 50. Rows with any
# other label, or with a label that contradicts the amount, are discarded.
# NOTE(review): assumes "salary K$" is expressed in thousands, as its name
# suggests -- confirm against the data source.
salary_label = df_cleaned["salary"]
salary_amount = df_cleaned["salary K$"]
salary_is_consistent = ((salary_label == ">50K") & (salary_amount > 50)) | (
    (salary_label == "<=50K") & (salary_amount <= 50)
)
print(f"Rows removed for inconsistent salary data: {int((~salary_is_consistent).sum())}")
df_cleaned = df_cleaned[salary_is_consistent]

# Cast to category only after filtering so no unused categories are left behind.
df_cleaned = df_cleaned.assign(salary=df_cleaned["salary"].astype("category"))

# Task 1
Print the count of men and women in the dataset.

In [38]:
# Tally the respondents per value of the "sex" column (most frequent first).
sex_column = df_cleaned["sex"]
gender_count = sex_column.value_counts()
print(gender_count)

sex
Male      20380
Female     9782
Name: count, dtype: int64


# Task 2
Find the average age of men in the dataset.

In [39]:
# Select the ages of male respondents in a single .loc lookup, then average them.
male_ages = df_cleaned.loc[df_cleaned["sex"] == "Male", "age"]
average_age_men = male_ages.mean()
print(f"Average age of men: {average_age_men}")

Average age of men: 39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [40]:
# Count the rows whose native country is Poland and express that count as a
# share of all cleaned rows.
is_from_poland = df_cleaned["native-country"] == "Poland"
poland_count = int(is_from_poland.sum())
total_count = len(df_cleaned)
poland_percentage = (poland_count / total_count) * 100
print(f"Percentage of people from Poland: {poland_percentage}%")

Percentage of people from Poland: 0.18566408063125786%


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [41]:
# Pull the age column for each salary group once, then summarise it.
ages_above_50k = df_cleaned.loc[df_cleaned["salary"] == ">50K", "age"]
ages_up_to_50k = df_cleaned.loc[df_cleaned["salary"] == "<=50K", "age"]

mean_age_50k_plus, std_age_50k_plus = ages_above_50k.mean(), ages_above_50k.std()
mean_age_50k_minus, std_age_50k_minus = ages_up_to_50k.mean(), ages_up_to_50k.std()

print(f"Mean and Std of age for >50K: {mean_age_50k_plus}, {std_age_50k_plus}")
print(f"Mean and Std of age for <=50K: {mean_age_50k_minus}, {std_age_50k_minus}")

Mean and Std of age for >50K: 43.95911028236548, 10.269632835673852
Mean and Std of age for <=50K: 36.60806038668668, 13.464631257161633


# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless earn > 50K.

In [42]:
# Education levels that count as "higher education" for this task.
higher_education = [
    "Bachelors", "Prof-school", "Assoc-acdm",
    "Assoc-voc", "Masters", "Doctorate",
]

# Two independent row masks, combined to pick the people of interest.
lacks_higher_education = ~df_cleaned["education"].isin(higher_education)
earns_above_50k = df_cleaned["salary"] == ">50K"
no_higher_ed_50k_plus = df_cleaned.loc[lacks_higher_education & earns_above_50k]

print(f"People without higher education earning >50K: {len(no_higher_ed_50k_plus)}")

People without higher education earning >50K: 3178


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [43]:
# Group ages by education level, then compute the summary statistics per group.
age_by_education = df_cleaned.groupby("education")["age"]
education_age_stats = age_by_education.describe()
print(education_age_stats)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the salaries of married and non-married men. Which group earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married"; all others are non-married.

In [44]:
# Build the two row masks once and reuse them for both groups.
is_male = df_cleaned["sex"] == "Male"
is_married = df_cleaned["marital-status"].str.startswith("Married")

married_men = df_cleaned[is_male & is_married]
non_married_men = df_cleaned[is_male & ~is_married]

# Salary bracket counts within each group.
married_men_salary = married_men["salary"].value_counts()
non_married_men_salary = non_married_men["salary"].value_counts()

print("Married men salary distribution:")
print(married_men_salary)

print("Non-married men salary distribution:")
print(non_married_men_salary)

Married men salary distribution:
salary
<=50K    7052
>50K     5723
Name: count, dtype: int64
Non-married men salary distribution:
salary
<=50K    6932
>50K      673
Name: count, dtype: int64


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [45]:
# Find the largest weekly workload and count how many rows reach it.
hours_per_week = df_cleaned["hours-per-week"]
max_hours = hours_per_week.max()
max_hours_count = int(hours_per_week.eq(max_hours).sum())

print(f"Max hours worked per week: {max_hours}")
print(f"Number of people who work {max_hours} hours: {max_hours_count}")

Max hours worked per week: 99
Number of people who work 99 hours: 78


# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected and highlight their connection.

In [48]:
# Drop the leftover CSV index column if it is still present. errors="ignore"
# keeps this cell re-runnable: without it, a second run raises KeyError because
# the column has already been removed from df_cleaned.
df_cleaned = df_cleaned.drop(columns=["Unnamed: 0"], errors="ignore")
numeric_columns = df_cleaned.select_dtypes(include=[np.number])

# Pairwise correlation (pandas' default method) between the numeric columns.
correlation_matrix = numeric_columns.corr()

print("Correlation matrix:")
print(correlation_matrix)

# Highlight the connections: list every distinct pair of columns once,
# strongest absolute correlation first.
correlated_columns = list(correlation_matrix.columns)
correlation_pairs = [
    (first, second, correlation_matrix.loc[first, second])
    for position, first in enumerate(correlated_columns)
    for second in correlated_columns[position + 1:]
]
correlation_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

print("Column pairs ranked by absolute correlation:")
for first, second, coefficient in correlation_pairs:
    print(f"{first} ~ {second}: {coefficient:.3f}")

Correlation matrix:
                     age  hours-per-week  salary K$
age             1.000000        0.101599   0.208203
hours-per-week  0.101599        1.000000   0.196378
salary K$       0.208203        0.196378   1.000000
