In [4]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [17]:
file_path = "../data/adult.csv"
df = pd.DataFrame(pd.read_csv(file_path, na_values="?"))
df = df.dropna()


assert ((df.loc[df['salary'] == '>50K', 'salary K$'] > 50).all() &
        (df.loc[df['salary'] == '<=50K', 'salary K$'] <= 50).all()), \
        "Error: Inconsistent data in the 'salary' and 'salary K$' columns"

# Task 1
Print the count of men and women in the dataset.

In [16]:
gender_counts = df['sex'].value_counts()
print(gender_counts)

Male      20380
Female     9782
Name: sex, dtype: int64


# Task 2
Find the average age of men in dataset

In [18]:
male_age_mean = df.loc[df['sex'] == 'Male', 'age'].mean()
print("Average age of men:", male_age_mean)

Average age of men: 39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [19]:
poland_count = len(df.loc[df['native-country'] == 'Poland'])
total_count = len(df)
poland_percentage = poland_count / total_count * 100
print("Percentage of people from Poland:", poland_percentage)

Percentage of people from Poland: 0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [20]:
high_earners_age_mean = df.loc[df['salary'] == '>50K', 'age'].mean()
high_earners_age_std = df.loc[df['salary'] == '>50K', 'age'].std()
print("Mean age of high earners:", high_earners_age_mean)
print("Standard deviation of age of high earners:", high_earners_age_std)

low_earners_age_mean = df.loc[df['salary'] == '<=50K', 'age'].mean()
low_earners_age_std = df.loc[df['salary'] == '<=50K', 'age'].std()
print("Mean age of low earners:", low_earners_age_mean)
print("Standard deviation of age of low earners:", low_earners_age_std)

Mean age of high earners: 43.95911028236548
Standard deviation of age of high earners: 10.269632835673852
Mean age of low earners: 36.60806038668668
Standard deviation of age of low earners: 13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [23]:
high_earners_no_higher_edu = df.loc[(df['salary'] == '>50K') & (~df['education'].isin(['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']))]
if len(high_earners_no_higher_edu) > 0:
    print(f"There are {len(high_earners_no_higher_edu)} people with >50K salary but without higher education.")
else:
    print("There are no people with >50K salary but without higher education.")

There are 3178 people with >50K salary but without higher education.


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [24]:
age_stats_by_edu = df.groupby('education')['age'].describe()
print(age_stats_by_edu)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [25]:
married_high_earners = len(df.loc[(df['marital-status'].str.startswith('Married')) & (df['sex'] == 'Male') & (df['salary'] == '>50K')])
married_low_earners = len(df.loc[(df['marital-status'].str.startswith('Married')) & (df['sex'] == 'Male') & (df['salary'] == '<=50K')])
non_married_high_earners = len(df.loc[(~df['marital-status'].str.startswith('Married')) & (df['sex'] == 'Male') & (df['salary'] == '>50K')])
non_married_low_earners = len(df.loc[(~df['marital-status'].str.startswith('Married')) & (df['sex'] == 'Male') & (df['salary'] == '<=50K')])
print("Married men who earn >50K:", married_high_earners)
print("Married men who earn <=50K:", married_low_earners)
print("Non-married men who earn >50K:", non_married_high_earners)
print("Non-married men who earn <=50K:", non_married_low_earners)

Married men who earn >50K: 5723
Married men who earn <=50K: 7052
Non-married men who earn >50K: 673
Non-married men who earn <=50K: 6932


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [26]:
max_hours_per_week = df['hours-per-week'].max()
print("Maximum hours per week worked:", max_hours_per_week)
same_hours_per_week_count = len(df.loc[df['hours-per-week'] == max_hours_per_week])
print("Number of people who work the same amount of hours per week:", same_hours_per_week_count)

Maximum hours per week worked: 99
Number of people who work the same amount of hours per week: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [28]:
df = pd.read_csv("../data/adult.csv")

df.corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$
Unnamed: 0,1.0,0.001286,0.000607,-0.001666
age,0.001286,1.0,0.068756,0.201774
hours-per-week,0.000607,0.068756,1.0,0.196916
salary K$,-0.001666,0.201774,0.196916,1.0
