In [71]:
import pandas as pd

# Task 0
Read the dataset from the CSV file and clean it: remove every row that contains `?` in any column.
Also check the data for correctness (the `salary` and `salary K$` columns).

In [72]:
# Load the dataset: the first CSV column is used as the row index and "?" is
# parsed as a missing value, so dropna() removes every row that contained "?".
# dropna() is chained rather than applied with inplace=True.
df = pd.read_csv("../data/adult.csv", index_col=0, na_values="?").dropna()

# Correctness check: keep only rows whose `salary` label agrees with the
# numeric `salary K$` value; rows where the two disagree are discarded.
df_filtered_50K = df[(df["salary"] == "<=50K") & (df["salary K$"] <= 50)]
df_filtered_50plusK = df[(df["salary"] == ">50K") & (df["salary K$"] > 50)]

# ignore_index=True renumbers the combined rows 0..n-1 in one step.
df_cleaned = pd.concat([df_filtered_50K, df_filtered_50plusK], ignore_index=True)

# Preview the cleaned frame (the one every later task works on), not the
# intermediate `df`.
df_cleaned.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25


# Task 1
Print the count of men and women in the dataset.

In [73]:
# Tally how many rows fall under each value of the `sex` column.
sex_column = df_cleaned["sex"]
gender_counts = sex_column.value_counts()
print(gender_counts)

Male      20380
Female     9782
Name: sex, dtype: int64


# Task 2
Find the average age of men in the dataset.

In [74]:
# Restrict the cleaned data to male rows, then average their ages.
is_male = df_cleaned["sex"].eq("Male")
men_data = df_cleaned.loc[is_male]
average_age_men = men_data["age"].mean()
print(average_age_men)

39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [75]:
# Share of rows whose `native-country` is Poland, expressed as a percentage.
from_poland = df_cleaned["native-country"] == "Poland"
people_from_poland = len(df_cleaned[from_poland])
total_people = len(df_cleaned)
# Divide first, then scale, so the floating-point result is unchanged.
percentage_poland = people_from_poland / total_people * 100
print(percentage_poland)

0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [76]:
def _age_mean_and_std(income_group):
    """Return the mean and the `.std()` of the `age` column of `income_group`."""
    ages = income_group["age"]
    return ages.mean(), ages.std()


# People labelled as earning more than 50K.
high_income = df_cleaned.loc[df_cleaned["salary"].eq(">50K")]
mean_age_high_income, std_dev_age_high_income = _age_mean_and_std(high_income)
print(mean_age_high_income, std_dev_age_high_income, sep="\n")

# People labelled as earning 50K or less.
low_income = df_cleaned.loc[df_cleaned["salary"].eq("<=50K")]
mean_age_low_income, std_dev_age_low_income = _age_mean_and_std(low_income)
print(mean_age_low_income, std_dev_age_low_income, sep="\n")


43.95911028236548
10.269632835673852
36.60806038668668
13.464631257161633


# Task 5
Check whether there are any people without higher education (`education` is one of Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) but with a > 50K salary.

In [77]:
# Education levels the task treats as "higher education".
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

# Rows that earn >50K while their education is NOT in the list above.
earns_over_50k = df_cleaned["salary"] == ">50K"
has_higher_education = df_cleaned["education"].isin(higher_education)
no_higher_education_high_income = df_cleaned[earns_over_50k & ~has_higher_education]
print(len(no_higher_education_high_income))


3178


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [78]:
# Summary statistics of `age` within each `education` level.
age_by_education = df_cleaned.groupby("education")["age"]
age_by_education.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married". All others are not married.

In [79]:
# Split the men by whether `marital-status` begins with "Married".
is_male = df_cleaned["sex"] == "Male"
is_married = df_cleaned["marital-status"].str.startswith("Married")
married_men = df_cleaned[is_male & is_married]
single_men = df_cleaned[is_male & ~is_married]


def _count_with_salary(men, salary_label):
    """Return the number of rows in `men` whose `salary` equals `salary_label`."""
    return len(men[men["salary"] == salary_label])


married_men_salary_over_50k = _count_with_salary(married_men, ">50K")
married_men_salary_under_50k = _count_with_salary(married_men, "<=50K")

single_men_salary_over_50k = _count_with_salary(single_men, ">50K")
single_men_salary_under_50k = _count_with_salary(single_men, "<=50K")

# One line per (marital group, salary bracket) pair.
print(
    f"Married >50K: {married_men_salary_over_50k}",
    f"Married <=50K: {married_men_salary_under_50k}",
    f"Not married >50K: {single_men_salary_over_50k}",
    f"Not married <=50K: {single_men_salary_under_50k}",
    sep="\n",
)

Married >50K: 5723
Married <=50K: 7052
Not married >50K: 673
Not married <=50K: 6932


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [80]:
# Largest weekly-hours value in the data, and how many rows share it.
hours = df_cleaned["hours-per-week"]
max_hours = hours.max()
num_people_max_hours = len(df_cleaned[hours == max_hours])
print(
    f"max_hours: {max_hours}",
    f"num_people_max_hours {num_people_max_hours}",
    sep="\n",
)

max_hours: 99
num_people_max_hours 78


# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected and print/highlight their connection.

In [81]:
# Pairwise correlation between the int64/float64 columns only; the remaining
# columns are excluded by the dtype selection.
numeric_columns = df_cleaned.select_dtypes(include=["int64", "float64"])
correlation_matrix = numeric_columns.corr()
print(correlation_matrix)

                     age  hours-per-week  salary K$
age             1.000000        0.101599   0.208203
hours-per-week  0.101599        1.000000   0.196378
salary K$       0.208203        0.196378   1.000000
