In [3]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [4]:
adult_df = pd.read_csv("../data/adult.csv", index_col=[0])
adult_df = adult_df.replace("?", np.nan).dropna()
salary_check_df = (adult_df["salary"] == "<=50K") == (adult_df["salary K$"] <= 50)
salary_check_df.all()

True

# Task 1
Print the count of men and women in the dataset.

In [5]:
adult_df["sex"].value_counts()

Male      20380
Female     9782
Name: sex, dtype: int64

# Task 2
Find the average age of men in dataset

In [6]:
adult_df[adult_df["sex"] == "Male"]["age"].mean()

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [7]:
(len(adult_df[adult_df["native-country"] == "Poland"]) / len(adult_df["native-country"])) * 100

0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [8]:
salary_above_50 = adult_df[adult_df["salary"] == ">50K"]["age"]
print(f"Above 50K: mean age: {salary_above_50.mean()}, standard deviation of the age: {salary_above_50.std()};")

Above 50K: mean age: 43.95911028236548, standard deviation of the age: 10.269632835673852;


In [9]:
salary_below_or_50 = adult_df[adult_df["salary"] == "<=50K"]["age"]
print(f"Below or 50K: mean age: {salary_below_or_50.mean()}, standard deviation of the age: {salary_below_or_50.std()}")

Below or 50K: mean age: 36.60806038668668, standard deviation of the age: 13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [10]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
len(adult_df[(~adult_df.education.isin(higher_education)) & (adult_df["salary"] == ">50K")])

3178

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [11]:
adult_df.groupby(["education"])["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [12]:
married_men = adult_df[(adult_df["sex"] == "Male") & (adult_df["marital-status"] == "Married-civ-spouse")]["salary K$"].mean()
other_men = adult_df[(adult_df["sex"] == "Male") & (adult_df["marital-status"] != "Married-civ-spouse")]["salary K$"].mean()
print(f"average salary married: {married_men}, other: {other_men}")

average salary married: 108.3160905840286, other: 46.75522771007056


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [13]:
max_hours_per_week = adult_df["hours-per-week"].max()
len(adult_df[adult_df["hours-per-week"] == max_hours_per_week])

78

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [30]:
nonumeric_columns = adult_df.select_dtypes(exclude="int64").columns

for category in nonumeric_columns:
    adult_df[category] = adult_df[category].astype("category").cat.codes

adult_df.corr()



Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
age,1.0,0.08054,-0.001111,-0.276373,-0.005682,-0.246456,0.023374,0.081993,0.101599,-0.001905,0.241998,0.208203
workclass,0.08054,1.0,0.017855,-0.034241,0.015572,-0.067417,0.044731,0.074973,0.050724,0.007668,0.018044,0.009948
education,-0.001111,0.017855,1.0,-0.040664,-0.038212,-0.012717,0.011154,-0.027888,0.059887,0.07879,0.078987,0.067737
marital-status,-0.276373,-0.034241,-0.040664,1.0,0.022655,0.177964,-0.068627,-0.119813,-0.189003,-0.025902,-0.193518,-0.165185
occupation,-0.005682,0.015572,-0.038212,0.022655,1.0,-0.053727,0.000717,0.062313,0.018365,-0.003483,0.051577,0.035984
relationship,-0.246456,-0.067417,-0.012717,0.177964,-0.053727,1.0,-0.117143,-0.584876,-0.25785,-0.010809,-0.251003,-0.210894
race,0.023374,0.044731,0.011154,-0.068627,0.000717,-0.117143,1.0,0.089186,0.048532,0.124514,0.071658,0.060497
sex,0.081993,0.074973,-0.027888,-0.119813,0.062313,-0.584876,0.089186,1.0,0.231268,0.000618,0.216699,0.182642
hours-per-week,0.101599,0.050724,0.059887,-0.189003,0.018365,-0.25785,0.048532,0.231268,1.0,0.008408,0.22948,0.196378
native-country,-0.001905,0.007668,0.07879,-0.025902,-0.003483,-0.010809,0.124514,0.000618,0.008408,1.0,0.023268,0.019605


In [31]:
print(f"1) Age and Salary: There is a positive correlation between age and salary, which is expected. As people gain more experience and seniority in their careers, they tend to earn higher salaries.\n2) Hours-per-week and Salary: There is a positive correlation between hours-per-week and salary. This suggests that people who work longer hours tend to have higher salaries, which is intuitive as many high-paying jobs often require longer hours or involve overtime.\n3) Education and Salary: Education and salary are positively correlated, indicating that higher levels of education tend to be associated with higher salaries. \n4) Decrease in Marital Status with Age: The negative correlation indicates that as age increases, there is a tendency for marital status to decrease. In other words, older individuals are more likely to be unmarried, divorced, or widowed compared to younger individuals.")

1) Age and Salary: There is a positive correlation between age and salary, which is expected. As people gain more experience and seniority in their careers, they tend to earn higher salaries.
2) Hours-per-week and Salary: There is a positive correlation between hours-per-week and salary. This suggests that people who work longer hours tend to have higher salaries, which is intuitive as many high-paying jobs often require longer hours or involve overtime.
3) Education and Salary: Education and salary are positively correlated, indicating that higher levels of education tend to be associated with higher salaries. 
4) Decrease in Marital Status with Age: The negative correlation indicates that as age increases, there is a tendency for marital status to decrease. In other words, older individuals are more likely to be unmarried, divorced, or widowed compared to younger individuals.
