In [2]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [3]:
peoples = pd.read_csv("../data/adult.csv", index_col=0)
peoples = peoples.replace("?", np.nan)

# Task 1
Print the count of men and women in the dataset.

In [4]:
peoples["sex"].value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

# Task 2
Find the average age of men in dataset

In [5]:
mens_age = peoples[peoples["sex"] == "Male"]["age"]
np.average(mens_age)

39.43354749885268

# Task 3
Get the percentage of people from Poland (native-country)

In [6]:
native_country = peoples["native-country"].value_counts()

all_known_people = np.sum(native_country)
peoples_from_poland = native_country["Poland"]

percent_peoples_from_poland = peoples_from_poland / all_known_people * 100

percent_peoples_from_poland

0.18762899493401713

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [7]:
# for people who earn > 50k per year
mean_for_people_who_earn_higher_50k = np.mean(peoples["salary K$"] > 50)
std_for_people_who_earn_higher_50k = np.std(peoples["salary K$"] > 50)

mean_for_people_who_earn_higher_50k, std_for_people_who_earn_higher_50k

(0.2408095574460244, 0.4275749226611335)

In [8]:
# for people who earn <= 50k per year
mean_for_people_who_earn_lower_50k = np.mean(peoples["salary K$"] <= 50)
std_for_people_who_earn_lower_50k = np.std(peoples["salary K$"] <= 50)

mean_for_people_who_earn_lower_50k, std_for_people_who_earn_lower_50k

(0.7591904425539756, 0.4275749226611335)

# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [9]:
education = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate"
]

peoples[(~peoples["education"].isin(education)) & (peoples["salary K$"] > 50)]

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
27,54,,Some-college,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,60,South,>50K,275
38,31,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,38,,>50K,166
55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
...,...,...,...,...,...,...,...,...,...,...,...,...
32510,39,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,45,,>50K,212
32518,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,116
32519,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,>50K,239
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [10]:
grouped_people = peoples.groupby("education")
grouped_people.describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,hours-per-week,hours-per-week,hours-per-week,hours-per-week,hours-per-week,salary K$,salary K$,salary K$,salary K$,salary K$,salary K$,salary K$,salary K$
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10th,933.0,37.429796,16.720713,17.0,22.0,34.0,52.0,90.0,933.0,37.052519,...,40.0,99.0,933.0,42.839228,45.825943,15.0,25.0,33.0,43.0,347.0
11th,1175.0,32.355745,15.545485,17.0,18.0,28.0,43.0,90.0,1175.0,33.925957,...,40.0,99.0,1175.0,40.612766,44.638571,15.0,24.0,32.0,41.5,345.0
12th,433.0,32.0,14.334625,17.0,19.0,28.0,41.0,79.0,433.0,35.7806,...,40.0,99.0,433.0,42.91455,48.118195,15.0,24.0,33.0,42.0,349.0
1st-4th,168.0,46.142857,15.615625,19.0,33.0,46.0,57.0,90.0,168.0,38.255952,...,40.0,96.0,168.0,37.279762,32.745055,15.0,24.0,32.5,42.0,279.0
5th-6th,333.0,42.885886,15.557285,17.0,29.0,42.0,54.0,84.0,333.0,38.897898,...,40.0,84.0,333.0,40.339339,43.08274,15.0,23.0,33.0,43.0,347.0
7th-8th,646.0,48.44582,16.09235,17.0,34.25,50.0,61.0,90.0,646.0,39.366873,...,40.0,99.0,646.0,42.371517,46.883574,15.0,24.0,33.0,43.0,347.0
9th,514.0,41.060311,15.946862,17.0,28.0,39.0,54.0,90.0,514.0,38.044747,...,40.0,99.0,514.0,42.558366,49.175715,15.0,24.0,33.0,42.0,349.0
Assoc-acdm,1067.0,37.381443,11.095177,19.0,29.0,36.0,44.0,90.0,1067.0,40.504217,...,45.0,99.0,1067.0,74.99344,86.975212,15.0,26.5,38.0,49.0,349.0
Assoc-voc,1382.0,38.553546,11.6313,19.0,30.0,37.0,46.0,84.0,1382.0,41.610709,...,45.0,99.0,1382.0,76.188133,86.673738,15.0,27.0,39.0,62.75,349.0
Bachelors,5355.0,38.904949,11.91221,19.0,29.0,37.0,46.0,90.0,5355.0,42.614006,...,50.0,99.0,5355.0,101.882353,101.141882,15.0,29.0,44.0,171.0,349.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [11]:
married_people = peoples[peoples["marital-status"].str.startswith("Married")]
unmarried_people = peoples[~peoples["marital-status"].str.startswith("Married")]

np.sum(married_people["salary K$"]) > np.sum(unmarried_people["salary K$"])

True

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [12]:
peoples.describe()["hours-per-week"]["max"]

99.0

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [22]:
correlation = peoples.corr()

In [23]:
print(
    "Correlation:\n"
    f"Hours per week to salary: {correlation['salary K$']['hours-per-week']}\n"
    f"Age to salary: {correlation['salary K$']['age']}"
)

Correlation:
Hours per week to salary: 0.19691628735118744
Age to salary: 0.20177417568021683
