In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove every row that contains `?` in any column.
Also check the data for consistency (the `salary` category must agree with `salary K$`).

In [14]:
import os

# Load the raw CSV (first column is the index) from ../data/adult.csv.
csv_path = os.path.join(os.pardir, "data", "adult.csv")
adults_raw = pd.read_csv(csv_path, index_col=[0])

# Keep only the rows in which no column holds the "?" placeholder.
has_placeholder = adults_raw.eq("?").any(axis=1)
adults = adults_raw[~has_placeholder]

# Rows whose salary category disagrees with the numeric "salary K$" value.
check_below_50 = adults[(adults["salary"] == "<=50K") & (adults["salary K$"] > 50)]
check_over_50 = adults[(adults["salary"] == ">50K") & (adults["salary K$"] <= 50)]

# Both frames must be empty (zero non-null values in every column).
no_bad_over_50 = all(check_over_50.count() == 0)
no_bad_below_50 = all(check_below_50.count() == 0)
if not (no_bad_over_50 and no_bad_below_50):
    print("there is something wrong with data")

# Task 1
Print the count of men and women in the dataset.

In [3]:
# Number of rows per value of the "sex" column, most frequent first.
adults.loc[:, "sex"].value_counts()

Male      20380
Female     9782
Name: sex, dtype: int64

# Task 2
Find the average age of men in the dataset.

In [4]:
# Select the male rows first, then average their "age" values.
is_male = adults["sex"] == "Male"
adults.loc[is_male, "age"].mean()

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [5]:
# Share of rows whose native country is Poland, as a percentage.
# The mean of a boolean mask is matches / total rows; unlike indexing
# value_counts() by "Poland", it yields 0 instead of raising KeyError
# when no row matches.
(adults["native-country"] == "Poland").mean() * 100

0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [15]:
# Boolean masks for the two salary categories.
earns_over_50k = adults["salary"] == ">50K"
earns_at_most_50k = adults["salary"] == "<=50K"

# Ages of the people in each category.
over_fifty_salary = adults["age"][earns_over_50k]
below_fifty_salary = adults["age"][earns_at_most_50k]

# Report mean and standard deviation of age for each category.
print(f"For people with over 50K per year age mean {over_fifty_salary.mean()} and age standard deviation {over_fifty_salary.std()}")
print(f"For people with below 50K per year age mean {below_fifty_salary.mean()} and age standard deviation {below_fifty_salary.std()}")

For people with over 50K per year age mean 43.95911028236548 and age standard deviation 10.269632835673852
For people with below 50K per year age mean 36.60806038668668 and age standard deviation 13.464631257161633


# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless have a > 50K salary.

In [7]:
# Education levels that the task treats as "higher education".
high_edu = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate",
]

# People in the >50K category whose education is not in the list above.
is_high_earner = adults["salary"] == ">50K"
lacks_higher_edu = ~adults["education"].isin(high_edu)
adults[is_high_earner & lacks_higher_edu].describe()

Unnamed: 0,age,hours-per-week,salary K$
count,3178.0,3178.0,3178.0
mean,44.559786,45.14034,202.485211
std,10.490596,10.95631,87.164738
min,19.0,3.0,51.0
25%,37.0,40.0,125.25
50%,44.0,40.0,206.0
75%,52.0,50.0,277.75
max,90.0,99.0,349.0


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [8]:
# Descriptive statistics of "age" within each education level.
adults[["education", "age"]].groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married"; all others are non-married.

In [9]:
# Work on a copy so the helper column below is not written into `adults`.
adult_males = adults.loc[adults["sex"] == "Male"].copy()

# Any marital status starting with "Married" counts as married.
adult_males["marital-group"] = np.where(
    adult_males["marital-status"].str.startswith("Married"),
    "married",
    "non-married",
)

# Average only the numeric salary column. The group key is a string column
# and must not be part of the averaged selection: newer pandas versions no
# longer drop non-numeric columns silently when computing the mean.
adult_males.groupby("marital-group")[["salary K$"]].mean()

Unnamed: 0_level_0,salary K$
marital-group,Unnamed: 1_level_1
married,107.49456
non-married,46.597239


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [10]:
# Number of people for every distinct "hours-per-week" value.
hours_counts = adults.groupby("hours-per-week")["hours-per-week"].count()

# The task asks for the maximum and for how many people work exactly that
# many hours, so state both explicitly instead of leaving them to be read
# off the last row of the table.
max_hours = hours_counts.index.max()
print(f"Max hours per week: {max_hours}; people working that many hours: {hours_counts.loc[max_hours]}")

# Full frequency table, kept for reference.
hours_counts

hours-per-week
1      7
2     15
3     24
4     27
5     38
      ..
95     2
96     5
97     2
98    11
99    78
Name: hours-per-week, Length: 94, dtype: int64

# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected and highlight their connection.

In [11]:
# Correlation is only defined for numeric columns. Select them explicitly so
# the string columns (sex, education, ...) are never passed to corr();
# pandas versions that no longer drop non-numeric columns silently would
# otherwise fail here.
numeric_cols = adults.select_dtypes(include="number").columns.tolist()

# Pairwise correlation of the numeric columns within each salary category.
adults.groupby("salary")[numeric_cols].corr()


Unnamed: 0_level_0,Unnamed: 1_level_0,age,hours-per-week,salary K$
salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<=50K,age,1.0,0.08435,0.009416
<=50K,hours-per-week,0.08435,1.0,-0.002933
<=50K,salary K$,0.009416,-0.002933,1.0
>50K,age,1.0,-0.107335,0.003531
>50K,hours-per-week,-0.107335,1.0,0.003012
>50K,salary K$,0.003531,0.003012,1.0
