In [2]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and clean it: remove every row that contains `?` in any column.
Also check the data for correctness (`salary` vs. `salary K$`).

In [17]:
# Load the raw data; read_csv already returns a DataFrame, so no extra wrapping is needed.
# NOTE(review): the tables displayed further down show an "Unnamed: 0" column, which
# looks like a saved index -- consider index_col=0 once that is confirmed.
df = pd.read_csv("../data/adult.csv")

# "?" marks a missing value: turn it into NaN and drop every row that contains one.
df = df.replace("?", np.nan).dropna()

# Correctness check: count the low earners by amount and by the label as it is stored
# in the file. This has to happen BEFORE the label is rewritten below, otherwise both
# counts come from the same column and the comparison is trivially true.
is_low_salary = df["salary K$"] <= 50
low_by_amount = int(is_low_salary.sum())
low_by_label = int((df["salary"] == "<=50K").sum())

# Make the label consistent with the numeric column so later cells can rely on it.
df["salary"] = np.where(is_low_salary, "<=50K", ">50K")

low_by_amount, low_by_label

(22654, 22654)

# Task 1
Print the count of men and women in the dataset.

In [19]:
# Number of rows for each value of the "sex" column, most frequent first.
sex_counts = df["sex"].value_counts()
sex_counts

Male      20380
Female     9782
Name: sex, dtype: int64

# Task 2
Find the average age of men in the dataset.

In [20]:
# Pick the "age" values of the male rows with a single .loc lookup, then average them.
male_rows = df["sex"] == "Male"
df.loc[male_rows, "age"].mean()

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [24]:
# Share of rows whose native country is Poland, as a percentage rounded to 5 decimals.
poland_rows = int((df["native-country"] == "Poland").sum())
total_rows = len(df)
round(poland_rows / total_rows * 100, 5)

0.18566

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [30]:
# Split the rows by salary label.
high_earners = df.loc[df["salary"].eq(">50K")]
low_earners = df.loc[df["salary"].eq("<=50K")]

# Collect mean and standard deviation of age for each group, high earners first.
age_stats = {}
for group_name, group_rows in (("High", high_earners), ("Low", low_earners)):
    group_ages = group_rows["age"]
    age_stats[f"{group_name} mean"] = group_ages.mean()
    age_stats[f"{group_name} std"] = group_ages.std()

age_stats

{'High mean': 43.95911028236548,
 'High std': 10.269632835673852,
 'Low mean': 36.60806038668668,
 'Low std': 13.464631257161633}

# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless earn > 50K.

In [42]:
# Education levels that count as "higher education" for this task.
higher_education = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate",
]

# Keep only the rows whose education is NOT in that list ...
has_higher_education = df["education"].isin(higher_education)
df_filtered = df.loc[~has_higher_education]

# ... and show the first ten of them that still earn more than 50K.
earns_over_50k = df_filtered["salary"] == ">50K"
df_filtered.loc[earns_over_50k].head(10)

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
55,55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
67,67,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,>50K,225
68,68,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,>50K,194
84,84,44,Private,HS-grad,Divorced,Craft-repair,Not-in-family,White,Female,40,United-States,>50K,278
86,86,49,Local-gov,HS-grad,Married-civ-spouse,Protective-serv,Husband,White,Male,40,United-States,>50K,233
97,97,37,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,48,United-States,>50K,217
105,105,32,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,99
112,112,56,Self-emp-not-inc,HS-grad,Married-civ-spouse,Other-service,Husband,White,Male,50,Canada,>50K,197


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [39]:
# Descriptive statistics of age per education level, ordered by group size (ascending).
age_by_education = df.groupby("education")["age"]
age_summary = age_by_education.describe()
age_summary.sort_values(by="count")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Preschool,45.0,41.288889,15.175672,19.0,30.0,40.0,53.0,75.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
Doctorate,375.0,47.130667,11.471727,24.0,39.0,47.0,54.0,80.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Prof-school,542.0,44.249077,11.428098,25.0,36.0,43.0,50.0,90.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0


# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married"; all others are non-married.

In [43]:
# Build each row mask once and combine them for the two groups of men.
is_male = df["sex"] == "Male"
is_married = df["marital-status"].str.startswith("Married")

df_married = df[is_married & is_male]
df_non_married = df[~is_married & is_male]

# Fraction of each group that earns more than 50K.
married_high_share = int((df_married["salary"] == ">50K").sum()) / len(df_married)
non_married_high_share = int((df_non_married["salary"] == ">50K").sum()) / len(df_non_married)

# Married men win unless the non-married share is strictly larger.
earn_more = "Non-m" if married_high_share < non_married_high_share else "M"

earn_more + "arried men earns more"

'Married men earns more'

# Task 8
Get the maximum number of hours per week that a person works. How many people work that many hours per week?

In [44]:
# Longest working week found in the data, then the number of rows that reach it.
max_hours_per_week = df["hours-per-week"].max()
works_max_hours = df["hours-per-week"] == max_hours_per_week
int(works_max_hours.sum())

78

# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected and highlight their connection.

In [45]:
# Correlate the numeric columns only. A bare df.corr() relies on pandas silently
# dropping the string columns, which recent pandas versions no longer do (they raise
# instead). Selecting the numeric columns explicitly works on old and new versions.
df.select_dtypes(include="number").corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$
Unnamed: 0,1.0,-0.001126,-0.00189,0.000129
age,-0.001126,1.0,0.101599,0.208203
hours-per-week,-0.00189,0.101599,1.0,0.196378
salary K$,0.000129,0.208203,0.196378,1.0
