In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [13]:
df = pd.read_csv("../data/adult.csv")
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)
df = df.drop(df[(df["salary"] == ">50K") & (df["salary K$"] <= 50)].index)
df = df.drop(df[(df["salary"] == "<=50K") & (df["salary K$"] > 50)].index)
df["salary"].value_counts()
df["salary K$"].describe()


count    30162.000000
mean        73.968570
std         85.365144
min         15.000000
25%         26.000000
50%         38.000000
75%         49.000000
max        349.000000
Name: salary K$, dtype: float64

# Task 1
Print the count of men and women in the dataset.

In [13]:
df["sex"].value_counts()

sex
Male      20380
Female     9782
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [18]:
df[df["sex"] == "Male"]["age"].mean()

np.float64(39.18400392541707)

# Task 3
Get the percentage of people from Poland (native-country)

In [28]:
poland_people = df[df["native-country"] == "Poland"].shape[0]
all_people = df["native-country"].shape[0]
(poland_people / all_people) * 100

0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [32]:
high_earn_people = df[df["salary"] == ">50K"]
low_ear_people = df[df["salary"] == "<=50K"]
mean_age_of_high_ear_people = high_earn_people["age"].mean()
std_age_of_high_ear_people = high_earn_people["age"].std()

mean_age_of_low_ear_people = low_ear_people["age"].mean()
std_age_of_low_ear_people = low_ear_people["age"].std()

[(mean_age_of_high_ear_people, std_age_of_high_ear_people), (mean_age_of_low_ear_people, std_age_of_low_ear_people)]

[(np.float64(43.95911028236548), np.float64(10.269632835673852)),
 (np.float64(36.60806038668668), np.float64(13.464631257161633))]

# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [35]:
mask = ["Bachelors", "Masters", "Doctorate", "Prof-school", "Assoc-acdm", "Assoc-voc"]
people_high_salary_without_education = df[(~df["education"].isin(mask)) & (df["salary"] == ">50K")]
has_people_high_salary_without_education = not people_high_salary_without_education.empty
[has_people_high_salary_without_education, people_high_salary_without_education]

[True,
        Unnamed: 0  age         workclass     education      marital-status  \
 7               7   52  Self-emp-not-inc       HS-grad  Married-civ-spouse   
 10             10   37           Private  Some-college  Married-civ-spouse   
 55             55   43           Private  Some-college  Married-civ-spouse   
 67             67   53           Private       HS-grad  Married-civ-spouse   
 68             68   49      Self-emp-inc  Some-college  Married-civ-spouse   
 ...           ...  ...               ...           ...                 ...   
 32462       32462   48      Self-emp-inc       HS-grad  Married-civ-spouse   
 32518       32518   57         Local-gov       HS-grad  Married-civ-spouse   
 32519       32519   46           Private  Some-college  Married-civ-spouse   
 32557       32557   40           Private       HS-grad  Married-civ-spouse   
 32560       32560   52      Self-emp-inc       HS-grad  Married-civ-spouse   
 
               occupation relationship   ra

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [36]:
df.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [15]:
mask = (df["sex"] == "Male") & (df["marital-status"].str.startswith("Married"))
maried = df[mask]
not_maried = df[(df["sex"] == "Male") & (~df["marital-status"].str.startswith("Married"))]
[maried["salary"].value_counts(), not_maried["salary"].value_counts()]

[salary
 <=50K    7052
 >50K     5723
 Name: count, dtype: int64,
 salary
 <=50K    6932
 >50K      673
 Name: count, dtype: int64]

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [40]:
max_hours_per_week = df["hours-per-week"].max()
[max_hours_per_week, df[df["hours-per-week"] == max_hours_per_week].shape[0]]

[np.int64(99), 78]

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [48]:
pd.get_dummies(df).corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,salary_<=50K,salary_>50K
Unnamed: 0,1.000000,-0.001126,-0.001890,0.000129,0.000275,-0.000933,0.005641,-0.003275,-0.006005,-0.001250,...,0.002413,-0.003646,-0.005262,-0.002669,0.007442,0.001993,-0.000928,0.004350,-0.006244,0.006244
age,-0.001126,1.000000,0.101599,0.208203,0.056626,0.068256,-0.210491,0.111039,0.150429,0.014805,...,0.000024,0.001923,-0.007879,-0.004940,0.007868,0.016259,-0.017775,0.000657,-0.241998,0.241998
hours-per-week,-0.001890,0.101599,1.000000,0.196378,0.005229,0.001612,-0.095533,0.126254,0.087835,-0.032976,...,0.000689,0.014328,-0.000305,0.012846,-0.007566,0.010673,-0.010381,0.006983,-0.229480,0.229480
salary K$,0.000129,0.208203,0.196378,1.000000,0.050512,0.025856,-0.098187,0.119222,0.019587,0.002678,...,-0.001498,-0.006514,0.016939,-0.000891,-0.003442,0.034000,-0.014737,0.008560,-0.853894,0.853894
workclass_Federal-gov,0.000275,0.056626,0.005229,0.050512,1.000000,-0.048728,-0.302194,-0.034520,-0.053995,-0.037804,...,-0.003431,-0.004795,-0.006708,-0.004266,-0.004390,0.017541,-0.000004,-0.004139,-0.057394,0.057394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
native-country_United-States,0.001993,0.016259,0.010673,0.034000,0.017541,0.032937,-0.049809,0.007350,0.014520,0.025370,...,-0.061442,-0.156254,-0.120121,-0.076390,-0.078606,1.000000,-0.148334,-0.074108,-0.040204,0.040204
native-country_Vietnam,-0.000928,-0.017775,-0.010381,-0.014737,-0.000004,-0.003952,0.009369,-0.004973,-0.003405,-0.006128,...,-0.000881,-0.002240,-0.001722,-0.001095,-0.001127,-0.148334,1.000000,-0.001062,0.018215,-0.018215
native-country_Yugoslavia,0.004350,0.000657,0.006983,0.008560,-0.004139,-0.000550,0.003861,0.003343,-0.001701,-0.004848,...,-0.000440,-0.001119,-0.000860,-0.000547,-0.000563,-0.074108,-0.001062,1.000000,-0.006718,0.006718
salary_<=50K,-0.006244,-0.241998,-0.229480,-0.853894,-0.057394,-0.028673,0.117218,-0.137646,-0.025575,-0.009752,...,0.002964,0.005812,-0.017571,0.003979,0.007788,-0.040204,0.018215,-0.006718,1.000000,-1.000000
