In [55]:
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [56]:
adults = pd.read_csv("../data/adult.csv", index_col=[0], na_values="?")
adults.dropna(inplace=True)
adults = adults[
    ((adults["salary"] == "<=50K") & (adults["salary K$"] <= 50)) |
    ((adults["salary"] == ">50K") & (adults["salary K$"] > 50))
]
adults.reset_index(drop=True, inplace=True)

# Task 1
Print the count of men and women in the dataset.

In [57]:
adults[adults["sex"] == "Male"].count()

age               20380
workclass         20380
education         20380
marital-status    20380
occupation        20380
relationship      20380
race              20380
sex               20380
hours-per-week    20380
native-country    20380
salary            20380
salary K$         20380
dtype: int64

In [58]:
adults["sex"].value_counts()

Male      20380
Female     9782
Name: sex, dtype: int64

# Task 2
Find the average age of men in dataset

In [59]:
adults[adults["sex"] == "Male"]["age"].mean()

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [60]:
people_from_poland = adults[adults["native-country"] == "Poland"]["native-country"].count()
poles_percentage =100 / adults.shape[0] * people_from_poland

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [61]:
more_50K = adults[adults["salary"] == ">50K"]["age"]
more_50K.mean()
more_50K.std()
less_50K = adults[~(adults["salary"] == ">50K")]["age"]
less_50K.mean()
less_50K.std()

13.464631257161633

# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [62]:
high_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
adults[~(adults["education"].isin(high_education)) & (adults["salary"] == ">50K")]

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
51,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
62,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,>50K,225
63,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,>50K,194
...,...,...,...,...,...,...,...,...,...,...,...,...
30075,48,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,50,United-States,>50K,343
30125,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,116
30126,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,>50K,239
30158,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [63]:
adults.groupby(by="education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [64]:
men = adults[adults["sex"] == "Male"]
married_men = men[men["marital-status"].str.startswith("Married")]
not_married_men = men[~(men["marital-status"].str.startswith("Married"))]
married_more_50K = married_men[married_men["salary"] == ">50K"].shape[0]
married_less_50K = married_men[married_men["salary"] == "<=50K"].shape[0]
not_married_more_50K = not_married_men[not_married_men["salary"] == ">50K"].shape[0]
not_married_less_50K = not_married_men[not_married_men["salary"] == "<=50K"].shape[0]
max(married_more_50K, not_married_more_50K)
max(married_less_50K, not_married_less_50K)

7052

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [65]:
adults.groupby(by="hours-per-week").size()
adults[adults["hours-per-week"] == adults["hours-per-week"].max()].shape[0]

78

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight tier connection.

In [69]:
correlation_matrix = pd.get_dummies(adults).corr()

# we can look for strong correlations manually
age_and_income_corr = correlation_matrix.loc["age", "salary K$"]
hours_per_week_and_income_corr = correlation_matrix.loc["hours-per-week", "salary K$"]
workclass_private_and_age_corr = correlation_matrix.loc["workclass_Private", "age"]
education_bachelors_and_income_corr = correlation_matrix.loc["education_Bachelors", "salary K$"]
maried_civ_spouse_and_age_corr = correlation_matrix.loc["marital-status_Married-civ-spouse", "age"]
maried_civ_spouse_and_income_corr = correlation_matrix.loc["marital-status_Married-civ-spouse", "salary K$"]
never_married_and_age_corr = correlation_matrix.loc["marital-status_Never-married", "age"]
never_married_and_income_corr = correlation_matrix.loc["marital-status_Never-married", "salary K$"]

# or look in the general table with filtered data
strong_corr = correlation_matrix[(correlation_matrix > 0.2) | (correlation_matrix < -0.2)]
strong_corr

Unnamed: 0,age,hours-per-week,salary K$,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,salary_<=50K,salary_>50K
age,1.000000,,0.208203,,,-0.210491,,,,,...,,,,,,,,,-0.241998,0.241998
hours-per-week,,1.00000,,,,,,,,,...,,,,,,,,,-0.229480,0.229480
salary K$,0.208203,,1.000000,,,,,,,,...,,,,,,,,,-0.853894,0.853894
workclass_Federal-gov,,,,1.0,,-0.302194,,,,,...,,,,,,,,,,
workclass_Local-gov,,,,,1.0,-0.456267,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
native-country_United-States,,,,,,,,,,,...,,,,,,1.0,,,,
native-country_Vietnam,,,,,,,,,,,...,,,,,,,1.0,,,
native-country_Yugoslavia,,,,,,,,,,,...,,,,,,,,1.0,,
salary_<=50K,-0.241998,-0.22948,-0.853894,,,,,,,,...,,,,,,,,,1.000000,-1.000000
