In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [2]:
df = pd.read_csv("../data/adult.csv")
df = df.replace('?', np.nan)
df['salary K$'] = pd.to_numeric(df['salary K$'], errors='coerce')
df['salary'] = df['salary'].astype('category')
df.dropna(subset=['salary', 'salary K$'], inplace=True, how='any')
df

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K,36
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173
32558,32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K,40
32559,32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K,38


# Task 1
Print the count of men and women in the dataset.

In [3]:
print("Men: ", df[df['sex'] == 'Male'].shape[0])
print("Women: ", df[df['sex'] == 'Female'].shape[0])

Men:  21790
Women:  10771


# Task 2
Find the average age of men in dataset

In [4]:
average_men_age = df[df['sex'] == 'Male']["age"].mean()
average_men_age

np.float64(39.43354749885268)

# Task 3
Get the percentage of people from Poland (native-country)

In [5]:
people_from_poland = df[df['native-country'] == 'Poland'].shape[0]
people_from_poland

60

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [6]:
deviation_more_50k = df[df['salary'] == '>50K']['age'].describe()[['mean', 'std']]
deviation_less_equal_50k = df[df['salary'] == '<=50K']['age'].describe()[['mean', 'std']]
print("Deviation more than 50k\n", deviation_more_50k)
print("Deviation less than or equal to 50k\n", deviation_less_equal_50k)

Deviation more than 50k
 mean    44.249841
std     10.519028
Name: age, dtype: float64
Deviation less than or equal to 50k
 mean    36.783738
std     14.020088
Name: age, dtype: float64


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [7]:
df[(df['salary'] == '>50K') & ~df["education"].isin(
    ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate'])].shape[0]

3306

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [8]:
df.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,933.0,37.429796,16.720713,17.0,22.0,34.0,52.0,90.0
11th,1175.0,32.355745,15.545485,17.0,18.0,28.0,43.0,90.0
12th,433.0,32.0,14.334625,17.0,19.0,28.0,41.0,79.0
1st-4th,168.0,46.142857,15.615625,19.0,33.0,46.0,57.0,90.0
5th-6th,333.0,42.885886,15.557285,17.0,29.0,42.0,54.0,84.0
7th-8th,646.0,48.44582,16.09235,17.0,34.25,50.0,61.0,90.0
9th,514.0,41.060311,15.946862,17.0,28.0,39.0,54.0,90.0
Assoc-acdm,1067.0,37.381443,11.095177,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1382.0,38.553546,11.6313,19.0,30.0,37.0,46.0,84.0
Bachelors,5355.0,38.904949,11.91221,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [9]:
married_earn_more = (
        df[(df['salary'] == '>50K') & df["marital-status"].str.startswith("Married")]["salary K$"].mean()
    > df[(df['salary'] == '<=50K') & ~df["marital-status"].str.startswith("Married")]["salary K$"].mean()
)
if married_earn_more:
    print("Married men earn more")
else:
    print("Non-married men earn more")

Married men earn more


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [10]:
max_hours_per_week = df["hours-per-week"].max()
print("Max hours per week:", max_hours_per_week)
print("People with same amount of hours per week:", df[df["hours-per-week"] == max_hours_per_week].shape[0])

Max hours per week: 99
People with same amount of hours per week: 85


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [11]:
nums_only = df.select_dtypes(include=np.number)
musk = ~nums_only.columns.str.contains("Unnamed")
no_index = nums_only.loc[:, musk]
no_index.corr()

Unnamed: 0,age,hours-per-week,salary K$
age,1.0,0.068756,0.201774
hours-per-week,0.068756,1.0,0.196916
salary K$,0.201774,0.196916,1.0
