In [3]:
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [4]:
adult = pd.read_csv("../data/adult.csv")
adult = adult[~adult.isin(['?']).any(axis=1)]
adult.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      30162 non-null  int64 
 1   age             30162 non-null  int64 
 2   workclass       30162 non-null  object
 3   education       30162 non-null  object
 4   marital-status  30162 non-null  object
 5   occupation      30162 non-null  object
 6   relationship    30162 non-null  object
 7   race            30162 non-null  object
 8   sex             30162 non-null  object
 9   hours-per-week  30162 non-null  int64 
 10  native-country  30162 non-null  object
 11  salary          30162 non-null  object
 12  salary K$       30162 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 3.2+ MB


In [5]:
salary_1_less_50 = adult[adult["salary K$"] <= 50]
salary_2_less_50 = adult[adult["salary"].eq("<=50K")]
equal_less_50 = salary_1_less_50.equals(salary_2_less_50)
equal_less_50

True

In [6]:
salary_1_more_50 = adult[adult["salary K$"] > 50]
salary_2_more_50 = adult[adult["salary"].eq(">50K")]
equal_more_50 = salary_1_more_50.equals(salary_2_more_50)
equal_more_50

True

# Task 1
Print the count of men and women in the dataset.

In [7]:
adult["sex"].value_counts()

Male      20380
Female     9782
Name: sex, dtype: int64

# Task 2
Find the average age of men in dataset

In [8]:
average = adult[adult["sex"] == "Male"]["age"].mean()
average

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [9]:
percentage = adult["native-country"].value_counts(normalize=True)["Poland"] * 100
percentage

0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [10]:
mean_less_50 = adult[adult["salary K$"] <= 50]["age"].mean()
mean_less_50

36.60806038668668

In [11]:
standard_deviation_less_50 = adult[adult["salary K$"] <= 50]["age"].std()
standard_deviation_less_50

13.464631257161633

In [12]:
mean_more_50 = adult[adult["salary K$"] > 50]["age"].mean()
mean_more_50

43.95911028236548

In [13]:
standard_deviation_more_50 = adult[adult["salary K$"] > 50]["age"].std()
standard_deviation_more_50

10.269632835673852

# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [14]:
not_education = adult[adult["salary"] == ">50K"]["education"].value_counts()
not_education

Bachelors       2126
HS-grad         1617
Some-college    1336
Masters          918
Prof-school      406
Assoc-voc        344
Doctorate        280
Assoc-acdm       256
10th              59
11th              59
7th-8th           35
12th              29
9th               25
5th-6th           12
1st-4th            6
Name: education, dtype: int64

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [15]:
stat = adult.groupby("education")["age"].describe()
stat

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [16]:
only_men = adult[adult["sex"] == "Male"]
only_men

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32553,32553,32,Private,Masters,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Male,11,Taiwan,<=50K,36
32554,32554,53,Private,Masters,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,>50K,103
32555,32555,22,Private,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,40,United-States,<=50K,32
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


In [17]:
married_man = only_men[only_men["marital-status"].str.contains("Married")]["salary"].value_counts()
married_man

<=50K    7052
>50K     5723
Name: salary, dtype: int64

In [18]:
not_married_man = only_men[~only_men["marital-status"].str.contains("Married")]["salary"].value_counts()
not_married_man

<=50K    6932
>50K      673
Name: salary, dtype: int64

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [19]:
max_hours = adult["hours-per-week"].max(), adult["hours-per-week"].value_counts()[adult["hours-per-week"].max()]
max_hours

(99, 78)

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [20]:
adult.corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$
Unnamed: 0,1.0,-0.001126,-0.00189,0.000129
age,-0.001126,1.0,0.101599,0.208203
hours-per-week,-0.00189,0.101599,1.0,0.196378
salary K$,0.000129,0.208203,0.196378,1.0
