In [2]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [14]:
data = pd.read_csv('adult.csv')
data = data.replace('?', np.nan)
data.info

<bound method DataFrame.info of        Unnamed: 0  age         workclass   education      marital-status  \
0               0   39         State-gov   Bachelors       Never-married   
1               1   50  Self-emp-not-inc   Bachelors  Married-civ-spouse   
2               2   38           Private     HS-grad            Divorced   
3               3   53           Private        11th  Married-civ-spouse   
4               4   28           Private   Bachelors  Married-civ-spouse   
...           ...  ...               ...         ...                 ...   
32556       32556   27           Private  Assoc-acdm  Married-civ-spouse   
32557       32557   40           Private     HS-grad  Married-civ-spouse   
32558       32558   58           Private     HS-grad             Widowed   
32559       32559   22           Private     HS-grad       Never-married   
32560       32560   52      Self-emp-inc     HS-grad  Married-civ-spouse   

              occupation   relationship   race     sex 

# Task 1
Print the count of men and women in the dataset.

In [15]:
data["sex"].value_counts()

sex
Male      21790
Female    10771
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [17]:
data[data["sex"] == "Male"]["age"].mean()

np.float64(39.43354749885268)

# Task 3
Get the percentage of people from Poland (native-country)

In [22]:
total_people = len(data)
poland_count = data[data["native-country"] == "Poland"].shape[0]
(poland_count / total_people) * 100

0.18426952489174164

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [24]:
data[data["salary"] == ">50K"]["age"].agg(["mean", "std"])
data[data["salary"] == "<=50K"]["age"].agg(["mean", "std"])



mean    36.783738
std     14.020088
Name: age, dtype: float64

# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [26]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
data[(~data["education"].isin(higher_education)) & (data["salary"] == ">50K")]

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
27,27,54,,Some-college,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,60,South,>50K,275
38,38,31,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,38,,>50K,166
55,55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32510,32510,39,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,45,,>50K,212
32518,32518,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,116
32519,32519,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,>50K,239
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [27]:
data.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,933.0,37.429796,16.720713,17.0,22.0,34.0,52.0,90.0
11th,1175.0,32.355745,15.545485,17.0,18.0,28.0,43.0,90.0
12th,433.0,32.0,14.334625,17.0,19.0,28.0,41.0,79.0
1st-4th,168.0,46.142857,15.615625,19.0,33.0,46.0,57.0,90.0
5th-6th,333.0,42.885886,15.557285,17.0,29.0,42.0,54.0,84.0
7th-8th,646.0,48.44582,16.09235,17.0,34.25,50.0,61.0,90.0
9th,514.0,41.060311,15.946862,17.0,28.0,39.0,54.0,90.0
Assoc-acdm,1067.0,37.381443,11.095177,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1382.0,38.553546,11.6313,19.0,30.0,37.0,46.0,84.0
Bachelors,5355.0,38.904949,11.91221,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [30]:
data["is_married_male"] = (data["marital-status"].str.startswith("Married")) & (data["sex"] == "Male")
data.groupby(["is_married_male", "salary"]).size().unstack(fill_value=0)

salary,<=50K,>50K
is_married_male,Unnamed: 1_level_1,Unnamed: 2_level_1
False,17144,1876
True,7576,5965


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [34]:
max_hours = data["hours-per-week"].max()
data[data["hours-per-week"] == max_hours].shape[0]

85

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [40]:
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns].corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$
Unnamed: 0,1.0,0.001286,0.000607,-0.001666
age,0.001286,1.0,0.068756,0.201774
hours-per-week,0.000607,0.068756,1.0,0.196916
salary K$,-0.001666,0.201774,0.196916,1.0


In [42]:
numeric_columns = data.select_dtypes(include=['number']).columns

data[numeric_columns].corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$
Unnamed: 0,1.0,0.001286,0.000607,-0.001666
age,0.001286,1.0,0.068756,0.201774
hours-per-week,0.000607,0.068756,1.0,0.196916
salary K$,-0.001666,0.201774,0.196916,1.0
