In [20]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [21]:
df = pd.read_csv("../data/adult.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25


In [22]:
df = df.drop(["Unnamed: 0", "salary"], axis=1)


df.replace("?", np.nan, inplace=True)
df.dropna(axis=0, inplace=True)

df["salary_is_over_50k"] = df["salary K$"] > 50

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30162 entries, 0 to 32560
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 30162 non-null  int64 
 1   workclass           30162 non-null  object
 2   education           30162 non-null  object
 3   marital-status      30162 non-null  object
 4   occupation          30162 non-null  object
 5   relationship        30162 non-null  object
 6   race                30162 non-null  object
 7   sex                 30162 non-null  object
 8   hours-per-week      30162 non-null  int64 
 9   native-country      30162 non-null  object
 10  salary K$           30162 non-null  int64 
 11  salary_is_over_50k  30162 non-null  bool  
dtypes: bool(1), int64(3), object(8)
memory usage: 2.8+ MB


# Task 1
Print the count of men and women in the dataset.

In [23]:
men = len(df[df["sex"]  == "Male"])
women = len(df[df["sex"] == "Female"])
print(f"Men in dataset: {men}")
print(f"Women in dataset: {women}")

Men in dataset: 20380
Women in dataset: 9782


# Task 2
Find the average age of men in dataset

In [24]:
df.groupby("sex")["age"].mean()

sex
Female    36.883459
Male      39.184004
Name: age, dtype: float64

# Task 3
Get the percentage of people from Poland (native-country)

In [25]:
polish_percent = len(df[df["native-country"] == "Poland"])/len(df)
print(f"Percentage of people from Poland: {polish_percent:.2%}")

Percentage of people from Poland: 0.19%


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [26]:
over50k_age_mean = df[df["salary_is_over_50k"] == True]["age"].mean()
over50k_age_std = df[df["salary_is_over_50k"] == True]["age"].std()
print(f"Mean age of people who earn over 50k: {over50k_age_mean:.2f}")
print(f"Standart deviation of age of people who earn over 50k: {over50k_age_std:.2f}")

Mean age of people who earn over 50k: 43.96
Standart deviation of age of people who earn over 50k: 10.27


In [27]:
less50k_age_mean = df[df["salary_is_over_50k"] == False]["age"].mean()
less50k_age_std = df[df["salary_is_over_50k"] == False]["age"].std()
print(f"Mean age of people who earn 50k and less: {less50k_age_mean:.2f}")
print(f"Standart deviation of age of people who earn over 50k: {less50k_age_std:.2f}")

Mean age of people who earn 50k and less: 36.61
Standart deviation of age of people who earn over 50k: 13.46


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [28]:
df["education"].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', '7th-8th', 'Doctorate', 'Assoc-voc', 'Prof-school',
       '5th-6th', '10th', 'Preschool', '12th', '1st-4th'], dtype=object)

In [29]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
df[~df["education"].isin(higher_education) & df["salary_is_over_50k"] == True]

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary K$,salary_is_over_50k
7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,307,True
10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,116,True
55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,341,True
67,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,225,True
68,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,194,True
...,...,...,...,...,...,...,...,...,...,...,...,...
32462,48,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,50,United-States,343,True
32518,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,116,True
32519,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,239,True
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,173,True


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [30]:
df.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [31]:
mens = df[df["sex"] == "Male"]

married_over_50k = len(mens[mens["marital-status"].str.startswith("Married") & mens["salary_is_over_50k"] == True])
not_married_over_50k = len(mens[~mens["marital-status"].str.startswith("Married") & mens["salary_is_over_50k"] == True])
print(f"Married who earns over 50k: {married_over_50k},\nNot-married who earns over 50k: {not_married_over_50k}")

Married who earns over 50k: 5723,
Not-married who earns over 50k: 673


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [32]:
max_hours = df["hours-per-week"].max()
people_long_work_week = len(df[df["hours-per-week"] == max_hours])
print(f"Max week working hours: {max_hours},\nNumber of people working that hard: {people_long_work_week}")

Max week working hours: 99,
Number of people working that hard: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [34]:
df["race"].unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)

In [40]:
# making White -> 0, Black -> 1, Asian-Pac-Islander -> 2, Amer-Indian-Eskimo -> 3, Other -> 4
df["race_category"] = df["race"].map({
    "White": 0, "Black": 1, "Asian-Pac-Islander": 2, "Amer-Indian-Eskimo": 3, "Other": 4
})
df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary K$,salary_is_over_50k,sex_category,race_category
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,39,False,1,0
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,35,False,1,0
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,27,False,1,0
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,43,False,1,1
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,25,False,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,36,False,0,0
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,173,True,1,0
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,40,False,0,0
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,38,False,1,0


In [35]:
df.sex.unique()

array(['Male', 'Female'], dtype=object)

In [39]:
# making male -> 1, female -> 0

df["sex_category"] = np.where(df["sex"] == "Male", 1, 0)
df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary K$,salary_is_over_50k,sex_category
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,39,False,1
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,35,False,1
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,27,False,1
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,43,False,1
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,25,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,36,False,0
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,173,True,1
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,40,False,0
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,38,False,1


In [42]:
# dropping colums "sex" and "race"
df.drop(["sex", "race"], axis=1, inplace=True)
df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,hours-per-week,native-country,salary K$,salary_is_over_50k,sex_category,race_category
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,40,United-States,39,False,1,0
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,13,United-States,35,False,1,0
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,40,United-States,27,False,1,0
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,40,United-States,43,False,1,1
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,40,Cuba,25,False,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,38,United-States,36,False,0,0
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,40,United-States,173,True,1,0
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,40,United-States,40,False,0,0
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,20,United-States,38,False,1,0


In [43]:
df.corr(numeric_only=True)

Unnamed: 0,age,hours-per-week,salary K$,salary_is_over_50k,sex_category,race_category
age,1.0,0.101599,0.208203,0.241998,0.081993,-0.035019
hours-per-week,0.101599,1.0,0.196378,0.22948,0.231268,-0.037586
salary K$,0.208203,0.196378,1.0,0.853894,0.182642,-0.057945
salary_is_over_50k,0.241998,0.22948,0.853894,1.0,0.216699,-0.067903
sex_category,0.081993,0.231268,0.182642,0.216699,1.0,-0.067529
race_category,-0.035019,-0.037586,-0.057945,-0.067903,-0.067529,1.0


In [49]:
df.corr(method="spearman", numeric_only=True)


Unnamed: 0,age,hours-per-week,salary K$,salary_is_over_50k,sex_category,race_category
age,1.0,0.155916,0.21343,0.276778,0.09174,-0.024833
hours-per-week,0.155916,1.0,0.199713,0.267245,0.264989,-0.083612
salary K$,0.21343,0.199713,1.0,0.74905,0.161977,-0.062645
salary_is_over_50k,0.276778,0.267245,0.74905,1.0,0.216699,-0.082643
sex_category,0.09174,0.264989,0.161977,0.216699,1.0,-0.100966
race_category,-0.024833,-0.083612,-0.062645,-0.082643,-0.100966,1.0


Here we can see small straight correlations between sex and hours-per-week, also with salary_is_over_50k.
Also, we can see some straight correalations between age and  salary_is_over_50k.