In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [21]:
df = pd.read_csv("../data/adult.csv")
df_cleaned = df[~df.apply(lambda row: row.astype(str).str.contains(r"\?").any(), axis=1)]

def is_numeric_column(series):
    numeric_series = pd.to_numeric(series, errors='coerce')
    return numeric_series.isnull().all()

salary_check = is_numeric_column(df_cleaned["salary"])
salary_k_check = is_numeric_column(df_cleaned["salary K$"])


np.True_

# Task 1
Print the count of men and women in the dataset.

In [23]:
df_cleaned["sex"].value_counts()

sex
Male      20380
Female     9782
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [26]:
df_cleaned[df_cleaned["sex"] == "Male"]["age"].mean()

np.float64(39.18400392541707)

# Task 3
Get the percentage of people from Poland (native-country)

In [29]:
people_from_poland = df_cleaned[df_cleaned["native-country"] == "Poland"].shape[0]
people = df_cleaned.shape[0]
(people_from_poland / people) * 100

0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [50]:
df_high_salary = df_cleaned.loc[df["salary"] == ">50K"]
mean_age_high_salary = df_high_salary["age"].mean()
std_age_high_salary = df_high_salary["age"].std()

df_low_salary = df.loc[df["salary"] == "<=50K"]
mean_age_low_salary = df_low_salary["age"].mean()
std_age_low_salary = df_low_salary["age"].std()


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [53]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
mask = (df_cleaned["salary"] == ">50K") & (~df["education"].isin(higher_education))

df_cleaned.loc[mask]

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
55,55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
67,67,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,>50K,225
68,68,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,>50K,194
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32462,32462,48,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,50,United-States,>50K,343
32518,32518,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,116
32519,32519,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,>50K,239
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [54]:
df.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,933.0,37.429796,16.720713,17.0,22.0,34.0,52.0,90.0
11th,1175.0,32.355745,15.545485,17.0,18.0,28.0,43.0,90.0
12th,433.0,32.0,14.334625,17.0,19.0,28.0,41.0,79.0
1st-4th,168.0,46.142857,15.615625,19.0,33.0,46.0,57.0,90.0
5th-6th,333.0,42.885886,15.557285,17.0,29.0,42.0,54.0,84.0
7th-8th,646.0,48.44582,16.09235,17.0,34.25,50.0,61.0,90.0
9th,514.0,41.060311,15.946862,17.0,28.0,39.0,54.0,90.0
Assoc-acdm,1067.0,37.381443,11.095177,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1382.0,38.553546,11.6313,19.0,30.0,37.0,46.0,84.0
Bachelors,5355.0,38.904949,11.91221,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [63]:
def categorize_marital_status(status):
    if pd.isna(status):
        return "Unknown"
    if status.startswith("Married"):
        return "Married"
    else:
        return "Not Married"

df_cleaned.loc[:, "marital-status-cat"] = df_cleaned["marital-status"].apply(categorize_marital_status)

married_salary_stats = df_cleaned[(df_cleaned["sex"] == "Male") & (df_cleaned["marital-status-cat"] == "Married")].groupby("salary").size()
not_married_salary_stats = df_cleaned[(df_cleaned["sex"] == "Male") & (df_cleaned["marital-status-cat"] == "Not Married")].groupby("salary").size()


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [65]:
max_hours = df_cleaned["hours-per-week"].max()
df_cleaned[df_cleaned["hours-per-week"] == max_hours].shape[0]

78

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [70]:
df_numeric = pd.get_dummies(df_cleaned)
df_numeric.corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,salary_<=50K,salary_>50K,marital-status-cat_Married,marital-status-cat_Not Married,marital-status-cat_Unknown
Unnamed: 0,1.000000,-0.001126,-0.001890,0.000129,0.000275,-0.000933,0.005641,-0.003275,-0.006005,-0.001250,...,-0.002669,0.007442,0.001993,-0.000928,0.004350,-0.006244,0.006244,0.003732,-0.003732,
age,-0.001126,1.000000,0.101599,0.208203,0.056626,0.068256,-0.210491,0.111039,0.150429,0.014805,...,-0.004940,0.007868,0.016259,-0.017775,0.000657,-0.241998,0.241998,0.310553,-0.310553,
hours-per-week,-0.001890,0.101599,1.000000,0.196378,0.005229,0.001612,-0.095533,0.126254,0.087835,-0.032976,...,0.012846,-0.007566,0.010673,-0.010381,0.006983,-0.229480,0.229480,0.221492,-0.221492,
salary K$,0.000129,0.208203,0.196378,1.000000,0.050512,0.025856,-0.098187,0.119222,0.019587,0.002678,...,-0.000891,-0.003442,0.034000,-0.014737,0.008560,-0.853894,0.853894,0.375202,-0.375202,
workclass_Federal-gov,0.000275,0.056626,0.005229,0.050512,1.000000,-0.048726,-0.302170,-0.034519,-0.053993,-0.037803,...,-0.004266,-0.004390,0.017557,-0.000004,-0.004139,-0.057381,0.057397,0.009173,-0.009161,-0.001034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
salary_<=50K,-0.006244,-0.241998,-0.229480,-0.853894,-0.057381,-0.028656,0.117303,-0.137628,-0.025556,-0.009739,...,0.003980,0.007789,-0.040010,0.018217,-0.006716,1.000000,-0.999911,-0.436058,0.436172,-0.010001
salary_>50K,0.006244,0.241998,0.229480,0.853894,0.057397,0.028678,-0.117180,0.137649,0.025580,0.009756,...,-0.003979,-0.007788,0.040259,-0.018214,0.006718,-0.999911,1.000000,0.436151,-0.436112,-0.003315
marital-status-cat_Married,0.003732,0.310553,0.221492,0.375202,0.009173,0.011390,-0.125525,0.104312,0.118044,-0.006579,...,-0.000412,0.001014,-0.030345,-0.006739,0.012485,-0.436058,0.436151,1.000000,-0.999934,-0.005524
marital-status-cat_Not Married,-0.003732,-0.310553,-0.221492,-0.375202,-0.009161,-0.011372,0.125636,-0.104299,-0.118024,0.006593,...,0.000414,-0.001013,0.030558,0.006742,-0.012483,0.436172,-0.436112,-0.999934,1.000000,-0.006002
