In [39]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [6]:
df = pd.read_csv("../data/adult.csv")

# Task 1
Print the count of men and women in the dataset.

In [11]:
gender_counts = df["sex"].value_counts()
print(gender_counts)

Male      21790
Female    10771
Name: sex, dtype: int64


# Task 2
Find the average age of men in dataset

In [12]:
men_df = df[df["sex"] == "Male"]
average_age_men = round(men_df['age'].mean())
print("Average age of men:", average_age_men)

Average age of men: 39


# Task 3
Get the percentage of people from Poland (native-country)

In [19]:
poland_count = df[df['native-country'] == 'Poland'].shape[0]
total_count = df.shape[0]
percentage_poland = round((poland_count / total_count) * 100, 2)
print("Percentage of people from Poland:", percentage_poland)

Percentage of people from Poland: 0.18


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [21]:
high_income_age_mean = round(df[df['salary'] == '>50K']['age'].mean(), 2)
high_income_age_std = round(df[df['salary'] == '>50K']['age'].std(), 2)

low_income_age_mean = round(df[df['salary'] == '<=50K']['age'].mean(), 2)
low_income_age_std = round(df[df['salary'] == '<=50K']['age'].std(), 2)

print("Mean age for people earning > 50K:", high_income_age_mean)
print("Standard deviation of age for people earning > 50K:", high_income_age_std)

print("Mean age for people earning <= 50K:", low_income_age_mean)
print("Standard deviation of age for people earning <= 50K:", low_income_age_std)

Mean age for people earning > 50K: 44.25
Standard deviation of age for people earning > 50K: 10.52
Mean age for people earning <= 50K: 36.78
Standard deviation of age for people earning <= 50K: 14.02


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [22]:
filtered_df = df[(df["salary"] == ">50K") & (~df["education"].isin(["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]))]
print(filtered_df)

       Unnamed: 0  age         workclass     education      marital-status  \
7               7   52  Self-emp-not-inc       HS-grad  Married-civ-spouse   
10             10   37           Private  Some-college  Married-civ-spouse   
27             27   54                 ?  Some-college  Married-civ-spouse   
38             38   31           Private  Some-college  Married-civ-spouse   
55             55   43           Private  Some-college  Married-civ-spouse   
...           ...  ...               ...           ...                 ...   
32510       32510   39           Private       HS-grad  Married-civ-spouse   
32518       32518   57         Local-gov       HS-grad  Married-civ-spouse   
32519       32519   46           Private  Some-college  Married-civ-spouse   
32557       32557   40           Private       HS-grad  Married-civ-spouse   
32560       32560   52      Self-emp-inc       HS-grad  Married-civ-spouse   

              occupation relationship                race     s

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [23]:
education_stats = df.groupby("education")["age"].describe()

print(education_stats)

                count       mean        std   min    25%   50%   75%   max
education                                                                 
10th            933.0  37.429796  16.720713  17.0  22.00  34.0  52.0  90.0
11th           1175.0  32.355745  15.545485  17.0  18.00  28.0  43.0  90.0
12th            433.0  32.000000  14.334625  17.0  19.00  28.0  41.0  79.0
1st-4th         168.0  46.142857  15.615625  19.0  33.00  46.0  57.0  90.0
5th-6th         333.0  42.885886  15.557285  17.0  29.00  42.0  54.0  84.0
7th-8th         646.0  48.445820  16.092350  17.0  34.25  50.0  61.0  90.0
9th             514.0  41.060311  15.946862  17.0  28.00  39.0  54.0  90.0
Assoc-acdm     1067.0  37.381443  11.095177  19.0  29.00  36.0  44.0  90.0
Assoc-voc      1382.0  38.553546  11.631300  19.0  30.00  37.0  46.0  84.0
Bachelors      5355.0  38.904949  11.912210  19.0  29.00  37.0  46.0  90.0
Doctorate       413.0  47.702179  11.784716  24.0  39.00  47.0  55.0  80.0
HS-grad       10501.0  38

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [42]:
married_men = df[(df["sex"] == "Male") & (df["marital-status"].str.startswith("Married"))]
non_married_men = df[(df["sex"] == "Male") & (~df["marital-status"].str.startswith("Married"))]

married_men_salary = married_men['salary'].value_counts()
unmarried_men_salary = non_married_men['salary'].value_counts()
print("for married men:")
print(married_men_salary)
print("for non-married men:")
print(unmarried_men_salary)

for married men:
<=50K    7576
>50K     5965
Name: salary, dtype: int64
for non-married men:
<=50K    7552
>50K      697
Name: salary, dtype: int64


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [27]:
max_hours_per_week = df["hours-per-week"].max()

same_hours_count = df[df["hours-per-week"] == max_hours_per_week].shape[0]

print("Maximum hours per week:", max_hours_per_week)
print("Number of people working the same hours per week:", same_hours_count)

Maximum hours per week: 99
Number of people working the same hours per week: 85


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [40]:
df["is_male"] = np.where(df["sex"] == "Male", True, False)
df["is_female"] = np.where(df["sex"] == "Female", True, False)
df.corr(numeric_only=True)

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$,is_male,is_female
Unnamed: 0,1.0,0.001286,0.000607,-0.001666,-0.002472,0.002472
age,0.001286,1.0,0.068756,0.201774,0.088832,-0.088832
hours-per-week,0.000607,0.068756,1.0,0.196916,0.229309,-0.229309
salary K$,-0.001666,0.201774,0.196916,1.0,0.182528,-0.182528
is_male,-0.002472,0.088832,0.229309,0.182528,1.0,-1.0
is_female,0.002472,-0.088832,-0.229309,-0.182528,-1.0,1.0
