In [3]:
import numpy as np
import pandas as pd

from pathlib import Path


# Task 0
Read the dataset from the csv file & perform data cleaning - remove all rows that contain `?` in any column.
Also check for data correctness (`salary` & `salary K$`).

In [4]:
# Locate the CSV relative to the current working directory: <parent>/data/adult.csv
base_dir = Path.cwd().parent
csv_path = (base_dir / "data" / "adult.csv").resolve()

data = pd.read_csv(csv_path)


def has_question_mark(value):
    """Return True when the string form of ``value`` contains a "?"."""
    return "?" in str(value)


# Drop every row in which at least one cell contains "?"
rows_with_question_mark = data.map(has_question_mark).any(axis=1)
data = data.loc[~rows_with_question_mark]

# Consistency check: the "salary" label must agree with the numeric "salary K$".
# A row is valid when it is "<=50K" with a value <= 50, or ">50K" with a value > 50;
# every other row is counted as incorrect.
low_bracket_ok = data["salary"].eq("<=50K") & data["salary K$"].le(50)
high_bracket_ok = data["salary"].eq(">50K") & data["salary K$"].gt(50)
incorrect = (~(low_bracket_ok | high_bracket_ok)).sum()
print(f"Number of incorrect values: {incorrect}")


Number of incorrect values: 0


# Task 1
Print the count of men and women in the dataset.

In [46]:
# Tally the rows per value of the "sex" column; a label that never occurs counts as 0.
sex_tally = data["sex"].value_counts()
men, women = (sex_tally.get(label, 0) for label in ("Male", "Female"))

print(f"Number of men: {men}")
print(f"Number of women: {women}")

Number of men: 20380
Number of women: 9782


# Task 2
Find the average age of men in the dataset.

In [28]:
# Mean of "age" over the rows whose "sex" is "Male"
is_male = data["sex"] == "Male"
average_age_men = data.loc[is_male, "age"].mean()
print(f"The average age of men is: {average_age_men:.2f}")


The average age of men is: 39.18


# Task 3
Get the percentage of people from Poland (native-country)

In [29]:
# Percentage of rows whose "native-country" is "Poland".
countries = data["native-country"].value_counts()
# Use .get with a default of 0 (as the men/women count does) so that a dataset
# without any "Poland" row reports 0.00% instead of raising KeyError.
poland_percentage = countries.get("Poland", 0) / len(data) * 100
print(f"The percentage of people from Poland: {poland_percentage:.2f}%")


The percentage of people from Poland: 0.19%


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [44]:
# Age mean and standard deviation, first for the ">50K" group, then for "<=50K".
# Series.agg(["mean", "std"]) returns both statistics in the requested order,
# so they can be unpacked straight into the two result names.
more_50K = data.loc[data["salary"] == ">50K"]
mean_age_more_50K, std_age_more_50K = more_50K["age"].agg(["mean", "std"])
print(f"The mean of the age who earn > 50K: {mean_age_more_50K:.1f}")
print(f"The standard deviation of the age who earn > 50K: {std_age_more_50K:.1f}")

less_50K = data.loc[data["salary"] == "<=50K"]
mean_age_less_50K, std_age_less_50K = less_50K["age"].agg(["mean", "std"])
print(f"The mean of the age who earn <= 50K: {mean_age_less_50K:.1f}")
print(f"The standard deviation of the age who earn <= 50K: {std_age_less_50K:.1f}")


The mean of the age who earn > 50K: 44.0
The standard deviation of the age who earn > 50K: 10.3
The mean of the age who earn <= 50K: 36.6
The standard deviation of the age who earn <= 50K: 13.5


# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) but with a salary > 50K.

In [41]:
# Education values that the task treats as higher education
high_education = [
    "Bachelors", "Prof-school", "Assoc-acdm",
    "Assoc-voc", "Masters", "Doctorate",
]

# Select rows that are outside the higher-education list AND labelled ">50K"
lacks_high_education = ~data["education"].isin(high_education)
earns_more_50K = data["salary"] == ">50K"
more_50K_salary = data[lacks_high_education & earns_more_50K]
print(more_50K_salary)


       Unnamed: 0  age         workclass     education      marital-status  \
7               7   52  Self-emp-not-inc       HS-grad  Married-civ-spouse   
10             10   37           Private  Some-college  Married-civ-spouse   
55             55   43           Private  Some-college  Married-civ-spouse   
67             67   53           Private       HS-grad  Married-civ-spouse   
68             68   49      Self-emp-inc  Some-college  Married-civ-spouse   
...           ...  ...               ...           ...                 ...   
32462       32462   48      Self-emp-inc       HS-grad  Married-civ-spouse   
32518       32518   57         Local-gov       HS-grad  Married-civ-spouse   
32519       32519   46           Private  Some-college  Married-civ-spouse   
32557       32557   40           Private       HS-grad  Married-civ-spouse   
32560       32560   52      Self-emp-inc       HS-grad  Married-civ-spouse   

              occupation relationship   race     sex  hours-per

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# write your code for task 6 here

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married". All others are non-married.

In [5]:
# write your code for task 7 here

# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [6]:
# write your code for task 8 here

# Task 9
Analyze the correlation between the fields in the dataset. Identify the connected fields and print (highlight) their connection.

In [7]:
# write your code for task 9 here