In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [3]:
data = pd.read_csv("../data/adult.csv", index_col=[0])
data.replace("?", np.nan)
data.dropna()
print(data.loc[(data["salary"] == ">50K") & (data["salary K$"] <= 50)])
print(data.loc[(data["salary"] == "<=50K") & (data["salary K$"] > 50)])

Empty DataFrame
Columns: [age, workclass, education, marital-status, occupation, relationship, race, sex, hours-per-week, native-country, salary, salary K$]
Index: []
Empty DataFrame
Columns: [age, workclass, education, marital-status, occupation, relationship, race, sex, hours-per-week, native-country, salary, salary K$]
Index: []


# Task 1
Print the count of men and women in the dataset.

In [4]:
men_count = data[data["sex"] == "Male"].shape[0]
women_count = data[data["sex"] == "Female"].shape[0]

print("Count of men:", men_count)
print("Count of women:", women_count)

Count of men: 21790
Count of women: 10771


# Task 2
Find the average age of men in dataset

In [5]:
men = data[data["sex"] == "Male"]
men["age"].mean().round()

39.0

# Task 3
Get the percentage of people from Poland (native-country)

In [6]:
people = data[data["native-country"] == "Poland"].shape[0] * 100 / data.shape[0]
print(f"Percentage of people from Poland: {people:.2f}%")

Percentage of people from Poland: 0.18%


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [7]:
result = data.groupby("salary")["age"].agg(["mean", "std"]).round(2).reset_index()
result

Unnamed: 0,salary,mean,std
0,<=50K,36.78,14.02
1,>50K,44.25,10.52


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [9]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
higher_salary = data[data["salary"] == ">50K"]
higher_salary[~higher_salary["education"].isin(higher_education)].shape[0]

3306

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
data.groupby("education")["age"].describe()

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [None]:
married = data[data["marital-status"].str.startswith("Married")]
unmarried = data[~data["marital-status"].str.startswith("Married")]
married_high_salary = married[married["salary"] == ">50K"].shape[0] * 100 / married.shape[0]
unmarried_high_salary = unmarried[unmarried["salary"] == ">50K"].shape[0] * 100 / unmarried.shape[0]

plt.subplot(1, 2, 1)
labels_1 = [">50K", "<=50k"]
people_share_1 = [married_high_salary, 100 - married_high_salary]
plt.pie(people_share_1, labels=labels_1, autopct="%1.1f%%", startangle=60)
plt.title("Salary of married people")

plt.subplot(1, 2, 2)
labels_2 = [">50K", "<=50k"]
people_share_2 = [unmarried_high_salary, 100 - unmarried_high_salary]
plt.pie(people_share_2, labels=labels_2, autopct="%1.1f%%", startangle=60)
plt.title("Salary of unmarried people")

plt.show()

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [None]:
max_hours = data["hours-per-week"].max()
hard_workers = data[data["hours-per-week"] == max_hours].shape[0]
print(f"Max hours per week some person works - {max_hours}")
print(f"People works max hours per week - {hard_workers}")

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [None]:
data["marital-status"] = data["marital-status"].replace({"^Married.*": 1, ".*": 0}, regex=True)
data["sex"] = data["sex"].replace({"Female": 1, "Male": 0}, regex=True)
selected_columns = ["age", "marital-status", "sex", "hours-per-week", "salary K$"]
correlation_matrix = data[selected_columns].corr()
print("There is a weak dependence between the indicators.\n"
      "The most independent variables are hours-per-week and age.\n"
      "There is a positive correlation of 0.37 between marital-status and salary\n"
      "This may indicate higher wages for married people")
correlation_matrix