In [8]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [10]:
df = pd.read_csv("../data/adult.csv")
df = df[~df.isin(["?"]).any(axis=1)]
df["salary"] = np.where(df["salary K$"] <= 50, "<=50K", ">50K")
len(df[df["salary K$"] < 51])
len(df[df["salary"] == "<=50K"])

22654

# Task 1
Print the count of men and women in the dataset.

In [5]:
gender_count = df["sex"].value_counts()
print("Count of men and women in the dataset:")
print(gender_count)

Count of men and women in the dataset:
sex
Male      20380
Female     9782
Name: count, dtype: int64


# Task 2
Find the average age of men in dataset

In [6]:
avg_age_men = df[df["sex"] == "Male"]["age"].mean()
print("Average age of men in the dataset:", round(avg_age_men, 2))

Average age of men in the dataset: 39.18


# Task 3
Get the percentage of people from Poland (native-country)

In [8]:
poland_count = len(df[df["native-country"] == "Poland"])
total_count = len(df)
percentage_poland = (poland_count/total_count)*100
print("Percentage of people from Poland in the dataset:", round(percentage_poland, 2), "%")

Percentage of people from Poland in the dataset: 0.19 %


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [10]:
grouped = df.groupby("salary")["age"].agg(["mean", "std"])

print("Mean and standard deviation of age for people who earn more than 50K per year:")
print("Mean:", round(grouped.loc[">50K", "mean"], 2))
print("Standard deviation:", round(grouped.loc[">50K", "std"], 2))

print("Mean and standard deviation of age for people who earn less than or equal to 50K per year:")
print("Mean:", round(grouped.loc["<=50K", "mean"], 2))
print("Standard deviation:", round(grouped.loc["<=50K", "std"], 2))

Mean and standard deviation of age for people who earn more than 50K per year:
Mean: 43.96
Standard deviation: 10.27
Mean and standard deviation of age for people who earn less than or equal to 50K per year:
Mean: 36.61
Standard deviation: 13.46


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [40]:
filtered = df[(~df["education"].isin(["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"])) & (df["salary"] == ">50K")]

print("Number of people without higher education, but with > 50K salary:", len(filtered))

Number of people without higher education, but with > 50K salary: 3178


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [15]:
grouped = df.groupby("education")["age"].describe()
print(grouped)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [18]:
married_men = df[(df["sex"] == "Male") & (df["marital-status"].str.startswith("Married"))]
nonmarried_men = df[(df["sex"] == "Male") & (~df["marital-status"].str.startswith("Married"))]

married_men_salary = married_men.groupby("salary")["salary K$"].mean()
nonmarried_men_salary = nonmarried_men.groupby("salary")["salary K$"].mean()

print("Married men's salaries:")
print(married_men_salary)
print("\nNon-married men's salaries:")
print(nonmarried_men_salary)

Married men's salaries:
salary
<=50K     31.927538
>50K     200.609820
Name: salary K$, dtype: float64

Non-married men's salaries:
salary
<=50K     32.091316
>50K     196.010401
Name: salary K$, dtype: float64


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [59]:
max_hours = df["hours-per-week"].max()
print("Maximum hours worked per week:", max_hours)
max_hours_per_week = df["hours-per-week"].max()
print("There are", df[df["hours-per-week"] == max_hours_per_week].shape[0], "person have", max_hours_per_week, "hours per week")

Maximum hours worked per week: 99
There are 78 person have 99 hours per week


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [12]:
corr_age_salary = df["age"].corr(df["salary K$"])
print(f"Correlation between age and salary: {corr_age_salary}")

Correlation between age and salary: 0.20820286434202898
