In [None]:
import pandas as pd
import numpy as np

# Task 0
Read the dataset from the CSV file and clean it: remove every row that contains `?` in any column.
Also check the data for correctness (the `salary` and `salary K$` columns must agree).

In [None]:
# Load the raw CSV; `read_csv` already returns a DataFrame, so it needs no re-wrapping.
data = pd.read_csv("data/adult.csv")

# Drop every row that holds a literal "?" placeholder in any column.
# `.copy()` makes `df` an independent frame, so columns added to it later
# are not written into a filtered slice of `data` (SettingWithCopyWarning).
has_placeholder = data.isin(["?"]).any(axis=1)
df = data[~has_placeholder].copy()

# A row is inconsistent when the categorical `salary` band disagrees with
# the numeric `salary K$` value.
inconsistent_salaries = (df["salary"] == "<=50K") & (df["salary K$"] > 50)
inconsistent_salaries |= (df["salary"] == ">50K") & (df["salary K$"] <= 50)

# Summing the boolean mask counts the flagged rows without building a frame.
num_inconsistent = int(inconsistent_salaries.sum())
print("Number of inconsistent rows:", num_inconsistent)

# Task 1
Print the count of men and women in the dataset.

In [None]:
# Frequency of each value in the `sex` column, most common first.
sex_column = df["sex"]
count_men_women = sex_column.value_counts()
print(count_men_women)


# Task 2
Find the average age of men in the dataset.

In [110]:
# Select the ages of male rows in a single `.loc` step, then average them.
is_male = df["sex"] == "Male"
average_age = df.loc[is_male, "age"].mean()
print(f"Average age of men in dataset {average_age}")

Average age of men in dataset 39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [111]:
# Share of rows whose native country is Poland, expressed as a percentage.
polish_rows = df[df["native-country"] == "Poland"]
polish_count = len(polish_rows)
total_count = len(df)
polish_percentage = (polish_count / total_count) * 100

print(f"The percentage of people from Poland is {polish_percentage}")

The percentage of people from Poland is 0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [112]:
# Pull the ages for each salary band once, then summarise each band.
high_earner_ages = df.loc[df["salary"] == ">50K", "age"]
low_earner_ages = df.loc[df["salary"] == "<=50K", "age"]

high_earning_mean = high_earner_ages.mean()
high_earning_std = high_earner_ages.std()

low_earning_mean = low_earner_ages.mean()
low_earning_std = low_earner_ages.std()

print(f"Mean age for high earners: {high_earning_mean}")
print(f"Standard deviation of age for high earners {high_earning_std}")
print(f"Mean age for low earners: {low_earning_mean}")
print(f"Standard deviation of age for low earners: {low_earning_std}")

Mean age for high earners: 43.95911028236548
Standard deviation of age for high earners 10.269632835673852
Mean age for low earners: 36.60806038668668
Standard deviation of age for low earners: 13.464631257161633


# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless have a > 50K salary.

In [113]:
# Education levels that the task counts as "higher education".
higher_education_levels = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate",
]
lacks_higher_education = ~df["education"].isin(higher_education_levels)
earns_over_50k = df["salary"] == ">50K"

# Rows that satisfy both conditions; show the first few as a sample.
has_high_income_no_high_education = df[lacks_higher_education & earns_over_50k]
has_high_income_no_high_education.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
55,55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
67,67,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,>50K,225
68,68,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,>50K,194


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [114]:
# Group rows by education level and summarise the age column of each group.
education_groups = df.groupby("education")
age_stats_by_education = education_groups["age"].describe()
print(age_stats_by_education)

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married"; all others are considered non-married.

In [115]:
# The task is about men only, so every group below is restricted to male rows.
is_male = df["sex"] == "Male"
# `na=False` treats a missing marital status as "not married" rather than
# letting NaN leak into the boolean masks.
is_married = df["marital-status"].str.startswith("Married", na=False)
is_low_income = df["salary"] == "<=50K"
is_high_income = df["salary"] == ">50K"

married_low_income = df[is_male & is_married & is_low_income]

married_high_income = df[is_male & is_married & is_high_income]

unmarried_low_income = df[is_male & ~is_married & is_low_income]

unmarried_high_income = df[is_male & ~is_married & is_high_income]

# Average numeric salary within each (marital status, salary band) group.
avg_salary_married_low_income = married_low_income["salary K$"].mean()
avg_salary_married_high_income = married_high_income["salary K$"].mean()
avg_salary_unmarried_low_income = unmarried_low_income["salary K$"].mean()
avg_salary_unmarried_high_income = unmarried_high_income["salary K$"].mean()

# Compare married vs. non-married men separately inside each salary band.
if avg_salary_married_high_income > avg_salary_unmarried_high_income:
    print("Married men earn more than non-married men with high income.")
else:
    print("Non-married men earn more than married men with high income.")
if avg_salary_married_low_income > avg_salary_unmarried_low_income:
    print("Married men with low income earn more than non-married men with low income.")
else:
    print("Non-married men with low income earn more than married men with low income.")

Married men earn more than non-married men with high income.
Non-married men with low income earn more than married men with low income.


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [116]:
# Find the largest weekly hours value, then count the rows that match it.
hours_per_week = df["hours-per-week"]
max_hours_per_week = hours_per_week.max()
num_people_same_hours = int((hours_per_week == max_hours_per_week).sum())
print("Maximum hours per week:", max_hours_per_week)
print("Number of people working the same amount of hours:", num_people_same_hours)

Maximum hours per week: 99
Number of people working the same amount of hours: 78


# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected and highlight their connection in the printed output.

In [117]:
# Encode sex as a 0/1 indicator (1 = "Male") so it can join the correlation
# matrix. `assign` builds a new frame instead of writing a column into the
# existing one, and the indicator is computed only once.
df = df.assign(gender=np.where(df["sex"] == "Male", 1, 0))

# Select by the generic "number" kind rather than listing int64/float64:
# the integer width produced by `np.where` may not be int64 on every
# platform, and a fixed-width filter would then drop the indicator.
df_numeric = df.select_dtypes(include="number")
print(df_numeric.corr())


                Unnamed: 0       age  hours-per-week  salary K$    gender
Unnamed: 0        1.000000 -0.001126       -0.001890   0.000080 -0.001405
age              -0.001126  1.000000        0.101599   0.208203  0.081993
hours-per-week   -0.001890  0.101599        1.000000   0.196377  0.231268
salary K$         0.000080  0.208203        0.196377   1.000000  0.182662
gender           -0.001405  0.081993        0.231268   0.182662  1.000000


In [118]:
# One-line takeaway from the correlation matrix printed by the previous cell.
correlation_summary = "Overall, the correlations between these columns are relatively weak."
print(correlation_summary)

Overall, the correlations between these columns are relatively weak.
