In [70]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [71]:
df = pd.read_csv(filepath_or_buffer="../data/adult.csv")
df = df.drop(columns=["Unnamed: 0"])

In [72]:
df = df[df["workclass"] != "?"]
df = df[df["occupation"] != "?"]
df = df[df["native-country"] != "?"]
salary_under_50K = df[(df["salary"] == "<=50K") & (df["salary K$"] > 50)]
salary_above_50K = df[(df["salary"] == ">50K") & (df["salary K$"] <= 50)]
print(f"Total amount of incorrect data for salary and salary $K columns are {len(salary_under_50K) + len(salary_above_50K)}")

Total amount of incorrect data for salary and salary $K columns are 0


# Task 1
Print the count of men and women in the dataset.

In [73]:
df["sex"].value_counts()

Male      20380
Female     9782
Name: sex, dtype: int64

# Task 2
Find the average age of men in dataset

In [74]:
round(df[df["sex"] == "Male"]["age"].mean(), 2)

39.18

# Task 3
Get the percentage of people from Poland (native-country)

In [75]:
total_people = len(df["native-country"])
people_from_poland = len(df[df["native-country"] == "Poland"])
result = round(people_from_poland / total_people * 100, 2)
print(f"The percentage of people from Poland is {result}")

The percentage of people from Poland is 0.19


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [76]:
average_age_under_50 = round(df[df["salary K$"] <= 50]["age"].mean(), 2)
standard_deviation_under_50 = round(df[df["salary K$"] <= 50]["age"].std(), 2)
average_age_over_50 = round(df[df["salary K$"] > 50]["age"].mean(), 2)
standard_deviation_over_50 = round(df[df["salary K$"] > 50]["age"].std(), 2)
print(f"The average age for people who earn less then 50K per year is '{average_age_under_50}', standard deviation is '{standard_deviation_under_50}'")
print(f"The average age for people who earn more then 50K per year is '{average_age_over_50}', standard deviation is '{standard_deviation_over_50}'")

The average age for people who earn less then 50K per year is '36.61', standard deviation is '13.46'
The average age for people who earn more then 50K per year is '43.96', standard deviation is '10.27'


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [77]:
education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
people_without_higher_education = df[(df["salary"] == ">50K") & (~df["education"].isin(education))]
print(f"The amount of people without higher education are '{len(people_without_higher_education)}'")

The amount of people without higher education are '3178'


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [78]:
df.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [79]:
married_over_50K = df[(df["salary"] == ">50K") & df["marital-status"].str.startswith("Married")]
married_under_50K = df[(df["salary"] == "<=50K") & df["marital-status"].str.startswith("Married")]
not_married_over_50K = df[(df["salary"] == ">50K") & ~ df["marital-status"].str.startswith("Married")]
not_married_under_50K = df[(df["salary"] == "<=50K") & ~ df["marital-status"].str.startswith("Married")]
print(f"Total amount of 'Married' men, who earns more then '50K' is {(len(married_over_50K))}, less then '50K' - {len(married_under_50K)}")
print(f"Total amount of 'NOT Married' men, who earns more then '50K' is {len(not_married_over_50K)}, less then '50K' - {len(not_married_under_50K)}")

Total amount of 'Married' men, who earns more then '50K' is 6440, less then '50K' - 8016
Total amount of 'NOT Married' men, who earns more then '50K' is 1068, less then '50K' - 14638


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [80]:
max_hors_per_week = df["hours-per-week"].max()
print(f"The max hours per week are {max_hors_per_week}")
df["hours-per-week"].value_counts().sort_index()
df.head(2)


The max hours per week are 99


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [81]:
correlation_data = df[["age", "hours-per-week", "salary K$"]].corr()
print(correlation_data)
print()
print("Correlation between data in the dataset shows that salary has a weak dependence on 'hours-per-week' (0.19%) and 'age' (0.20%)."
      " Correlation values of 0.2 and below indicate a low dependency of salary on these columns."
      "Based on this, we can assume that education is the main factor influencing the salary.")

                     age  hours-per-week  salary K$
age             1.000000        0.101599   0.208203
hours-per-week  0.101599        1.000000   0.196378
salary K$       0.208203        0.196378   1.000000

Correlation between data in the dataset shows that salary has a weak dependence on 'hours-per-week' (0.19%) and 'age' (0.20%). Correlation values of 0.2 and below indicate a low dependency of salary on these columns.Based on this, we can assume that education is the main factor influencing the salary.
