In [7]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [70]:
df = pd.read_csv("../data/adult.csv", header=None)
df.columns = ["index", "age", "workclass", "education", "material-status", "occupation", "relationship", "race", "sex", "hours-per-week", "native-country", "salary", "salary K$"]
df = df.replace("?", np.nan)
df = df.dropna()

df["salary K$"] = pd.to_numeric(df["salary K$"])

df = df[((df["salary"] == "<=50K") & (df["salary K$"] <= 50)) | ((df["salary"] == ">50K") & (df["salary K$"] > 50))]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 30162 entries, 1 to 32561
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            30162 non-null  float64
 1   age              30162 non-null  object 
 2   workclass        30162 non-null  object 
 3   education        30162 non-null  object 
 4   material-status  30162 non-null  object 
 5   occupation       30162 non-null  object 
 6   relationship     30162 non-null  object 
 7   race             30162 non-null  object 
 8   sex              30162 non-null  object 
 9   hours-per-week   30162 non-null  object 
 10  native-country   30162 non-null  object 
 11  salary           30162 non-null  object 
 12  salary K$        30162 non-null  int64  
dtypes: float64(1), int64(1), object(11)
memory usage: 3.2+ MB
None


# Task 1
Print the count of men and women in the dataset.

In [71]:
count = df["sex"].value_counts()
print(count)

sex
Male      20380
Female     9782
Name: count, dtype: int64


# Task 2
Find the average age of men in dataset

In [93]:
df["age"] = pd.to_numeric(df["age"], errors="coerce")
average_age_men = df[df["sex"] == "Male"]["age"].mean()
print(average_age_men)


39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [94]:
poland_people_count = len(df[df["native-country"] == "Poland"])
poland_percentage = (poland_people_count / len(df)) * 100
print(poland_percentage)

0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [95]:
high_salary = df[df["salary"] == ">50K"]
mean_high_salary_age = high_salary["age"].mean()
std_high_salary_age = high_salary["age"].std()
print(std_high_salary_age)

10.269632835673852


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [99]:
high_education_levels = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
high_salary_no_high_education = df[(df["salary"] == ">50K") & (~df["education"].isin(high_education_levels))]
if not high_salary_no_high_education.empty:
    print(f"There are people with salary > 50K, but without higher education: {high_salary_no_high_education}")
else:
    print("All people with salary > 50K have higher education.")

There are people with salary > 50K, but without higher education:          index  age         workclass     education     material-status   
8          7.0   52  Self-emp-not-inc       HS-grad  Married-civ-spouse  \
11        10.0   37           Private  Some-college  Married-civ-spouse   
56        55.0   43           Private  Some-college  Married-civ-spouse   
68        67.0   53           Private       HS-grad  Married-civ-spouse   
69        68.0   49      Self-emp-inc  Some-college  Married-civ-spouse   
...        ...  ...               ...           ...                 ...   
32463  32462.0   48      Self-emp-inc       HS-grad  Married-civ-spouse   
32519  32518.0   57         Local-gov       HS-grad  Married-civ-spouse   
32520  32519.0   46           Private  Some-college  Married-civ-spouse   
32558  32557.0   40           Private       HS-grad  Married-civ-spouse   
32561  32560.0   52      Self-emp-inc       HS-grad  Married-civ-spouse   

              occupation relation

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [101]:
type_of_education = df.groupby("education")["age"].describe()
print(type_of_education)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [118]:
married_men = df[(df["sex"] == "Male") & (df["material-status"].str.startswith("Married"))]
unmarried_men = df[(df["sex"] == "Male") & ~(df["material-status"].str.startswith("Married"))]

married_men_salary = married_men["salary"].value_counts(normalize=True)
unmarried_men_salary = unmarried_men["salary"].value_counts(normalize=True)

print("Percentage of married men who earn >50K: {:.2f}%".format(married_men_salary[">50K"] * 100))
print("Percentage of married men who earn <=50K: {:.2f}%".format(married_men_salary["<=50K"] * 100))
print("Percentage of unmarried men who earn >50K: {:.2f}%".format(unmarried_men_salary[">50K"] * 100))
print("Percentage of unmarried men who earn <=50K: {:.2f}%".format(unmarried_men_salary["<=50K"] * 100))

Percentage of married men who earn >50K: 44.80%
Percentage of married men who earn <=50K: 55.20%
Percentage of unmarried men who earn >50K: 8.85%
Percentage of unmarried men who earn <=50K: 91.15%


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [119]:
max_hours = df["hours-per-week"].max()
num_people_max_hours = df["hours-per-week"].value_counts()[max_hours]
print("Maximum hours per week:", max_hours)
print("Number of people who work that many hours:", num_people_max_hours)

Maximum hours per week: 99
Number of people who work that many hours: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [124]:
df[["age", "hours-per-week", "salary K$"]].corr()

Unnamed: 0,age,hours-per-week,salary K$
age,1.0,0.101599,0.208203
hours-per-week,0.101599,1.0,0.196378
salary K$,0.208203,0.196378,1.0
