In [2]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and clean it: remove every row that contains `?` in any column.
Also check the data for correctness: the `salary` and `salary K$` columns must agree with each other.

In [3]:
# Load the raw data; the cleaned frame is stored in `df` for the following cells.
df = pd.read_csv("../data/adult.csv")

# Drop every row that holds the "?" placeholder in any column. Comparing the
# whole frame at once is vectorised, unlike a row-by-row `apply`.
df = df[~(df == "?").any(axis=1)].copy()

# True where the textual salary bracket starts with ">" (the ">50K" bracket).
condition = df["salary"].astype(str).str[0] == ">"

# Store the bracket as a real boolean column so `~` and `&` act as logical
# operators on it (on an object column of Python bools, `~True` is -2).
df["salary"] = condition

# Keep only rows whose bracket agrees with the numeric "salary K$" value.
# Each comparison needs its own parentheses: `&` binds tighter than `>` and
# `<=`, so `a & b > 50` is evaluated as `(a & b) > 50`.
salary_k = df["salary K$"]
correct_rows = df[(condition & (salary_k > 50)) | (~condition & (salary_k <= 50))]

# `.copy()` hands later cells an independent frame they can add columns to.
df = correct_rows.copy()

# Task 1
Print the count of men and women in the dataset.

In [4]:
# Number of rows for each distinct value of the "sex" column.
sex_counts = df["sex"].value_counts()

# Report both groups; a value that does not occur in the data is shown as 0.
for caption, sex_value in (("Count of Men:", "Male"), ("Count of Women:", "Female")):
    print(caption, sex_counts.get(sex_value, 0))

Count of Men: 20380
Count of Women: 9782


# Task 2
Find the average age of men in the dataset.

In [5]:
# Pick the ages of the male rows in a single `.loc` step, then average them.
is_male = df["sex"] == "Male"
average_age_men = df.loc[is_male, "age"].mean()

# The average is displayed rounded down to a whole number of years.
print(np.floor(average_age_men))

39.0


# Task 3
Get the percentage of people from Poland (native-country)

In [6]:
# Total number of rows in the cleaned dataset.
total_people = df.shape[0]

# Rows per native country; a country that never occurs counts as 0.
country_counts = df["native-country"].value_counts()
polish_people = country_counts.get("Poland", 0)

# Share of rows whose native country is Poland, expressed in percent.
polish_percentage = polish_people / total_people * 100

print(polish_percentage)

0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [7]:
# Split the rows by the salary flag; the True group is reported as ">50K" below.
income_groups = df.groupby("salary")

# Ages of the True group: fetch the group once, then take both statistics from it.
ages_above_50k = income_groups.get_group(True)["age"]
above_50k_mean_age = ages_above_50k.mean()
above_50k_std_age = ages_above_50k.std()

print("Mean age >50K:", above_50k_mean_age)
print("Standard deviation >50K:", above_50k_std_age)

# Same statistics for the False group, reported as "<=50K".
ages_below_50k = income_groups.get_group(False)["age"]
below_50k_mean_age = ages_below_50k.mean()
below_50k_std_age = ages_below_50k.std()

print("\nMean age <=50K:", below_50k_mean_age)
print("Standard deviation <=50K:", below_50k_std_age)


Mean age >50K: 43.95911028236548
Standard deviation >50K: 10.269632835673852

Mean age <=50K: 36.60806038668668
Standard deviation <=50K: 13.464631257161633


# Task 5
Check whether there are people without higher education (education other than Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless earn > 50K.

In [8]:
# Values of the "education" column that count as higher education.
higher_education_levels = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

# Rows flagged as earning > 50K (the "salary" column holds True for them).
earns_above_50k = df["salary"].eq(True)

# Rows whose education is not one of the higher-education levels.
lacks_higher_education = ~df["education"].isin(higher_education_levels)

# People who earn > 50K without higher education.
people_without_higher_education = df[earns_above_50k & lacks_higher_education]

print(len(people_without_higher_education))

3178


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [9]:
# Descriptive statistics of age (count, mean, std, min, quartiles, max),
# computed separately for every education level.
ages_by_education = df["age"].groupby(df["education"])
education_age_statistics = ages_by_education.describe()

print(education_age_statistics)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the salaries of married and non-married men. Which group earns more (>50K or <=50K)?
Married men are those whose `marital-status` starts with "Married"; all others are considered not married.

In [12]:
# Collapse marital status into two groups: values starting with "Married"
# versus everything else. The vectorised `.str.startswith` replaces a per-row
# lambda; `na=False` puts a missing status into "Not Married" instead of
# raising AttributeError on a non-string value.
is_married = df["marital-status"].str.startswith("Married", na=False)
df["marital-status-short"] = is_married.map({True: "Married", False: "Not Married"})

# Men only, grouped by the two-way marital status.
marital_status_short = df[df["sex"] == "Male"].groupby("marital-status-short")

# Percentage of each salary value within each marital group; a value that
# does not occur in a group is reported as 0.
results = marital_status_short["salary"].value_counts(normalize=True).unstack().fillna(0) * 100

print(results)

salary                    False      True 
marital-status-short                      
Married               55.201566  44.798434
Not Married           91.150559   8.849441


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [13]:
# Largest value of the "hours-per-week" column.
max_hours_per_week = df["hours-per-week"].max()

# All rows that reach that maximum.
works_max_hours = df["hours-per-week"].eq(max_hours_per_week)
people_with_max_hours = df.loc[works_max_hours]

# Row count of the selection above.
num_people_with_max_hours = people_with_max_hours.shape[0]

print("Max hours per week:", max_hours_per_week)
print("Number of people working max amount of hours:", num_people_with_max_hours)

Max hours per week: 99
Number of people working max amount of hours: 78


# Task 9
Analyze the correlation between the fields of the dataset. Identify which fields are connected and print them, highlighting their connection.

In [21]:
# Absolute correlation above this value counts as "highly correlated".
HIGH_CORRELATION_THRESHOLD = 0.7

# Only numeric columns can be correlated; the other columns are left out.
numeric_columns = df.select_dtypes(include="number")

# Pairwise correlation between all numeric columns.
correlation_matrix = numeric_columns.corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Mark the diagonal (each column paired with itself) by position. Excluding
# self-pairs with `abs() < 1` would also discard two different columns that are
# perfectly correlated, and would keep a diagonal entry affected by rounding.
is_self_pair = pd.DataFrame(
    np.eye(len(correlation_matrix), dtype=bool),
    index=correlation_matrix.index,
    columns=correlation_matrix.columns,
)
highly_correlated_fields = (correlation_matrix.abs() > HIGH_CORRELATION_THRESHOLD) & ~is_self_pair

# For every column, collect the other columns it is highly correlated with.
correlation_pairs = []

for col in highly_correlated_fields.columns:
    correlated_cols = highly_correlated_fields.index[highly_correlated_fields[col]].tolist()
    if correlated_cols:
        correlation_pairs.append((col, correlated_cols))

if correlation_pairs:
    print("\n", correlation_pairs, highly_correlated_fields)
else:
    print("\nNo highly correlated fields!")

Correlation Matrix:
                Unnamed: 0       age  hours-per-week  salary K$
Unnamed: 0        1.000000 -0.001126       -0.001890   0.000129
age              -0.001126  1.000000        0.101599   0.208203
hours-per-week   -0.001890  0.101599        1.000000   0.196378
salary K$         0.000129  0.208203        0.196378   1.000000

No highly correlated fields!
