In [1]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [7]:
df = pd.read_csv("C:/Users/User/python_advanced/py-adult-data-analysis/data/adult.csv")
df = df.replace("?", np.nan)
df = df.dropna()

print("Unique values in the 'salary' column:", df["salary"].unique())
df["salary $K"] = df["salary"].apply(lambda x: 0 if x == "<=50K" else 1)

print("Unique values in the new 'salary $K' column:", df["salary $K"].unique())

Unique values in the 'salary' column: ['<=50K' '>50K']
Unique values in the new 'salary $K' column: [0 1]


# Task 1
Print the count of men and women in the dataset.

In [8]:
# write your code for task 1 here
sex_counts = df["sex"].value_counts()
print("Count of men:", sex_counts["Male"])
print("Count of women:", sex_counts["Female"])

Count of men: 20380
Count of women: 9782


# Task 2
Find the average age of men in dataset

In [9]:
# write your code for task 2 here
average_age_men = df[df["sex"] == "Male"]["age"].mean()
print("Average age of men:", average_age_men)

Average age of men: 39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [10]:
# write your code for task 3 here
poland_count = df[df["native-country"] == "Poland"].shape[0]
total_count = df.shape[0]
percentage_poland = (poland_count / total_count) * 100
print("Percentage of people from Poland:", percentage_poland)

Percentage of people from Poland: 0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [11]:
# write your code for task 4 here
mean_age_high_income = df[df["salary"] == ">50K"]["age"].mean()
std_age_high_income = df[df["salary"] == ">50K"]["age"].std()

mean_age_low_income = df[df["salary"] == "<=50K"]["age"].mean()
std_age_low_income = df[df["salary"] == "<=50K"]["age"].std()

print("For people who earn > $50K per year:")
print("Mean age:", mean_age_high_income)
print("Standard deviation of age:", std_age_high_income)
print()
print("For people who earn <= $50K per year:")
print("Mean age:", mean_age_low_income)
print("Standard deviation of age:", std_age_low_income)

For people who earn > $50K per year:
Mean age: 43.95911028236548
Standard deviation of age: 10.269632835673852

For people who earn <= $50K per year:
Mean age: 36.60806038668668
Standard deviation of age: 13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [12]:
# write your code for task 5 here
higher_education_levels = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

people_without_higher_education = df[(df["salary"] == ">50K") & (~df["education"].isin(higher_education_levels))]

if people_without_higher_education.empty:
    print("There are no people with salary > $50K and without higher education.")
else:
    print("There are people with salary > $50K but without higher education:")
    print(people_without_higher_education)

There are people with salary > $50K but without higher education:
       Unnamed: 0  age         workclass     education      marital-status  \
7               7   52  Self-emp-not-inc       HS-grad  Married-civ-spouse   
10             10   37           Private  Some-college  Married-civ-spouse   
55             55   43           Private  Some-college  Married-civ-spouse   
67             67   53           Private       HS-grad  Married-civ-spouse   
68             68   49      Self-emp-inc  Some-college  Married-civ-spouse   
...           ...  ...               ...           ...                 ...   
32462       32462   48      Self-emp-inc       HS-grad  Married-civ-spouse   
32518       32518   57         Local-gov       HS-grad  Married-civ-spouse   
32519       32519   46           Private  Some-college  Married-civ-spouse   
32557       32557   40           Private       HS-grad  Married-civ-spouse   
32560       32560   52      Self-emp-inc       HS-grad  Married-civ-spouse  

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [13]:
# write your code for task 6 here
education_age_stats = df.groupby("education")["age"].describe()

print("Statistics of age for each type of education:")
print(education_age_stats)

Statistics of age for each type of education:
               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-gr

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [14]:
# write your code for task 7 here
def is_married(status):
    return status.startswith("Married")

df['is_married'] = df['marital-status'].apply(is_married)

salary_comparison = df[df['sex'] == 'Male'].groupby(['is_married', 'salary']).size()

print("Comparison of salaries for married and non-married men:")
print(salary_comparison)

Comparison of salaries for married and non-married men:
is_married  salary
False       <=50K     6932
            >50K       673
True        <=50K     7052
            >50K      5723
dtype: int64


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [15]:
# write your code for task 8 here
max_hours_per_week = df['hours-per-week'].max()

num_people_max_hours = df[df['hours-per-week'] == max_hours_per_week].shape[0]

print("Maximum hours per week:", max_hours_per_week)
print("Number of people who work the same amount of hours per week:", num_people_max_hours)

Maximum hours per week: 99
Number of people who work the same amount of hours per week: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [17]:
# write your code for task 9 here
numeric_df = df.select_dtypes(include=['int64', 'float64'])

correlation_matrix = numeric_df.corr()

print("Correlation Matrix:")
print(correlation_matrix)

high_correlation = correlation_matrix[abs(correlation_matrix) >= 0.5]
print("\nFields with high correlation (absolute value >= 0.5):")
print(high_correlation)

Correlation Matrix:
                Unnamed: 0       age  hours-per-week  salary K$  salary $K
Unnamed: 0        1.000000 -0.001126       -0.001890   0.000129   0.006244
age              -0.001126  1.000000        0.101599   0.208203   0.241998
hours-per-week   -0.001890  0.101599        1.000000   0.196378   0.229480
salary K$         0.000129  0.208203        0.196378   1.000000   0.853894
salary $K         0.006244  0.241998        0.229480   0.853894   1.000000

Fields with high correlation (absolute value >= 0.5):
                Unnamed: 0  age  hours-per-week  salary K$  salary $K
Unnamed: 0             1.0  NaN             NaN        NaN        NaN
age                    NaN  1.0             NaN        NaN        NaN
hours-per-week         NaN  NaN             1.0        NaN        NaN
salary K$              NaN  NaN             NaN   1.000000   0.853894
salary $K              NaN  NaN             NaN   0.853894   1.000000
