In [12]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [23]:
df = pd.read_csv("adult.csv")
df = df[~df.isin(["?"]).any(axis=1)]
df.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25


# Task 1
Print the count of men and women in the dataset.

In [27]:
gender_counts = df["sex"].value_counts()
print(gender_counts)

Male      20380
Female     9782
Name: sex, dtype: int64


# Task 2
Find the average age of men in dataset

In [28]:
average_age_men = df[df["sex"] == "Male"]["age"].mean()
print(average_age_men)

39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [26]:
total_count = len(df)
poland_count = len(df[df["native-country"] == "Poland"])
percentage_poland = (poland_count / total_count) * 100
print(percentage_poland)

0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [29]:
high_earning_age = df[df["salary"] == ">50K"]["age"]
mean_age_high_earning = high_earning_age.mean()
std_age_high_earning = high_earning_age.std()
print("Mean age for > 50K: ", mean_age_high_earning)
print("Standard deviation for > 50K: ", std_age_high_earning)

low_earning_age = df[df["salary"] == "<=50K"]["age"]
mean_age_low_earning = low_earning_age.mean()
std_age_low_earning = low_earning_age.std()
print("Mean age for <= 50K: ", mean_age_low_earning)
print("Standard deviation for <= 50K: ", std_age_low_earning)

Mean age for > 50K:  43.95911028236548
Standard deviation for > 50K:  10.269632835673852
Mean age for <= 50K:  36.60806038668668
Standard deviation for <= 50K:  13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [30]:
filtered_df = df[
    (df["education"].isin(["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"])) &
    (df["salary"] == ">50K")
]

if filtered_df.empty:
    print("There are no people without higher education and > 50K salary.")
else:
    print("There are people without higher education but with > 50K salary.")
    print(filtered_df)

There are people without higher education but with > 50K salary.
       Unnamed: 0  age         workclass   education      marital-status  \
8               8   31           Private     Masters       Never-married   
9               9   42           Private   Bachelors  Married-civ-spouse   
11             11   30         State-gov   Bachelors  Married-civ-spouse   
19             19   43  Self-emp-not-inc     Masters            Divorced   
20             20   40           Private   Doctorate  Married-civ-spouse   
...           ...  ...               ...         ...                 ...   
32533       32533   54           Private   Bachelors  Married-civ-spouse   
32536       32536   34           Private   Bachelors       Never-married   
32538       32538   38           Private   Bachelors            Divorced   
32545       32545   39         Local-gov  Assoc-acdm  Married-civ-spouse   
32554       32554   53           Private     Masters  Married-civ-spouse   

            occupation

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [31]:
age_stats_by_education = df.groupby("education")["age"].describe()
print(age_stats_by_education)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [36]:
df["marital_status"] = df["marital-status"].str.startswith("Married")
salary_counts = df.groupby(["marital_status", "salary"])["age"].count().reset_index()

married_50k_count = salary_counts[(salary_counts["marital_status"] == True) & (salary_counts["salary"] == ">50K")]["age"].values[0] if ">50K" in salary_counts["salary"].values else 0

married_lessthan50k_count = salary_counts[(salary_counts["marital_status"] == True) & (salary_counts["salary"] == "<=50K")]["age"].values[0] if "<=50K" in salary_counts["salary"].values else 0

nonmarried_50k_count = salary_counts[(salary_counts["marital_status"] == False) & (salary_counts["salary"] == ">50K")]["age"].values[0] if ">50K" in salary_counts["salary"].values else 0

nonmarried_lessthan50k_count = salary_counts[(salary_counts["marital_status"] == False) & (salary_counts["salary"] == "<=50K")]["age"].values[0] if "<=50K" in salary_counts["salary"].values else 0

if married_50k_count > nonmarried_50k_count:
    print("Married men earn more (>50K) than non-married men.")
elif married_50k_count < nonmarried_50k_count:
    print("Non-married men earn more (>50K) than married men.")
else:
    print("Married and non-married men have an equal count of salaries (>50K).")

Married men earn more (>50K) than non-married men.


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [37]:
max_hours = df['hours-per-week'].max()

same_hours_count = df[df['hours-per-week'] == max_hours].shape[0]

print("Maximum hours per week:", max_hours)
print("Number of people working the same hours:", same_hours_count)

Maximum hours per week: 99
Number of people working the same hours: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [38]:
result = df.corr()

print(result)

                Unnamed: 0       age  hours-per-week  salary K$  \
Unnamed: 0        1.000000 -0.001126       -0.001890   0.000129   
age              -0.001126  1.000000        0.101599   0.208203   
hours-per-week   -0.001890  0.101599        1.000000   0.196378   
salary K$         0.000129  0.208203        0.196378   1.000000   
marital_status    0.003732  0.310553        0.221492   0.375202   

                marital_status  
Unnamed: 0            0.003732  
age                   0.310553  
hours-per-week        0.221492  
salary K$             0.375202  
marital_status        1.000000  
