In [34]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [35]:
dataset = pd.read_csv("../data/adult.csv", na_values="?")
correct_format_of_salary = dataset["salary"].str.match(r"[<>]=?\d+[Kk]").all()
print(correct_format_of_salary)

True


# Task 1
Print the count of men and women in the dataset.

In [36]:
gender_counts = dataset["sex"].value_counts()
print(gender_counts)

sex
Male      21790
Female    10771
Name: count, dtype: int64


# Task 2
Find the average age of men in dataset

In [37]:
average_age_of_men = dataset[dataset["sex"] == "Male"]["age"].mean()
print(average_age_of_men.round(0))

39.0


# Task 3
Get the percentage of people from Poland (native-country)

In [38]:
people_from_poland = (len(dataset[dataset["native-country"] == "Poland"]) / len(dataset)) * 100
print(people_from_poland)

0.18426952489174164


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [39]:
rich_guys_mean_age = dataset[dataset['salary'] == '>50K']['age'].mean()
print(rich_guys_mean_age)
rich_guys_std_age = dataset[dataset['salary'] == '>50K']['age'].std()
print(rich_guys_std_age)
poor_guys_mean_age = dataset[dataset['salary'] == '<=50K']['age'].mean()
print(poor_guys_mean_age)
poor_guys_std_age = dataset[dataset['salary'] == '<=50K']['age'].std()
print(poor_guys_std_age)

44.24984058155847
10.519027719851826
36.78373786407767
14.02008849082488


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [40]:
number_of_rich_and_uneducated = dataset[~dataset['education'].isin(['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']) & (dataset['salary'] == '>50K')]
print(len(number_of_rich_and_uneducated))

3306


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [41]:
education = dataset.groupby('education')['age'].describe()
print(education)

                count       mean        std   min    25%   50%   75%   max
education                                                                 
10th            933.0  37.429796  16.720713  17.0  22.00  34.0  52.0  90.0
11th           1175.0  32.355745  15.545485  17.0  18.00  28.0  43.0  90.0
12th            433.0  32.000000  14.334625  17.0  19.00  28.0  41.0  79.0
1st-4th         168.0  46.142857  15.615625  19.0  33.00  46.0  57.0  90.0
5th-6th         333.0  42.885886  15.557285  17.0  29.00  42.0  54.0  84.0
7th-8th         646.0  48.445820  16.092350  17.0  34.25  50.0  61.0  90.0
9th             514.0  41.060311  15.946862  17.0  28.00  39.0  54.0  90.0
Assoc-acdm     1067.0  37.381443  11.095177  19.0  29.00  36.0  44.0  90.0
Assoc-voc      1382.0  38.553546  11.631300  19.0  30.00  37.0  46.0  84.0
Bachelors      5355.0  38.904949  11.912210  19.0  29.00  37.0  46.0  90.0
Doctorate       413.0  47.702179  11.784716  24.0  39.00  47.0  55.0  80.0
HS-grad       10501.0  38

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [42]:
def extract_salary(salary_string):
    salary_int = ""
    for i in salary_string:
        if i.isdigit():
            salary_int += i
    return int(salary_int)

dataset["salary"] = dataset["salary"].apply(extract_salary)

married_men = dataset[(dataset["sex"] == "Male") & (dataset["marital-status"].str.startswith("Married"))]
non_married_men = dataset[(dataset["sex"] == "Male") & (~dataset["marital-status"].str.startswith("Married"))]

married_men_salary_mean = married_men["salary"].mean()
non_married_men_salary_mean = non_married_men["salary"].mean()

print(married_men_salary_mean > non_married_men_salary_mean)


False


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [43]:
max_hours = dataset["hours-per-week"].max()
people_same_max_hours = (dataset['hours-per-week'] == max_hours).sum()
print(max_hours, people_same_max_hours)

99 85


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [44]:
numeric_columns = dataset.select_dtypes(include=np.number)
correlation_matrix = numeric_columns.corr()
print(correlation_matrix)

                Unnamed: 0       age  hours-per-week  salary  salary K$
Unnamed: 0        1.000000  0.001286        0.000607     NaN  -0.001666
age               0.001286  1.000000        0.068756     NaN   0.201774
hours-per-week    0.000607  0.068756        1.000000     NaN   0.196916
salary                 NaN       NaN             NaN     NaN        NaN
salary K$        -0.001666  0.201774        0.196916     NaN   1.000000
