In [2]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contain `?` in some columns.
Also check for data correctness (salary & salary $K).

In [9]:
df = pd.read_csv("../data/adult.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25


In [12]:
df = df[~df.applymap(lambda x: "?" in str(x)).any(axis=1)]

In [70]:
df["salary_range_check"] = df.apply(lambda row: (row["salary"] == "<=50K" and row["salary K$"] <= 50)
                                      or (row["salary"] == ">50K" and row["salary K$"] > 50), axis=1)
df = df[df["salary_range_check"] == True]

# Task 1
Print the count of men and women in the dataset.

In [71]:
gender_count = df["sex"].value_counts()
print(gender_count)

Male      20380
Female     9782
Name: sex, dtype: int64


# Task 2
Find the average age of men in dataset

In [20]:
average_age_of_men = df[df["sex"] == "Male"]["age"].mean()
print("Average age of men:", round(average_age_of_men))

Average age of men: 39


# Task 3
Get the percentage of people from Poland (native-country)

In [31]:
people_from_poland = len(df[df["native-country"] == "Poland"]) / len(df) * 100
print("The percentage of people from Poland:", round(people_from_poland, 2), "%")

The percentage of people from Poland: 0.19 %


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [38]:
average_age_of_people_more_than_50k = df[df["salary"] == ">50K"]["age"].mean()
std_age_of_people_more_than_50k = df[df["salary"] == ">50K"]["age"].std()
print("Average age for people who earn > 50K per year:", round(average_age_of_people_more_than_50k))
print("Standard deviation of the age for people who earn > 50K per year:", round(std_age_of_people_more_than_50k))

Average age for people who earn > 50K per year: 44
Standard deviation of the age for people who earn > 50K per year: 10


In [39]:
average_age_of_people_more_than_50k = df[df["salary"] == "<=50K"]["age"].mean()
std_age_of_people_more_than_50k = df[df["salary"] == "<=50K"]["age"].std()
print("Average age for people who earn <= 50K per year:", round(average_age_of_people_more_than_50k))
print("Standard deviation of the age for people who earn <= 50K per year:", round(std_age_of_people_more_than_50k))

Average age for people who earn <= 50K per year: 37
Standard deviation of the age for people who earn <= 50K per year: 13


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [43]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

condition = (df["education"].isin(higher_education) == False) & (df["salary"] == ">50K")
people_without_higher_edu_and_high_salary = len(df[condition])

print("People without higher education and >50K salary:", people_without_higher_edu_and_high_salary)

People without higher education and >50K salary: 3178


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [44]:
education_age_stats = df.groupby("education")["age"].describe()

print("Statistics of age for each type of education:")
print(education_age_stats)

Statistics of age for each type of education:
               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-gr

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [48]:
married_men = df[(df["sex"] == "Male") & (df["marital-status"].str.startswith("Married"))]
non_married_men = df[(df["sex"] == "Male") & (~df["marital-status"].str.startswith("Married"))]

married_high_salary_percentage = (married_men["salary"] == ">50K").mean() * 100
non_married_high_salary_percentage = (non_married_men["salary"] == ">50K").mean() * 100

print("Married men who earn >50K: {:.2f}%".format(married_high_salary_percentage))
print("Non married men who earn >50K: {:.2f}%".format(non_married_high_salary_percentage))
if married_high_salary_percentage > non_married_high_salary_percentage:
    print("Married men earn more")
else:
    print("Non-married men earn more")

Married men who earn >50K: 44.80%
Non married men who earn >50K: 8.85%
Married men earn more


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [51]:
max_hours_per_week = df["hours-per-week"].max()
print("Max hours per week some person works:", max_hours_per_week)

Max hours per week some person works: 99


In [55]:
same_hours_count = df["hours-per-week"].value_counts()
filtered_same_hours_count = same_hours_count[same_hours_count > 1]

print(filtered_same_hours_count)

40    14251
50     2718
45     1753
60     1405
35     1184
      ...  
88        2
97        2
89        2
73        2
95        2
Name: hours-per-week, Length: 89, dtype: int64


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [58]:
numeric_columns = df.select_dtypes(include=[np.number]).drop(columns=["Unnamed: 0"])
correlation_matrix = numeric_columns.corr()
print(correlation_matrix)

                     age  hours-per-week  salary K$
age             1.000000        0.101599   0.208203
hours-per-week  0.101599        1.000000   0.196378
salary K$       0.208203        0.196378   1.000000


In [59]:
print("The correlation coefficient between age and salary K$ is approximately 0.208. This suggests a weak positive correlation between age and salary. As age increases, there is a tendency for salary to increase as well, although the correlation is not very strong.")
print("The correlation coefficient between hours-per-week and salary K$ is approximately 0.196. This indicates a weak positive correlation between the number of hours worked per week and salary. People who work more hours per week tend to have slightly higher salaries, although the correlation is not very strong")
print("The correlation coefficient between age and hours-per-week is approximately 0.102. This indicates a very weak positive correlation between age and the number of hours worked per week. There is not a strong relationship between age and the number of hours worked")

The correlation coefficient between age and salary K$ is approximately 0.208. This suggests a weak positive correlation between age and salary. As age increases, there is a tendency for salary to increase as well, although the correlation is not very strong.
The correlation coefficient between hours-per-week and salary K$ is approximately 0.196. This indicates a weak positive correlation between the number of hours worked per week and salary. People who work more hours per week tend to have slightly higher salaries, although the correlation is not very strong
The correlation coefficient between age and hours-per-week is approximately 0.102. This indicates a very weak positive correlation between age and the number of hours worked per week. There is not a strong relationship between age and the number of hours worked


In [66]:
numeric_columns["is_male"] = np.where(df["sex"] == "Male", True, False)
numeric_columns["is_married"] = np.where(df["marital-status"].str.startswith("Married"), True, False)

higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]
numeric_columns["has_higher_education"] = np.where(df["education"].isin(higher_education), True, False)

correlation_matrix = numeric_columns.corr()
print(correlation_matrix)

                           age  hours-per-week  salary K$   is_male  \
age                   1.000000        0.101599   0.208203  0.081993   
hours-per-week        0.101599        1.000000   0.196378  0.231268   
salary K$             0.208203        0.196378   1.000000  0.182642   
is_male               0.081993        0.231268   0.182642  1.000000   
is_married            0.310553        0.221492   0.375202  0.426348   
has_higher_education  0.078760        0.138793   0.254759  0.024537   

                      is_married  has_higher_education  
age                     0.310553              0.078760  
hours-per-week          0.221492              0.138793  
salary K$               0.375202              0.254759  
is_male                 0.426348              0.024537  
is_married              1.000000              0.091111  
has_higher_education    0.091111              1.000000  


In [67]:
print("In addition, there exist a slight correlation between the income and marital status of the person - if person is married the higher salary he/she might have.")
print("There is also a tendency that mostly men are married and men might work more hours per week.")

In addition, there exist a slight correlation between the income and marital status of the person - if person is married the higher salary he/she might have.
There is also a tendency that mostly men are married and men might work more hours per week.
