In [108]:
from pathlib import Path

import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and clean it: remove every row that contains `?` in any column.
Also check the data for correctness (`salary` must agree with `salary K$`).

In [109]:
# Resolve the dataset relative to the project rather than through a
# machine-specific absolute path: look for data/adult.csv in the working
# directory, then in each parent directory, so the notebook runs from any
# checkout location.
DATA_FILE = Path("data") / "adult.csv"
data_path = next(
    (
        base / DATA_FILE
        for base in (Path.cwd(), *Path.cwd().parents)
        if (base / DATA_FILE).is_file()
    ),
    None,
)
if data_path is None:
    raise FileNotFoundError(
        f"{DATA_FILE} not found in {Path.cwd()} or any of its parent directories"
    )

# Cells holding "?" are treated as missing values: convert them to NA and drop
# every row that has at least one. Chaining avoids mutating a frame in place.
data = pd.read_csv(data_path).replace("?", pd.NA).dropna()

def check_salary(row):
    """Tell whether a row's salary label agrees with its numeric salary.

    Args:
        row: Mapping-like record exposing the keys "salary" (bracket label)
            and "salary K$" (numeric amount).

    Returns:
        For the label ">50K": whether "salary K$" is greater than 50.
        For any other label: whether "salary K$" is less than or equal to 50.
    """
    return (
        row["salary K$"] > 50
        if row["salary"] == ">50K"
        else row["salary K$"] <= 50
    )
    
# Apply the consistency rule to every row and keep only the rows that pass.
# The boolean result is used directly as a mask, so no helper column has to be
# added to the frame and removed again afterwards.
salary_is_consistent = data.apply(check_salary, axis=1)
data = data[salary_is_consistent]

print(data.head())

   Unnamed: 0  age         workclass  education      marital-status  \
0           0   39         State-gov  Bachelors       Never-married   
1           1   50  Self-emp-not-inc  Bachelors  Married-civ-spouse   
2           2   38           Private    HS-grad            Divorced   
3           3   53           Private       11th  Married-civ-spouse   
4           4   28           Private  Bachelors  Married-civ-spouse   

          occupation   relationship   race     sex  hours-per-week  \
0       Adm-clerical  Not-in-family  White    Male              40   
1    Exec-managerial        Husband  White    Male              13   
2  Handlers-cleaners  Not-in-family  White    Male              40   
3  Handlers-cleaners        Husband  Black    Male              40   
4     Prof-specialty           Wife  Black  Female              40   

  native-country salary  salary K$  
0  United-States  <=50K         39  
1  United-States  <=50K         35  
2  United-States  <=50K         27  
3  U

# Task 1
Print the count of men and women in the dataset.

In [110]:
# Count rows per sex by summing boolean masks (each True contributes 1).
men_number = (data["sex"] == "Male").sum()
women_number = (data["sex"] == "Female").sum()

print(f"Men: {men_number}, Women: {women_number}")

Men: 20380, Women: 9782


# Task 2
Find the average age of men in the dataset.

In [111]:
# Select the male rows first, then average their ages.
is_male = data["sex"] == "Male"
average_men_age = data[is_male]["age"].mean()
# int() truncates the fractional part, so the age is shown in whole years.
print(int(average_men_age))

39


# Task 3
Get the percentage of people from Poland (native-country)

In [112]:
# Share of rows whose native country is Poland, expressed in percent.
is_from_poland = data["native-country"] == "Poland"
total_count = len(data)
people_from_poland = int(is_from_poland.sum())

percentage_from_poland = people_from_poland / total_count * 100
print(round(percentage_from_poland, 1))

0.2


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [113]:
# Mean and standard deviation of age for each salary bracket.
# The aggregations are named by string so they always resolve to Series.mean
# and Series.std (sample std, ddof=1). Passing np.mean / np.std is deprecated
# in recent pandas releases, and np.std on its own defaults to ddof=0, so the
# printed numbers could silently change once pandas calls it directly.
age_statistics = ["mean", "std"]
age_data_greater_50K = data.loc[data["salary"] == ">50K", "age"].agg(age_statistics)
age_data_less_equal_50K = data.loc[data["salary"] == "<=50K", "age"].agg(age_statistics)

print(f"{age_data_greater_50K.round(1)},\n{age_data_less_equal_50K.round(1)}")

mean    44.0
std     10.3
Name: age, dtype: float64,
mean    36.6
std     13.5
Name: age, dtype: float64


# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless have a > 50K salary.

In [114]:
# Education levels that count as "higher education" for this task.
higher_education = [
    "Bachelors",
    "Prof-school",
    "Assoc-acdm",
    "Assoc-voc",
    "Masters",
    "Doctorate",
]

# Rows that earn more than 50K without any of the levels listed above.
has_higher_education = data["education"].isin(higher_education)
earns_over_50k = data["salary"] == ">50K"
people = data.loc[~has_higher_education & earns_over_50k]

print(people)

       Unnamed: 0  age         workclass     education      marital-status  \
7               7   52  Self-emp-not-inc       HS-grad  Married-civ-spouse   
10             10   37           Private  Some-college  Married-civ-spouse   
55             55   43           Private  Some-college  Married-civ-spouse   
67             67   53           Private       HS-grad  Married-civ-spouse   
68             68   49      Self-emp-inc  Some-college  Married-civ-spouse   
...           ...  ...               ...           ...                 ...   
32462       32462   48      Self-emp-inc       HS-grad  Married-civ-spouse   
32518       32518   57         Local-gov       HS-grad  Married-civ-spouse   
32519       32519   46           Private  Some-college  Married-civ-spouse   
32557       32557   40           Private       HS-grad  Married-civ-spouse   
32560       32560   52      Self-emp-inc       HS-grad  Married-civ-spouse   

              occupation relationship   race     sex  hours-per

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [115]:
# Group the ages by education level, then summarize each group with describe().
age_by_education = data.groupby("education")["age"]
education_age_stat = age_by_education.describe()

print(education_age_stat)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the salaries of married and non-married men. Which group earns more (>50K or <=50K)?
Married men are those whose `marital-status` starts with "Married". All others are non-married.

In [116]:
# Row masks: a person is "married" when marital-status starts with "Married".
married_people = data["marital-status"].str.startswith("Married")
is_male = data["sex"] == "Male"

# Split the men into the two groups being compared.
married_men = data[is_male & married_people]
non_married_men = data[is_male & ~married_people]

# Number of men in each salary bracket, per group.
married_men_salary = married_men["salary"].value_counts()
non_married_men_salary = non_married_men["salary"].value_counts()

# The heading previously read "Merried men"; the spelling is corrected here.
print(f"Married men:\n{married_men_salary}\n"
      f"Non-married men:\n{non_married_men_salary}")

Merried men:
<=50K    7052
>50K     5723
Name: salary, dtype: int64
Non-married men:
<=50K    6932
>50K      673
Name: salary, dtype: int64


# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [117]:
# Find the largest weekly workload, then count the rows that reach it.
weekly_hours = data["hours-per-week"]
max_hours_per_week = weekly_hours.max()
people_working_max_hours = int((weekly_hours == max_hours_per_week).sum())

print(f"Max hours: {max_hours_per_week}\nCount workers:{people_working_max_hours}")

Max hours: 99
Count workers:78


# Task 9
Analyze the correlations between the fields of the dataset. Identify which fields are connected and print or highlight their connection.

In [118]:
# Correlate the numeric columns only. The frame also holds text columns
# (workclass, education, ...), and DataFrame.corr() in pandas >= 2.0 raises on
# those instead of silently skipping them, so they are excluded explicitly.
correlation_matrix = data.select_dtypes(include="number").corr()

print(correlation_matrix)

                Unnamed: 0       age  hours-per-week  salary K$
Unnamed: 0        1.000000 -0.001126       -0.001890   0.000129
age              -0.001126  1.000000        0.101599   0.208203
hours-per-week   -0.001890  0.101599        1.000000   0.196378
salary K$         0.000129  0.208203        0.196378   1.000000
