In [None]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [17]:
df = pd.read_csv('../data/adult.csv')
df = df[(df[['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'hours-per-week', 'native-country', 'salary', 'salary K$']] != '?').all(axis=1)]
df['salary_check'] = (df['salary'] == '>50K') & (df['salary K$'] > 50) | (df['salary'] == '<=50K') & (df['salary K$'] <= 50)
correct_rows = df[df['salary_check']]
print("The number of rows with correct salary is: ", len(correct_rows))
df.info

The number of rows with correct salary is:  30162


<bound method DataFrame.info of        Unnamed: 0  age         workclass   education      marital-status  \
0               0   39         State-gov   Bachelors       Never-married   
1               1   50  Self-emp-not-inc   Bachelors  Married-civ-spouse   
2               2   38           Private     HS-grad            Divorced   
3               3   53           Private        11th  Married-civ-spouse   
4               4   28           Private   Bachelors  Married-civ-spouse   
...           ...  ...               ...         ...                 ...   
32556       32556   27           Private  Assoc-acdm  Married-civ-spouse   
32557       32557   40           Private     HS-grad  Married-civ-spouse   
32558       32558   58           Private     HS-grad             Widowed   
32559       32559   22           Private     HS-grad       Never-married   
32560       32560   52      Self-emp-inc     HS-grad  Married-civ-spouse   

              occupation   relationship   race     sex 

# Task 1
Print the count of men and women in the dataset.

In [12]:
print(df['sex'].value_counts())

sex
Male      20380
Female     9782
Name: count, dtype: int64


# Task 2
Find the average age of men in dataset

In [14]:
average_age_men = round(df[df['sex'] == 'Male']['age'].mean(), 2)
print("Average age of men in dataset: ", average_age_men)

Average age of men in dataset:  39.18


# Task 3
Get the percentage of people from Poland (native-country)

In [15]:
total_people = df.shape[0]  
people_from_poland = df[df['native-country'] == 'Poland'].shape[0]  
percentage_poland = (people_from_poland / total_people) * 100
percentage_poland = round(percentage_poland, 2)  
print("The percentage of people from Poland: ", percentage_poland)

The percentage of people from Poland:  0.19


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [16]:
high_income = df[df['salary'] == '>50K']
mean_age_high_income = round(high_income['age'].mean(), 2)
std_age_high_income = round(high_income['age'].std(), 2)
print('Mean age of high income people:', mean_age_high_income)
print('Standard deviation of age for high income people:', std_age_high_income)

low_income = df[df['salary'] == '<=50K']
mean_age_low_income = round(low_income['age'].mean(), 2)
std_age_low_income = round(low_income['age'].std(), 2)
print('Mean age of low income people:', mean_age_low_income)
print('Standard deviation of age for low income people:', std_age_low_income)

Mean age of high income people: 43.96
Standard deviation of age for high income people: 10.27
Mean age of low income people: 36.61
Standard deviation of age for low income people: 13.46


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [22]:
high_education = ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']
people_without_high_edu = df[(~df['education'].isin(high_education)) & (df['salary']==1)]
num_people_without_high_edu = people_without_high_edu.shape[0]
print("Number of people without higher education, but with > 50K salary:",num_people_without_high_edu)

Number of people without higher education, but with > 50K salary: 0


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [23]:
stats_age = df.groupby('education')['age'].describe()
print(stats_age)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [25]:
mask_married = df['marital-status'].str.startswith("Married")

married_men = df[(df['sex'] == 'Male') & mask_married]
non_married_men = df[(df['sex'] == 'Male') & ~mask_married]

high_income_married = married_men[married_men['salary'] == '>50K'].shape[0]
low_income_married = married_men[married_men['salary'] == '<=50K'].shape[0]

high_income_non_married = non_married_men[non_married_men['salary'] == '>50K'].shape[0]
low_income_non_married = non_married_men[non_married_men['salary'] == '<=50K'].shape[0]

print("Married men earning >50K: ", high_income_married)
print("Married men earning <=50K: ", low_income_married)
print("Non-married men earning >50K: ", high_income_non_married)
print("Non-married men earning <=50K: ", low_income_non_married)

Married men earning >50K:  5723
Married men earning <=50K:  7052
Non-married men earning >50K:  673
Non-married men earning <=50K:  6932


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [26]:
max_hours_per_week = df['hours-per-week'].max()
print("Max hours per week:", max_hours_per_week)

num_people_same_hours = (df['hours-per-week'] == max_hours_per_week).sum()
print("Number of people working the same max hours per week:", num_people_same_hours)

Max hours per week: 99
Number of people working the same max hours per week: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [29]:
df_dummies = pd.get_dummies(df)
correlation = df_dummies.corr()
print(correlation)

                              Unnamed: 0       age  hours-per-week  salary K$  \
Unnamed: 0                      1.000000 -0.001126       -0.001890   0.000129   
age                            -0.001126  1.000000        0.101599   0.208203   
hours-per-week                 -0.001890  0.101599        1.000000   0.196378   
salary K$                       0.000129  0.208203        0.196378   1.000000   
salary_check                         NaN       NaN             NaN        NaN   
...                                  ...       ...             ...        ...   
native-country_United-States    0.001993  0.016259        0.010673   0.034000   
native-country_Vietnam         -0.000928 -0.017775       -0.010381  -0.014737   
native-country_Yugoslavia       0.004350  0.000657        0.006983   0.008560   
salary_<=50K                   -0.006244 -0.241998       -0.229480  -0.853894   
salary_>50K                     0.006244  0.241998        0.229480   0.853894   

                           