In [274]:
import numpy as np
import pandas as pd

pd.set_option("display.max.columns", 100)

import matplotlib.pyplot as plt
import seaborn as sns

In [275]:
DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"
data = pd.read_csv(DATA_URL + "adult.data.csv")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### 1. How many men and women (sex feature) are represented in this dataset?

In [276]:
print( "There are {} mens".format(data[data["sex"] == "Male"]["sex"].count()) )
print( "There are {} womens".format(data[data["sex"] == "Female"]["sex"].count()) )

There are 21790 mens
There are 10771 womens


### 2. What is the average age (age feature) of women?

In [277]:
age_avg_woman = data[data["sex"] == "Female"]["age"].mean()
print(age_avg_woman)

36.85823043357163


### 3. What is the percentage of German citizens (native-country feature)?

In [278]:
data.describe(include="object")
data.pivot_table(["fnlwgt"], ["native-country"], aggfunc="count")

total = data["native-country"].count()
german_citizens = data[data["native-country"] == "Germany"]["native-country"].count()

print("Percentage of German Citizens {}%".format(100*german_citizens/total))

Percentage of German Citizens 0.42074874850281013%


### 4-5. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?

In [279]:
mean_more_50 = data[ data["salary"] == ">50K" ]["age"].mean()
std_more_50 = data[ data["salary"] == ">50k" ]["age"].std()
mean_less_50 = data[ data["salary"] != ">50K" ]["age"].mean()
std_less_50 = data[ data["salary"] != ">50K" ]["age"].std()

print("Mean and std from those who earn more than 50k per year {}, {}".format(mean_more_50, std_more_50))
print("Mean and std from those who earn less than 50k per year {}, {}".format(mean_less_50, std_less_50))

Mean and std from those who earn more than 50k per year 44.24984058155847, nan
Mean and std from those who earn less than 50k per year 36.78373786407767, 14.020088490824813


### 6. Is it true that people who earn more than 50K have at least high school education? (education – Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate feature)

In [280]:
grades = [ "Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate" ]

filtered_data = data[ data["salary"] == ">50K" ]["education"].apply(lambda value: "Si" if value in grades else "No" )
filtered_data.value_counts()

education
Si    4535
No    3306
Name: count, dtype: int64

### 7. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.

In [281]:
print( data["race"].unique() )
print( data["sex"].unique() )

print( data[ data["race"] == "Amer-Indian-Eskimo" ]["age"].max() )

data.groupby("race").describe()
data.groupby("sex").describe()

['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
['Male' 'Female']
82


Unnamed: 0_level_0,age,age,age,age,age,age,age,age,fnlwgt,fnlwgt,fnlwgt,fnlwgt,fnlwgt,fnlwgt,fnlwgt,fnlwgt,education-num,education-num,education-num,education-num,education-num,education-num,education-num,education-num,capital-gain,capital-gain,capital-gain,capital-gain,capital-gain,capital-gain,capital-gain,capital-gain,capital-loss,capital-loss,capital-loss,capital-loss,capital-loss,capital-loss,capital-loss,capital-loss,hours-per-week,hours-per-week,hours-per-week,hours-per-week,hours-per-week,hours-per-week,hours-per-week,hours-per-week
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2
Female,10771.0,36.85823,14.013697,17.0,25.0,35.0,46.0,90.0,10771.0,185746.311206,102986.078821,12285.0,117363.0,176077.0,228331.5,1484705.0,10771.0,10.035744,2.379954,1.0,9.0,10.0,12.0,16.0,10771.0,568.410547,4924.262944,0.0,0.0,0.0,0.0,99999.0,10771.0,61.187633,340.907518,0.0,0.0,0.0,0.0,4356.0,10771.0,36.410361,11.8113,1.0,30.0,40.0,40.0,99.0
Male,21790.0,39.433547,13.37063,17.0,29.0,38.0,48.0,90.0,21790.0,191771.449013,106740.702676,13769.0,118200.75,180140.0,241146.25,1455435.0,21790.0,10.102891,2.66263,1.0,9.0,10.0,13.0,16.0,21790.0,1329.370078,8326.312095,0.0,0.0,0.0,0.0,99999.0,21790.0,100.213309,429.763097,0.0,0.0,0.0,0.0,3770.0,21790.0,42.428086,12.119755,1.0,40.0,40.0,49.0,99.0


### 8. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.

In [282]:
lot_50k = data[data["salary"] == ">50K"]
data["marital-status"] = lot_50k["marital-status"].apply(lambda status : "Married" if status[0:7] == "Married" else "Single" )
data["marital-status"].value_counts(normalize=True)

marital-status
Married    0.859074
Single     0.140926
Name: proportion, dtype: float64

In [283]:
lot_50k = data[ (data["salary"] == ">50K") & (data["sex"] == "Male") ]
data["marital-status"] = lot_50k["marital-status"].apply(lambda status : "Married" if status[0:7] == "Married" else "Single" )
data["marital-status"].value_counts(normalize=True)

marital-status
Married    0.895377
Single     0.104623
Name: proportion, dtype: float64

### 9. What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?

In [284]:
max_hours = data["hours-per-week"].max()
number_people = data[ data["hours-per-week"] == max_hours ]
number = number_people["hours-per-week"].count()

number_people.groupby("salary")["salary"].describe()

Unnamed: 0_level_0,count,unique,top,freq
salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<=50K,60,1,<=50K,60
>50K,25,1,>50K,25


### 10. Count the average time of work (hours-per-week) for those who earn a little and a lot (salary) for each country (native-country). What will these be for Japan?

In [285]:
data_little = data[ data["salary"] == "<=50K" ].groupby("native-country")["hours-per-week"].mean()

data_more = data[ data["salary"] == ">50K" ].groupby("native-country")["hours-per-week"].mean()

print("A little salary: ")
print(data_little)

print("\nA lot salary: ")
print(data_more)

print("\nA little salary and a lot salary for japan: ")
print(data_little["Japan"], data_more["Japan"])

A little salary: 
native-country
?                             40.164760
Cambodia                      41.416667
Canada                        37.914634
China                         37.381818
Columbia                      38.684211
Cuba                          37.985714
Dominican-Republic            42.338235
Ecuador                       38.041667
El-Salvador                   36.030928
England                       40.483333
France                        41.058824
Germany                       39.139785
Greece                        41.809524
Guatemala                     39.360656
Haiti                         36.325000
Holand-Netherlands            40.000000
Honduras                      34.333333
Hong                          39.142857
Hungary                       31.300000
India                         38.233333
Iran                          41.440000
Ireland                       40.947368
Italy                         39.625000
Jamaica                       38.239437
Japan  