In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max.columns', 100)
# to draw pictures in jupyter notebook
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
# we don't like warnings
# you can comment the following 2 lines if you'd like to
import warnings
warnings.filterwarnings('ignore')

In [3]:
%config IPCompleter.greedy=True

In [4]:
data = pd.read_csv('adult.data.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data['sex'].value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

In [22]:
#средний возраст женщин
data.loc[data['sex'] == 'Female', 'age'].mean().round()

37.0

In [23]:
#percentage of German citizens
(data['native-country'] == 'Germany').sum() / data.shape[0]

0.004207487485028101

In [8]:
age1 = data.loc[data['salary'] == '<=50K','age'].mean().round()
dev1 = data.loc[data['salary'] == '>50K','age'].std().round()
age2 = data.loc[data['salary'] == '>50K','age'].mean().round()
dev2 = data.loc[data['salary'] == '>50K','age'].std().round()

print("mean and standard deviation of age for those who earn more than 50K per year {}+-{}, less than 50K per year: {}+-{}".format(
        age1,dev1, age2, dev2))

mean and standard deviation of age for those who earn more than 50K per year 37.0+-11.0, less than 50K per year: 44.0+-11.0


In [9]:
data.loc[data['salary'] == '>50K', 'education'].unique()

array(['HS-grad', 'Masters', 'Bachelors', 'Some-college', 'Assoc-voc',
       'Doctorate', 'Prof-school', 'Assoc-acdm', '7th-8th', '12th',
       '10th', '11th', '9th', '5th-6th', '1st-4th'], dtype=object)

In [24]:
#age statistics for each race
data.groupby('race')['age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amer-Indian-Eskimo,311.0,37.173633,12.44713,17.0,28.0,35.0,45.5,82.0
Asian-Pac-Islander,1039.0,37.746872,12.825133,17.0,28.0,36.0,45.0,90.0
Black,3124.0,37.767926,12.75929,17.0,28.0,36.0,46.0,90.0
Other,271.0,33.457565,11.538865,17.0,25.0,31.0,41.0,77.0
White,27816.0,38.769881,13.782306,17.0,28.0,37.0,48.0,90.0


In [25]:
#and each gender
data.groupby('sex')['age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,10771.0,36.85823,14.013697,17.0,25.0,35.0,46.0,90.0
Male,21790.0,39.433547,13.37063,17.0,29.0,38.0,48.0,90.0


In [12]:
#женатые богатые
data[(data['sex'] == 'Male')&(data['salary'] == '>50K')&(data['relationship'] == 'Husband')].shape[0]

5918

In [13]:
#неженатые богатые
data[(data['sex'] == 'Male')&(data['salary'] == '>50K')&(data['relationship'] != 'Husband')].shape[0]

744

In [14]:
#maximum number of hours a person works per week
data['hours-per-week'].max()

99

In [15]:
#сколько чел работают по 99 ч/неделю
data[data['hours-per-week']==99].shape[0]

85

In [16]:
#какой процент из них зарабатывает >50K 
data[(data['hours-per-week']==99)&(data['salary']== '>50K')].shape[0] / data[data['hours-per-week']==99].shape[0]

0.29411764705882354

In [17]:
data.loc[data['salary'] == ">50K", 'hours-per-week'].mean()

45.473026399693914

In [19]:
for (country, salary), sub_df in data.groupby(['native-country', 'salary']):
    print(country, salary, round(sub_df['hours-per-week'].mean(), 2))

? <=50K 40.16
? >50K 45.55
Cambodia <=50K 41.42
Cambodia >50K 40.0
Canada <=50K 37.91
Canada >50K 45.64
China <=50K 37.38
China >50K 38.9
Columbia <=50K 38.68
Columbia >50K 50.0
Cuba <=50K 37.99
Cuba >50K 42.44
Dominican-Republic <=50K 42.34
Dominican-Republic >50K 47.0
Ecuador <=50K 38.04
Ecuador >50K 48.75
El-Salvador <=50K 36.03
El-Salvador >50K 45.0
England <=50K 40.48
England >50K 44.53
France <=50K 41.06
France >50K 50.75
Germany <=50K 39.14
Germany >50K 44.98
Greece <=50K 41.81
Greece >50K 50.62
Guatemala <=50K 39.36
Guatemala >50K 36.67
Haiti <=50K 36.33
Haiti >50K 42.75
Holand-Netherlands <=50K 40.0
Honduras <=50K 34.33
Honduras >50K 60.0
Hong <=50K 39.14
Hong >50K 45.0
Hungary <=50K 31.3
Hungary >50K 50.0
India <=50K 38.23
India >50K 46.48
Iran <=50K 41.44
Iran >50K 47.5
Ireland <=50K 40.95
Ireland >50K 48.0
Italy <=50K 39.62
Italy >50K 45.4
Jamaica <=50K 38.24
Jamaica >50K 41.1
Japan <=50K 41.0
Japan >50K 47.96
Laos <=50K 40.38
Laos >50K 40.0
Mexico <=50K 40.0
Mexico >50K 46