In [44]:
import pandas as pd
import numpy as np
import uci_dataset as dataset

In [3]:

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, index_col=False, sep=',', names=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary'])

df.sample(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
10189,39,Self-emp-not-inc,52870,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
10873,42,Local-gov,137232,HS-grad,9,Divorced,Protective-serv,Unmarried,White,Female,0,0,50,United-States,<=50K
24774,51,Private,21698,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K


In [23]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
salary            object
dtype: object

In [33]:
cols = df.select_dtypes(object).columns
df[cols] = df[cols].apply(lambda x: x.str.strip())

In [110]:
df.shape

(32561, 15)

### How many people of each race are represented in this dataset? 

In [140]:
print('People of each race')
print(df.race.value_counts().to_frame())

People of each race
                     race
White               27816
Black                3124
Asian-Pac-Islander   1039
Amer-Indian-Eskimo    311
Other                 271


### What is the average age of men?

In [174]:
# q1 = np.average(df[(df.sex=='Male')].age)
# print(f'Average age of men {q1}')

df[df.sex == 'Male'].age.agg('mean')

39.43354749885268

### What is the percentage of people who have a Bachelor's degree?

In [142]:
q2 = df.education.value_counts(normalize=True)[2:3][0]
print(f'Percentage of peaple who have a bachelors degree : {q2}')

Percentage of peaple who have a bachelors degree : 0.16446055096587942


### What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [143]:
q3 = (df[((df.education == 'Bachelors') | (df.education == 'Masters') | (df.education == 'Doctorate')) & (df.salary == '>50K')].shape[0]/df.shape[0])*100
print(f'percentage of people with advanced education and make more than 50K : {q3}')

percentage of people with advanced education and make more than 50K : 10.706059396210192


### What percentage of people without advanced education make more than 50K?

In [144]:
q4 = df[~((df.education == 'Bachelors') | (df.education == 'Masters') | (df.education == 'Doctorate')) & (df.salary == '>50K')].shape[0]/df.shape[0]*100
print(f'percentage of people without advanced education make more than 50K : {q4}')

percentage of people without advanced education make more than 50K : 13.374896348392248


### What is the minimum number of hours a person works per week?

In [145]:
q5 = df['hours-per-week'].agg('min')
print(f'minimum number of hours a person works per week : {q5}')

minimum number of hours a person works per week : 1


### What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [146]:
q6 = df[(df['hours-per-week'] == df['hours-per-week'].agg('min')) & (df.salary == '>50K')].shape[0]/df.shape[0]*100
print(f'percentage of the people who work the minimum number of hours per week have a salary of more than 50K : {q6}')

percentage of the people who work the minimum number of hours per week have a salary of more than 50K : 0.006142317496391388


### What country has the highest percentage of people that earn >50K and what is that percentage?

In [165]:
q7 = df[['native-country','salary']][df.salary == '>50K'].value_counts(normalize=True)[0:1].to_frame().rename(columns={0:'%'})
print(q7)

                              %
native-country salary          
United-States  >50K    0.914552


### Identify the most popular occupation for those who earn >50K in India.

In [170]:
q8 = df[(df['native-country'] == 'India') & (df['salary'] == '>50K')].occupation.value_counts()
print('the most popular occupation for those who earn >50K in India')
print(q8)

the most popular occupation for those who earn >50K in India
Prof-specialty      25
Exec-managerial      8
Other-service        2
Tech-support         2
Transport-moving     1
Sales                1
Adm-clerical         1
Name: occupation, dtype: int64
