### Importing relevant libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

In [2]:
# setting few options 
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

### Loading the data

In [3]:
adult = pd.read_csv('adult.csv')
adult.head(3)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K


In [34]:
# let's fix the columns for the data
adult.columns = ['age', 'workclass', 'fnlwght', 'education', 'education-num', 'marital-status', \
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', \
        'hours-per-week', 'native-country', 'salary']
adult.head()

Unnamed: 0,age,workclass,fnlwght,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [5]:
adult.sex.unique(), adult.education.unique()

(array([' Male', ' Female'], dtype=object),
 array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
        ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
        ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
        ' Preschool', ' 12th'], dtype=object))

In [6]:
adult.info()

# there no null values in the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32560 non-null  int64 
 1   workclass       32560 non-null  object
 2   fnlwght         32560 non-null  int64 
 3   education       32560 non-null  object
 4   education-num   32560 non-null  int64 
 5   marital-status  32560 non-null  object
 6   occupation      32560 non-null  object
 7   relationship    32560 non-null  object
 8   race            32560 non-null  object
 9   sex             32560 non-null  object
 10  capital-gain    32560 non-null  int64 
 11  capital-loss    32560 non-null  int64 
 12  hours-per-week  32560 non-null  int64 
 13  native-country  32560 non-null  object
 14  salary          32560 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


#### Q1. How many people of each race are represented in this dataset?

In [7]:
adult['race'].value_counts()

 White                 27815
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64

#### Q2 What is the average age of men?

In [8]:
# adult['sex'].loc[adult['sex'] == 'Male']
np.average(adult['age'].loc[adult['sex'] == 'Male'])

# the result gives an empty string which is not normal cuz we have 'Male' and 'Female' values
# in the 'sex' column. Inspecting further shows that there are leading and/or trailing spaces
# in the columns. Hence we have to remove them


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


nan

In [9]:
adult['sex'] = adult['sex'].str.replace(" ", "")

In [10]:
# if we then rerun the previous code, it should work (I rounded the result to 2 dp)
avg_age_men = round(np.average(adult['age'].loc[adult['sex'] == 'Male']), 2)
avg_age_men

39.43

#### Q3. What is the percentage of people who have a Bachelor's degree?

In [11]:
# To answer this, I assume that all those with degress above bachelor's already have bachelors

bachelors_and_above = ['Bachelors', 'Masters', 'Doctorate']

# we need to strip the trailing spaces off the education column
adult['education'] = adult['education'].str.replace(" ", "")


In [12]:
graduates = adult.loc[adult['education'].isin(bachelors_and_above)]

In [13]:
# therefore the percentage of graduates to the total number of adults (rounded to 2d.p) is
"{:.2%}".format( graduates.shape[0] / adult.shape[0] )

'23.00%'

#### Q.4 What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [14]:
# we already have a df with those with advanced education (graduates df)
graduates.salary.unique()

array([' <=50K', ' >50K'], dtype=object)

In [15]:
# so let's get those in this list earning > 50K
graduates.salary = graduates.salary.str.replace(" ", "")
grads_above_50k = graduates.loc[graduates['salary'].isin(['>50K'])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [16]:
grads_above_50k.head(3)

Unnamed: 0,age,workclass,fnlwght,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
10,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K


In [17]:
# percentage of graduates earning above 50K equals:
"{:.2%}".format(grads_above_50k.shape[0] / adult.shape[0])

'10.71%'

#### Q.5 What percentage of people without advanced education make more than 50K?

In [18]:
# dataframe of people without higher education
non_graduates = adult.loc[~(adult['education'].isin(bachelors_and_above))]

# let's strip the extra spaces in the salary column
non_graduates['salary'] = non_graduates.salary.str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_graduates['salary'] = non_graduates.salary.str.strip()


In [19]:
# df of non_graduates above salary of 50K
non_graduates_above_50k = non_graduates[['education','salary']].loc[non_graduates['salary'].isin(['>50K'])]
non_graduates_above_50k.head(3)

Unnamed: 0,education,salary
6,HS-grad,>50K
9,Some-college,>50K
13,Assoc-voc,>50K


In [20]:
# calculating the perceptage 
(round((non_graduates_above_50k.shape[0] / adult.shape[0]) * 100, 2))

13.38

#### Q6. Min no. of hours per week by any individual

In [21]:
adult['hours-per-week'].min()

1

#### Q7. What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [22]:
workers_min_hours = adult.loc[adult['hours-per-week'] == adult['hours-per-week'].min()]

In [23]:
#workers_min_hours.salary = workers_min_hours.salary.str.replace()
workers_min_hours_above_50k = workers_min_hours.loc[adult['salary'].isin([' >50K'])]

round( (workers_min_hours_above_50k.shape[0] / adult.shape[0] ) * 100, 2)

# ans: 0.01%


0.01

#### Q8. What country has the highest percentage of people that earn >50K and what is that percentage?

In [24]:
# people earning over 50k
over_50K = adult.loc[adult['salary'].isin([' >50K'])]

In [25]:
# grouping the people that earn over 50K by native-countries
grpby_country = over_50K[['native-country', 'salary']].groupby(['native-country']).count()

# creating a new column in the grpby_country df
grpby_country['%age total'] = round( ( grpby_country['salary'] / grpby_country['salary'].sum() ) * 100, 2)

In [26]:
# sorting the results in descending order to see the country with the highest number
# of people, by percentage, earning above 50K

grpby_country['%age total'].sort_values(ascending=False)

native-country
 United-States      91.46
 ?                   1.86
 Philippines         0.78
 Germany             0.56
 India               0.51
                    ...  
 Peru                0.03
 Trinadad&Tobago     0.03
 Columbia            0.03
 Nicaragua           0.03
 Honduras            0.01
Name: %age total, Length: 40, dtype: float64

#### Q9. Identify the most popular occupation for those who earn >50K in India.

In [27]:
over_50K_india = over_50K.loc[over_50K['native-country'] == ' India']

In [28]:
over_50K_india.head(3)

Unnamed: 0,age,workclass,fnlwght,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
10,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
967,48,Private,164966,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
1326,52,Private,168381,HS-grad,9,Widowed,Other-service,Unmarried,Asian-Pac-Islander,Female,0,0,40,India,>50K


In [29]:
over_50K_india['occupation'].value_counts()

# most popular occupation for Indians who earn above 50K is Prof-specialty

 Prof-specialty      25
 Exec-managerial      8
 Other-service        2
 Tech-support         2
 Transport-moving     1
 Sales                1
 Adm-clerical         1
Name: occupation, dtype: int64

In [30]:
over_50K_india.head()

Unnamed: 0,age,workclass,fnlwght,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
10,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
967,48,Private,164966,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
1326,52,Private,168381,HS-grad,9,Widowed,Other-service,Unmarried,Asian-Pac-Islander,Female,0,0,40,India,>50K
7257,42,State-gov,102343,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,72,India,>50K
7284,54,State-gov,93449,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K


##### Q9. Alternative option

In [31]:
# Another option is to use the code below (howbeit more complicated)

indians_above_50k = adult.loc[( adult['native-country'] == ' India' ) & ( adult['salary'].isin([' >50K']) )]
indians_above_50k

Unnamed: 0,age,workclass,fnlwght,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
10,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
967,48,Private,164966,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
1326,52,Private,168381,HS-grad,9,Widowed,Other-service,Unmarried,Asian-Pac-Islander,Female,0,0,40,India,>50K
7257,42,State-gov,102343,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,72,India,>50K
7284,54,State-gov,93449,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30110,41,Federal-gov,219155,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,India,>50K
30151,48,Private,119471,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
30832,25,Private,110978,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,Asian-Pac-Islander,Female,0,0,37,India,>50K
31326,38,State-gov,125499,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,7688,0,60,India,>50K


In [36]:
# check
indians_above_50k == over_50K_india , \
print(indians_above_50k.shape, over_50K_india.shape)

(40, 15) (40, 15)


(        age  workclass  fnlwght  education  education-num  marital-status  \
 10     True       True     True       True           True            True   
 967    True       True     True       True           True            True   
 1326   True       True     True       True           True            True   
 7257   True       True     True       True           True            True   
 7284   True       True     True       True           True            True   
 ...     ...        ...      ...        ...            ...             ...   
 30110  True       True     True       True           True            True   
 30151  True       True     True       True           True            True   
 30832  True       True     True       True           True            True   
 31326  True       True     True       True           True            True   
 31356  True       True     True       True           True            True   
 
        occupation  relationship  race   sex  capital-gain  ca

In [33]:
indians_above_50k['occupation'].value_counts()

 Prof-specialty      25
 Exec-managerial      8
 Other-service        2
 Tech-support         2
 Transport-moving     1
 Sales                1
 Adm-clerical         1
Name: occupation, dtype: int64