In [1]:
import pandas as pd

 # Read data from file

In [2]:
# Load the census records from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='adult.data.csv')

## This displays the first 5 rows

In [3]:
# Preview the first five records to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## The dataset has 32561 rows and 15 columns

In [4]:
# (row count, column count) of the freshly loaded frame.
df.shape

(32561, 15)

## The dataset has no NaN values in any column (note: some fields use '?' as a placeholder for unknown values, which `isna()` does not count)

In [5]:
# Per-column count of null (NaN/None) entries.
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

## The dataset has 24 duplicated rows 

In [6]:
# Shape of the subset of rows that repeat an earlier row in every column.
df.loc[df.duplicated()].shape

(24, 15)

## This drops all duplicate rows

In [7]:
# Keep only the first occurrence of each fully identical row.
# Rebinding `df` (instead of inplace=True) has the same effect on later cells
# while keeping the operation an ordinary expression that can be chained.
# NOTE(review): the frame has no unique person identifier column, so identical
# rows may be distinct people; every statistic below is computed after this drop.
df = df.drop_duplicates()

## This verifies if the drop was successful

In [8]:
# Re-run the duplicate check; zero rows are expected after the drop.
df.loc[df.duplicated()].shape

(0, 15)

## This displays the distinct races in the dataframe

In [9]:
# Distinct values of the 'race' column, in order of first appearance.
df['race'].unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)

 # How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.

## Option 1

In [10]:
# Number of rows per race as a Series indexed by race name, largest first.
df['race'].value_counts()

White                 27795
Black                  3122
Asian-Pac-Islander     1038
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

## Option 2

In [11]:
# Same tally via groupby: group sizes per race, ordered from smallest to largest.
df.groupby(by='race').size().sort_values(ascending=True)

race
Other                   271
Amer-Indian-Eskimo      311
Asian-Pac-Islander     1038
Black                  3122
White                 27795
dtype: int64

# What is the average age of men?

## This gets the distinct sex options in the dataset

In [12]:
# Distinct labels used in the 'sex' column.
df['sex'].unique()

array(['Male', 'Female'], dtype=object)

## This creates a dataframe storing all the male data

In [13]:
# Rows whose 'sex' value is exactly 'Male'.
male_df = df.loc[df['sex'] == 'Male']

## This displays the first 5 rows of the dataframe with the male information

In [14]:
# Preview the first five male records.
male_df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K


## The average age of men is 39.4, i.e. approximately 39 years

In [15]:
# Arithmetic mean of the 'age' column over the male subset.
male_df['age'].mean()

39.43605051664753

# What is the percentage of people who have a Bachelor's degree?

## This shows the different types of degrees available in the dataset

In [16]:
# Distinct education levels present in the data.
df['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

## This gets the number of people who have a bachelors degree

In [17]:
# Number of rows whose education level is exactly 'Bachelors'.
bachelors_degree = len(df.loc[df['education'] == 'Bachelors'])
bachelors_degree

5353

## This gets the total population of people in the dataset

In [18]:
# Total number of rows (people) remaining in the frame.
total_people = len(df)
total_people

32537

## This shows that 16.5% of the people in the dataset have a bachelors degree

In [19]:
# Bachelors holders as a percentage of everyone in the frame.
(bachelors_degree / total_people) * 100

16.45203921689154

# What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?

## This gets the people with education being Bachelors or Masters or Doctorate

In [20]:
# Rows whose education level is one of the three "advanced" degrees.
higher_education_people = df.loc[
    df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
]

## This displays the first 5 rows of that information

In [21]:
# Preview the first five advanced-education records.
higher_education_people.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K


## This gets the total number of people with Bachelors, Masters and Doctorate degree

In [22]:
# Size of the advanced-education subset.
people_higher_education = len(higher_education_people)
people_higher_education

7488

## This shows the distinct salaries available

In [23]:
# Distinct salary brackets that occur within the advanced-education subset.
higher_education_people['salary'].unique()

array(['<=50K', '>50K'], dtype=object)

## This gets the total number of people with Bachelors, Masters and Doctorate degree with their salary greater than 50K

In [24]:
# Number of advanced-education rows whose salary bracket is '>50K'.
people_higher_education_salary_greater = len(
    higher_education_people.loc[higher_education_people['salary'] == '>50K']
)
people_higher_education_salary_greater

3486

## This shows that 46.6% of the people with advanced education have their salaries greater than 50K

In [25]:
# '>50K' earners as a percentage of the advanced-education subset.
(people_higher_education_salary_greater / people_higher_education) * 100

46.55448717948718

# What percentage of people without advanced education make more than 50K?

## This gets the people with lower education

In [26]:
# Complement of the advanced-education subset: every row whose education
# level is NOT Bachelors, Masters or Doctorate.
lower_education_people = df.loc[
    ~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
]

## This displays the first 5 rows

In [27]:
# Preview the first five records without an advanced degree.
lower_education_people.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K


## This gets the total number of people with lower education

In [28]:
# Size of the subset without an advanced degree.
people_lower_education = len(lower_education_people)
people_lower_education

25049

## This gets the number of people who have lower education and make more than 50K

In [29]:
# Number of rows without an advanced degree whose salary bracket is '>50K'.
people_lower_education_salary_greater = len(
    lower_education_people.loc[lower_education_people['salary'] == '>50K']
)
people_lower_education_salary_greater

4353

## This shows that 17.4% of the people with lower education make more than 50K

In [30]:
# '>50K' earners as a percentage of the subset without an advanced degree.
(people_lower_education_salary_greater / people_lower_education) * 100

17.37793923909138

 # What is the minimum number of hours a person works per week (hours-per-week feature)?

## This shows that the minimum number of hours a person works per week is 1

In [31]:
# Smallest value found in the 'hours-per-week' column.
df.loc[:, 'hours-per-week'].min()

1

## This gets the information of all those who work for one hour a week

In [32]:
# Everyone who works the minimum number of hours per week. The minimum is
# computed from the column instead of being hard-coded as 1, so this cell does
# not depend on a value copied by hand from the previous cell's output.
hour_per_week = df[df['hours-per-week'] == df['hours-per-week'].min()]

In [33]:
# Preview the first five minimum-hours records.
hour_per_week.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
1036,66,Self-emp-inc,150726,9th,5,Married-civ-spouse,Exec-managerial,Husband,White,Male,1409,0,1,?,<=50K
1262,69,?,195779,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5590,78,?,363134,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5632,45,?,189564,Masters,14,Married-civ-spouse,?,Wife,White,Female,0,0,1,United-States,<=50K


 # What percentage of the people who work the minimum number of hours per week have a salary of >50K?

## This shows that 20 people work for 1 hour a week

In [34]:
# Number of people in the minimum-hours subset.
min_hour_workers = len(hour_per_week)
min_hour_workers

20

## This shows that 90% of the people who work for an hour a week are people of lower education level

In [35]:
# Education-level breakdown of the minimum-hours subset, largest group first.
hour_per_week['education'].value_counts()

HS-grad         7
Assoc-voc       3
Some-college    3
10th            3
Doctorate       1
9th             1
Masters         1
Assoc-acdm      1
Name: education, dtype: int64

## This shows that out of the 20 people only 2 have a salary greater than 50K

In [36]:
# Number of minimum-hours workers whose salary bracket is '>50K'.
min_hour_rich = len(hour_per_week.loc[hour_per_week['salary'] == '>50K'])
min_hour_rich

2

## This shows that 10% of the people who work for an hour a week have a salary greater than 50K

In [37]:
# '>50K' earners as a percentage of the minimum-hours subset.
(min_hour_rich / min_hour_workers) * 100

10.0

# What country has the highest percentage of people that earn >50K?

## This gets all the information about people who earn greater than 50K

In [38]:
# Rows whose salary bracket is exactly '>50K'.
salary_greater_50 = df.loc[df['salary'] == '>50K']

## This shows the first 5 rows

In [39]:
# Preview the first five '>50K' records.
salary_greater_50.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K


## This shows that there are a total of 7839 people who earn greater than 50K

In [40]:
# Total number of people in the '>50K' subset.
greater_50_total = len(salary_greater_50)
greater_50_total

7839

## This gets the number of people in each country that earns greater than 50K

In [41]:
# Count '>50K' earners per country as a two-column frame
# ('native-country', 'count'), largest count first.
# The index and the values are named explicitly instead of renaming the
# 'index' column that reset_index() used to produce: on pandas >= 2.0
# value_counts() already names its result 'count' and its index
# 'native-country', so the previous rename mapping ended up with two columns
# both called 'count'. This form yields the same columns on 1.x and 2.x.
country_salary = (
    salary_greater_50['native-country']
    .value_counts()
    .rename_axis('native-country')
    .reset_index(name='count')
)

In [42]:
# Display the per-country tally of '>50K' earners built in the previous cell.
country_salary

Unnamed: 0,native-country,count
0,United-States,7169
1,?,146
2,Philippines,61
3,Germany,44
4,India,40
5,Canada,39
6,Mexico,33
7,England,30
8,Italy,25
9,Cuba,25


## This gets the country with the highest number of people earning greater than 50K

In [43]:
# Keep the row(s) of the tally whose count equals the largest count.
highest_salary_country_info = country_salary.loc[
    country_salary['count'].eq(country_salary['count'].max())
]

In [44]:
# Display the tally row(s) that hold the largest count.
highest_salary_country_info

Unnamed: 0,native-country,count
0,United-States,7169


## This shows that the country with the most people earning greater than 50K is United-States

In [45]:
# Country name from the first row of the top-count selection.
highest_country_name = highest_salary_country_info['native-country'].iloc[0]
highest_country_name

'United-States'

## This shows that 7169 of the 7839 people are from United-States

In [46]:
# '>50K' head-count from the first row of the top-count selection.
highest_country_number = highest_salary_country_info['count'].iloc[0]
highest_country_number

7169

## This shows that 91.5% of the people earning more than 50K are from United-States

In [47]:
# Share of all '>50K' earners who come from the top-count country, in percent.
(highest_country_number / greater_50_total) * 100

91.45299145299145

## This gets the total number of people in United-States and they are 29153 in total

In [48]:
# Number of people (any salary bracket) whose native country is the top-count country.
country_pop = len(df.loc[df['native-country'] == highest_country_name])
country_pop

29153

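## This shows that 24.6% of the people from United-States earn more than 50K. Note that United-States was selected because it has the largest number of >50K earners; answering the question above requires comparing this percentage across all countries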

In [49]:
# Percentage of people from highest_country_name who earn '>50K'.
# NOTE(review): highest_country_name was chosen by largest head-count of
# '>50K' earners, not by comparing this percentage across countries, so this
# value alone does not establish which country has the highest percentage.
(highest_country_number / country_pop) * 100

24.59095118855692

# Identify the most popular occupation for those who earn >50K in India.

## This gets the information about people of India that earn more than 50K

In [50]:
# '>50K' earners whose native country is recorded as 'India'.
india = salary_greater_50.loc[salary_greater_50['native-country'] == 'India']

## This displays the first 5 rows

In [51]:
# Preview the first five of those records.
india.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
968,48,Private,164966,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
1327,52,Private,168381,HS-grad,9,Widowed,Other-service,Unmarried,Asian-Pac-Islander,Female,0,0,40,India,>50K
7258,42,State-gov,102343,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,72,India,>50K
7285,54,State-gov,93449,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K


## This groups the data by the occupations

In [52]:
# Count the India '>50K' subset per occupation as a two-column frame
# ('occupation', 'counts'), largest count first.
# The index and the values are named explicitly instead of renaming the
# 'index' column that reset_index() used to produce: on pandas >= 2.0
# value_counts() names its index 'occupation' and its values 'count', so the
# previous rename mapping turned the occupation labels into the 'counts'
# column and left no 'occupation' column. This form yields the same columns
# on 1.x and 2.x.
india_occupation = (
    india['occupation']
    .value_counts()
    .rename_axis('occupation')
    .reset_index(name='counts')
)

In [53]:
# Display the per-occupation tally built in the previous cell.
india_occupation

Unnamed: 0,occupation,counts
0,Prof-specialty,25
1,Exec-managerial,8
2,Other-service,2
3,Tech-support,2
4,Transport-moving,1
5,Sales,1
6,Adm-clerical,1


## This shows that the profession in India that produces the most people with salary greater than 50K is Prof-specialty

In [54]:
# Occupation label on the first row that holds the largest count.
top_indian_profession = india_occupation.loc[
    india_occupation['counts'].idxmax(), 'occupation'
]
top_indian_profession

'Prof-specialty'