### DEMOGRAPHIC ANALYZER

In [1]:
import pandas as pd
import math

In [2]:
df = pd.read_csv('adult.data.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Exploration and Cleaning

In [4]:
df.shape

(32561, 15)

In [5]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [6]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

### How many people of each race are represented in this dataset?

In [20]:
df_race = df.groupby('race').size().sort_values(ascending=False)
print('How many people of each race are represented in this dataset?')
print(df_race)

How many people of each race are represented in this dataset?
race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
dtype: int64


### What is the average age of men?

In [9]:
df_men_age_avg = df[df['sex'] == 'Male']['age']
men_age_avg = df_men_age_avg.mean()
print('What is the average age of men?')
print(round(men_age_avg, 1))

What is the average age of men?
39.4


### What is the percentage of people who have a Bachelor's degree?

In [10]:
df_bachelors_degree = df[df['education'] == 'Bachelors']
bachelors_degree_percentage = (df_bachelors_degree.size / df.size) * 100
print("What is the percentage of people who have a Bachelor's degree?")
print('{:.2f}%'.format(bachelors_degree_percentage.item()))

What is the percentage of people who have a Bachelor's degree?
16.45%


###  What percentage of people with advanced education (Bachelors, Masters, or Doctorate) ?

In [11]:
df_advanced_education = df[(df['education'] == 'Bachelors') | (df['education'] == 'Masters') | (df['education'] == 'Doctorate')]
percentage_advanced_education = (df_advanced_education.size / df.size) * 100
print('What percentage of people without advanced education ?')
print('{:.2f}%'.format(percentage_advanced_education.item()))

What percentage of people without advanced education ?
23.01%


### What percentage of people without advanced education ?

In [12]:
df_non_advanced_education = df[~((df['education'] == 'Bachelors') | (df['education'] == 'Masters') | (df['education'] == 'Doctorate'))]
percentage_non_advanced_education = (df_non_advanced_education.size / df.size) * 100
print('What percentage of people without advanced education ?')
print('{:.2f}%'.format(percentage_non_advanced_education.item()))

What percentage of people without advanced education ?
76.99%


### What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [13]:
df_advanced_education = df[(df['education'] == 'Bachelors') | (df['education'] == 'Masters') | (df['education'] == 'Doctorate')]
df_more_than_50 = df_advanced_education[df_advanced_education['salary'] == '>50K']
more_than_50_percentage = (df_more_than_50.size / df_advanced_education.size) * 100
print('What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?')
print('{:.2f}%'.format(more_than_50_percentage.item()))

What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
46.54%


### What percentage of people without advanced education make more than 50K?

In [14]:
df_not_advanced_education = df[~((df['education'] == 'Bachelors') | (df['education'] == 'Masters') | (df['education'] == 'Doctorate'))]
df_more_than_50 = df_not_advanced_education[df_not_advanced_education['salary'] == '>50K']
more_than_50_percentage = (df_more_than_50.size / df_not_advanced_education.size) * 100
print('What percentage of people without advanced education make more than 50K?')
print('{:.2f}%'.format(more_than_50_percentage.item()))

What percentage of people without advanced education make more than 50K?
17.37%


### What is the minimum number of hours a person works per week?

In [15]:
min_hours_per_week = df['hours-per-week'].min()
print('What is the minimum number of hours a person works per week?')
print(min_hours_per_week)

What is the minimum number of hours a person works per week?
1


### What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [16]:
min_hours_per_week = df['hours-per-week'].min()
df_min_hours_per_week = df[df['hours-per-week'] == min_hours_per_week]
df_salary_more_than_50 = df_min_hours_per_week[df_min_hours_per_week['salary'] == '>50K']
percentage_min_hours_per_week = (df_salary_more_than_50.size / df_min_hours_per_week.size) * 100
print('What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?')
print('{:.2f}%'.format(percentage_min_hours_per_week))

What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
10.00%


### What country has the highest percentage of people that earn >50K and what is that percentage?

In [24]:
df_country_high_salary = df.groupby(['native-country', 'salary'])['salary'].agg({'count'}).unstack().reset_index()
df_country_high_salary['total'] = df_country_high_salary['count']['<=50K'] + df_country_high_salary['count']['>50K']
df_country_high_salary['percentage'] = round((df_country_high_salary['count']['>50K'] / df_country_high_salary['total']) * 100, 2)
df_country_high_salary = df_country_high_salary.sort_values('percentage', ascending=False).reset_index()
country_high_salary = df_country_high_salary['native-country'][0]
percentage_country_high_salary = round(df_country_high_salary['percentage'][0], 1)
print('What country has the highest percentage of people that earn >50K and what is that percentage?')
print('{:s}, the percentage is {:.2f}%'.format(country_high_salary, percentage_country_high_salary))

What country has the highest percentage of people that earn >50K and what is that percentage?
Iran, the percentage is 41.90%


### Identify the most popular occupation for those who earn >50K in India.

In [18]:
df_india_country = df[df['native-country'] == 'India']
df_more_than_50 = df_india_country[df_india_country['salary'] == '>50K']
df_popular_occupation = df_more_than_50.groupby('occupation').size().sort_values(ascending=False)
popular_occupation = df_popular_occupation[df_popular_occupation == df_popular_occupation[0]].index[0]
print('Identify the most popular occupation for those who earn >50K in India?')
print(popular_occupation)

Identify the most popular occupation for those who earn >50K in India?
Prof-specialty
