# Demographic Data Analyzer

In [1]:
import pandas as pd

## Read data from file

In [2]:
# Load the demographic dataset; the path is relative to the working directory.
csv_path = 'adult.data.csv'
df = pd.read_csv(csv_path)
# Print column names, dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.

In [3]:
# Tally the rows per race. value_counts() returns a Series whose index holds
# the race labels, ordered from most to least frequent.
race_column = df['race']
race_count = race_column.value_counts()
print('RACE COUNTS', race_count)

RACE COUNTS White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64


## What is the average age of men?

In [4]:
# Number of rows for each value of the 'sex' column, most frequent first.
sex_column = df.loc[:, 'sex']
sex = sex_column.value_counts()
print(sex)

Male      21790
Female    10771
Name: sex, dtype: int64


In [5]:
# Ages of all rows where sex is 'Male'. The selection is made once and reused
# for the mean instead of filtering the frame a second time.
men = df.loc[df['sex'] == 'Male', 'age']
# Mean age of men, rounded to one decimal place.
average_age_men = round(men.mean(), 1)

In [6]:
# Report the mean age of men computed in the previous cell.
print('AVG AGE OF MEN', average_age_men)

AVG AGE OF MEN 39.4


## What is the percentage of people who have a Bachelor's degree?

In [7]:
# Flag the rows whose education level is exactly 'Bachelors', then express
# their count as a percentage of all rows, rounded to one decimal place.
has_bachelors = df['education'].eq('Bachelors')
total_bachelors = has_bachelors.sum()
n_rows = df.shape[0]
percentage_bachelors = round(total_bachelors * 100 / n_rows, 1)
print('% OF PEOPLE WHO HAVE BACHELORS', percentage_bachelors)

% OF PEOPLE WHO HAVE BACHELORS 16.4


## What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
## What percentage of people without advanced education make more than 50K?

In [8]:
# One boolean mask per advanced degree; each is True on rows whose
# 'education' value equals that degree label.
education_level = df['education']
bachelor = education_level.eq('Bachelors')
master = education_level.eq('Masters')
doctor = education_level.eq('Doctorate')

In [9]:
# A row belongs to the advanced-education group when any one of the three
# degree masks is True for it.
higher_education_group = doctor | master | bachelor
n_higher_education = higher_education_group.sum()
print('HIGHER EDUCATION GROUP', n_higher_education)

HIGHER EDUCATION GROUP 7491


In [10]:
# Boolean mask of everyone WITHOUT an advanced degree: the exact complement of
# higher_education_group. It is kept as a mask (not a pre-summed count) so it
# has the same shape and meaning as higher_education_group; calling .sum() on
# it yields the number of people in the group.
lower_education_group = ~higher_education_group
print('LOWER EDUCATION GROUP', lower_education_group.sum())

LOWER EDUCATION GROUP 25070


In [11]:
# Number of advanced-degree holders earning more than 50K. Summing the combined
# boolean mask counts matching rows directly; DataFrame.value_counts() would
# first group on every column and silently drop rows containing missing values.
hi_ed_rich = (higher_education_group & (df['salary'] == '>50K')).sum()
print(hi_ed_rich)

3486


In [12]:
# Size of the advanced-education group: summing the boolean mask counts its True entries.
hi_ed_total = higher_education_group.sum()
print(hi_ed_total)

7491


In [13]:
# Share of the advanced-education group earning more than 50K, as a
# percentage rounded to one decimal place.
hi_ed_rich_share = hi_ed_rich * 100 / hi_ed_total
higher_education = round(hi_ed_rich_share, 1)
print('HIGHER EDUCATION MAKING > $50K:', higher_education, '%')

HIGHER EDUCATION MAKING > $50K: 46.5 %


In [14]:
# Number of people without an advanced degree who earn more than 50K. Summing
# the combined boolean mask counts matching rows directly; DataFrame.value_counts()
# would first group on every column and silently drop rows containing missing values.
lo_ed_rich = (~higher_education_group & (df['salary'] == '>50K')).sum()
print(lo_ed_rich)

4355


In [15]:
# Size of the group without an advanced degree, counted straight from the
# complement of the advanced-education mask so the total does not depend on
# whether lower_education_group holds a mask or an already-summed count.
lo_ed_total = (~higher_education_group).sum()
print(lo_ed_total)

25070


In [16]:
# Share of the group without an advanced degree earning more than 50K, as a
# percentage rounded to one decimal place.
lo_ed_rich_share = lo_ed_rich * 100 / lo_ed_total
lower_education = round(lo_ed_rich_share, 1)
print('LOWER EDUCATION MAKING > $50K:', lower_education, '%')

LOWER EDUCATION MAKING > $50K: 17.4 %


## What is the minimum number of hours a person works per week (hours-per-week feature)?

In [17]:
# Smallest value found in the 'hours-per-week' column.
min_work_hours = df['hours-per-week'].min()
print('MINIMUM HOURS WORKED', min_work_hours)

MINIMUM HOURS WORKED 1


## What percentage of the people who work the minimum number of hours per week have a salary of >50K?

In [18]:
# Rows whose weekly hours equal the dataset minimum. The comparison uses
# min_work_hours rather than a literal 1 so the cell stays correct if the
# data (and therefore the minimum) changes.
works_min_hours = df['hours-per-week'] == min_work_hours
# Summing the boolean mask counts the matching rows directly.
num_min_workers = works_min_hours.sum()
print('# OF PEOPLE WHO WORK MINIMUM HOURS:', num_min_workers)

# OF PEOPLE WHO WORK MINIMUM HOURS: 20


In [19]:
# Rows whose weekly hours equal the dataset minimum; compared against
# min_work_hours rather than a literal 1 so the cell follows the data.
at_min_hours = df['hours-per-week'] == min_work_hours
# Of those, how many earn more than 50K (boolean-mask sum = row count).
min_hours_rich = (at_min_hours & (df['salary'] == '>50K')).sum()
# Percentage of minimum-hours workers earning >50K, rounded to one decimal place.
rich_percentage = round(min_hours_rich * 100 / num_min_workers, 1)
print('% OF PEOPLE WHO WORK THE MINIMUM NUMBER OF HOURS WHO MAKE >50K:', rich_percentage, '%')

% OF PEOPLE WHO WORK THE MINIMUM NUMBER OF HOURS WHO MAKE >50K: 10.0 %


## What country has the highest percentage of people that earn >50K?

In [20]:
# Per-country count of people earning more than 50K, and per-country total.
rich_country = df.loc[df['salary'] == '>50K', 'native-country'].value_counts()
country_pop = df['native-country'].value_counts()
# The division aligns both Series on the country label. A country with no
# >50K earners is absent from rich_country and comes out as NaN, which
# idxmax() skips by default.
rich_share_by_country = rich_country * 100 / country_pop
# Rounded copy kept for display / downstream use.
rich_percent_by_country = round(rich_share_by_country, 1)
# Choose the winner from the UNROUNDED shares: rounding to one decimal first
# can turn two close countries into a tie and let index order decide.
highest_earning_country = rich_share_by_country.idxmax()
print('HIGHEST EARNING COUNTRY', highest_earning_country)

HIGHEST EARNING COUNTRY Iran


In [21]:
# Rows belonging to the top-earning country found in the previous cell. The
# country is taken from highest_earning_country instead of a hard-coded
# 'Iran', so this cell stays consistent if the data (and the winner) changes.
# The iran_* names are kept unchanged so any later cell that reads them still works.
from_top_country = df['native-country'] == highest_earning_country
# Boolean-mask sums count matching rows directly.
iran_rich = (from_top_country & (df['salary'] == '>50K')).sum()
iran_pop = from_top_country.sum()
# Percentage of that country's rows earning >50K, rounded to one decimal place.
highest_earning_country_percentage = round(iran_rich * 100 / iran_pop, 1)
print("HIGHEST EARNING COUNTRY'S POPULATION % THAT EARN >50K", highest_earning_country_percentage)

HIGHEST EARNING COUNTRY'S POPULATION % THAT EARN >50K 41.9


## Identify the most popular occupation for those who earn >50K in India.

In [23]:
# Occupations of the rows that both earn more than 50K and list India as
# their native country; the most frequent one is the answer.
earns_over_50k = df['salary'] == '>50K'
from_india = df['native-country'] == 'India'
india_rich_occupations = df.loc[earns_over_50k & from_india, 'occupation']
top_IN_occupation = india_rich_occupations.value_counts().idxmax()
print(top_IN_occupation)

Prof-specialty
