# Demographic Data Analyser

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset into a DataFrame. The path is relative, so it is resolved
# against the current working directory of the kernel.
csv_path = 'Downloads/adult.data.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first three rows to check the columns that were loaded.
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


## Count of races in the dataset

In [4]:
# Number of rows for each distinct value in the 'race' column.
race_count = df["race"].value_counts()

In [5]:
# Show the per-race row counts as plain text.
print(race_count)

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64


## Average age of men

In [61]:
# Mean of 'age' over the rows whose 'sex' is 'Male', rounded to one decimal.
is_male = df['sex'] == 'Male'
age_male = df[is_male]
mean_age = age_male['age'].mean()
average_age_men = round(mean_age, 1)
print('The average age of men is {}'.format(average_age_men))

The average age of men is 39.4


## Percentage of people who have a Bachelor's degree

In [7]:
# Rows whose 'education' is exactly 'Bachelors' (this subset is reused by a
# later cell), and their share of all rows in the dataset.
bachelors_degree = df[df['education'].eq('Bachelors')]
percentage_bachelors = len(bachelors_degree) / len(df) * 100
print('The percentage of people who has a bachelor degree is {}%'.format(round(percentage_bachelors, 1)))

The percentage of people who has a bachelor degree is 16.4%


In [49]:
# Subsets of the dataset for the two remaining degree levels counted as
# "high education" below: 'Masters' and 'Doctorate'.
masters_degree = df[df['education'].eq('Masters')]
doctorate_degree = df[df['education'].eq('Doctorate')]

In [52]:
# Combined share of Bachelors, Masters and Doctorate rows among all rows.
degree_rows = len(bachelors_degree) + len(masters_degree) + len(doctorate_degree)
higher_education = degree_rows / len(df) * 100
print('The percentage of people with high education is {}%'.format(round(higher_education, 1)))

The percentage of people with high education is 23.0%


In [53]:
# Everyone not counted in `higher_education`: the complement to 100%.
lower_education = 100 - higher_education
lower_education_rounded = round(lower_education, 1)
print('The percentage of people without high education is {}%'.format(lower_education_rounded))

The percentage of people without high education is 77.0%


## People earning more than 50k and their degrees

In [8]:
# Rows whose 'salary' value is '>50K'; the cells below build on this subset.
earns_over_50k = df['salary'] == '>50K'
more50k = df[earns_over_50k]

### Percentage of Bachelors

In [9]:
# >50K earners whose education is 'Bachelors'. The percentage is taken over
# ALL rows of the dataset (df), i.e. the share of everyone who both holds a
# Bachelors and earns >50K.
# NOTE(review): the printed label reads like "share of Bachelors holders";
# confirm which denominator is intended.
df_bachelors_plus50 = more50k[more50k['education'].eq('Bachelors')]
perc_bach_more_50k = len(df_bachelors_plus50) / len(df) * 100
print('The percentage of bachelors earning more than 50k is {}%'.format(round(perc_bach_more_50k, 1)))

The percentage of bachelors earning more than 50k is 6.8%


### Percentage of Masters

In [10]:
# >50K earners whose education is 'Masters'. The percentage is taken over ALL
# rows of the dataset (df), i.e. the share of everyone who both holds a
# Masters and earns >50K.
# NOTE(review): the printed label reads like "share of Masters holders";
# confirm which denominator is intended.
df_masters_plus50 = more50k[more50k['education'].eq('Masters')]
perc_master_more_50k = len(df_masters_plus50) / len(df) * 100
print('The percentage of masters earning more than 50k is {}%'.format(round(perc_master_more_50k, 1)))

The percentage of masters earning more than 50k is 2.9%


### Percentage of Doctorates

In [11]:
# >50K earners whose education is 'Doctorate'. The percentage is taken over
# ALL rows of the dataset (df), i.e. the share of everyone who both holds a
# Doctorate and earns >50K.
# NOTE(review): the printed label reads like "share of Doctorate holders";
# confirm which denominator is intended.
df_doctorate_plus50 = more50k[more50k['education'].eq('Doctorate')]
perc_doctorate_more_50k = len(df_doctorate_plus50) / len(df) * 100
print('The percentage of doctorates earning more than 50k is {}%'.format(round(perc_doctorate_more_50k, 1)))

The percentage of doctorates earning more than 50k is 0.9%


## Percentage of people earning more than 50k without advanced education

In [46]:
# Among >50K earners only: the share whose education is none of Bachelors,
# Masters or Doctorate (total minus the three degree subsets).
rich_with_degree = len(df_bachelors_plus50) + len(df_masters_plus50) + len(df_doctorate_plus50)
rich_without_degree = len(more50k) - rich_with_degree
lower_education_rich = rich_without_degree / len(more50k) * 100
print('The percentage of people earning more than 50k and hasn´t advanced education is {}%'.format(round(lower_education_rich, 1)))

The percentage of people earning more than 50k and hasn´t advanced education is 55.5%


In [47]:
# Among >50K earners only: the share holding a Bachelors, Masters or
# Doctorate degree.
degree_earners = sum(
    len(subset)
    for subset in (df_bachelors_plus50, df_masters_plus50, df_doctorate_plus50)
)
higher_education_rich = degree_earners / len(more50k) * 100
print('The percentage of people earning more than 50k and has advanced education is {}%'.format(round(higher_education_rich, 1)))

The percentage of people earning more than 50k and has advanced education is 44.5%


## Minimum number of hours per week

In [58]:
# Smallest 'hours-per-week' value in the dataset, the rows that work exactly
# that many hours, and the share of those rows whose salary is '>50K'.
hours = df['hours-per-week']
min_work_hours = hours.min()
df_min_hours = df[hours.eq(min_work_hours)]
min_hours_plus50 = df_min_hours[df_min_hours['salary'].eq('>50K')]
num_min_workers = len(min_hours_plus50)
rich_percentage = num_min_workers / len(df_min_hours) * 100
message = 'The percentage of people who work the minimum number of hours per week and earn more than 50K is {}%'
print(message.format(round(rich_percentage, 1)))

The percentage of people who work the minimum number of hours per week and earn more than 50K is 10.0%


In [55]:
# Display the smallest value found in the 'hours-per-week' column.
df['hours-per-week'].min()

1

## Country with the highest percentage of people earning more than 50K

In [59]:
# Find the native country with the largest share of people earning >50K.
# Row counts per native country: all rows, and the >50K subset only.
columnA = df.groupby('native-country').size()
columnB = more50k.groupby('native-country').size()
# Put both counts side by side, aligned on the country label. A country with
# no >50K rows is absent from columnB and would come out as NaN, so it is
# recorded as a count of 0 instead.
data = pd.concat([columnA, columnB], axis=1)
data.columns = ['Total', '>50K']
data['>50K'] = data['>50K'].fillna(0)
# Pick the winner from the exact ratio; the rounded column is for display
# only, because rounding first can make two different countries tie.
exact_percentage = (data['>50K'] / data['Total']) * 100
data['percentage'] = exact_percentage.round(1)
# Keep the table ordered for inspection without mutating it in place.
data = data.sort_values(by='percentage', ascending=False)
highest_earning_country = exact_percentage.idxmax()
# Look the value up by label rather than by a positional (row 0, column 2) index.
highest_earning_country_percentage = data.loc[highest_earning_country, 'percentage']
print('The country that has the highest percentage of peaople earning more than 50K is {} with {}%'
      .format(highest_earning_country, highest_earning_country_percentage))

The country that has the highest percentage of peaople earning more than 50K is Iran with 41.9%


## Most popular occupation in India (+50K)

In [60]:
# Among >50K earners whose native country is 'India': count rows per
# occupation, order the counts from largest to smallest, and report the top one.
india_more50k = more50k[more50k['native-country'].eq('India')]
india_more50k_occupation = (
    india_more50k
    .groupby('occupation')
    .size()
    .sort_values(ascending=False)
)
top_IN_occupation = india_more50k_occupation.index[0]
top_IN_count = india_more50k_occupation.iloc[0]
print('The most popular occupation for those who earn more than 50K in India is {} with {} people'
      .format(top_IN_occupation, top_IN_count))

# Test comment

The most popular occupation for those who earn more than 50K in India is Prof-specialty with 25 people
