# In [None]:
# Project 2: Analyzing Census Data using Pandas

#Step 1: Import the necessary libraries (Pandas library to manipulate the data)
#Step 2: Load the data (Read the data from a CSV file into a Pandas dataframe)
#Step 3: Analyze the data (Use various Pandas functions to answer the given questions)
#Step 4: Write the results to variables (Store the results of each analysis in the appropriate variables)

# In [None]:
#Step 1
import pandas as pd

#Define the given function (calculate_demographic_data)
def calculate_demographic_data(print_data=True):
    """Compute summary statistics from the census file ``adult.data.csv``.

    The CSV is read from the current working directory and nine questions
    about race, age, education, working hours, salary and country of origin
    are answered with pandas.

    Args:
        print_data: When true (the default), print every result to stdout
            before returning.

    Returns:
        dict: ``race_count`` (a Series of row counts per race),
        ``minimum_work_hours``, ``highest_earning_country`` and
        ``top_IN_occupation`` as computed, plus ``avg_age_men``,
        ``percentage_bachelors``, ``higher_education_rich``,
        ``lower_education_rich``, ``rich_percentage`` and
        ``highest_earning_country_percentage`` rounded to one decimal.
    """
    # Step 2
    df = pd.read_csv('adult.data.csv')

    # NOTE(review): the publicly distributed copy of this dataset is believed
    # to use hyphenated headers ('hours-per-week', 'native-country') and
    # capitalised values ('Male', '>50K') — confirm against the actual CSV.
    # Headers and the sex/salary comparisons are normalised so that either
    # spelling works.
    df.columns = df.columns.str.replace('-', '_', regex=False)
    is_male = df['sex'].str.lower() == 'male'
    is_rich = df['salary'].str.lower() == '>50k'

    # Step 3
    # Question 1: How many people of each race are represented in this dataset?
    race_count = df['race'].value_counts()

    # Question 2: What is the average age of men?
    avg_age_men = df.loc[is_male, 'age'].mean()

    # Question 3: What is the percentage of people who have a Bachelor's Degree?
    # The mean of a boolean mask is the fraction of rows that match.
    percentage_bachelors = (df['education'] == 'Bachelors').mean() * 100

    # Question 4: What percentage of people with advanced education
    # (Bachelors, Masters, or Doctorate) make more than 50k?
    advanced_education = df['education'].isin(
        ['Bachelors', 'Masters', 'Doctorate']
    )
    higher_education_rich = is_rich[advanced_education].mean() * 100

    # Question 5: What percentage of people without advanced education make
    # more than 50k?
    lower_education_rich = is_rich[~advanced_education].mean() * 100

    # Question 6: What is the minimum number of hours a person works per week?
    minimum_work_hours = df['hours_per_week'].min()

    # Question 7: What percentage of the people who work the minimum number of
    # hours per week have a salary of more than 50k?
    works_minimum = df['hours_per_week'] == minimum_work_hours
    rich_percentage = is_rich[works_minimum].mean() * 100

    # Question 8: What country has the highest percentage of people that earn
    # >50k and what is that percentage?
    # Countries with no high earners become NaN after the division; max() and
    # idxmax() skip NaN by default.
    rich_share_by_country = (
        df.loc[is_rich, 'native_country'].value_counts()
        / df['native_country'].value_counts()
        * 100
    )
    highest_earning_country_percentage = rich_share_by_country.max()
    highest_earning_country = rich_share_by_country.idxmax()

    # Question 9: Identify the most popular occupation for those who earn over
    # 50k in India.
    rich_in_india = (df['native_country'] == 'India') & is_rich
    top_IN_occupation = (
        df.loc[rich_in_india, 'occupation'].value_counts().idxmax()
    )

    # Step 4
    if print_data:
        print("Number of each race:\n", race_count)
        print("Average age of men: ", round(avg_age_men, 1))
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(f"Percentage with higher education that earn >50k: {higher_education_rich}%")
        print(f"Percentage without higher education that earn >50k: {lower_education_rich}%")
        print(f"Minimum work time: {minimum_work_hours} hours/week")
        print(f"Percentage of the rich among those who work fewest hours: {rich_percentage}%")
        print("Country with the highest percentage of rich people", highest_earning_country)
        print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
        print("Top occupations in India", top_IN_occupation)

    return {
        'race_count': race_count,
        'avg_age_men': round(avg_age_men, 1),
        'percentage_bachelors': round(percentage_bachelors, 1),
        'higher_education_rich': round(higher_education_rich, 1),
        'lower_education_rich': round(lower_education_rich, 1),
        'minimum_work_hours': minimum_work_hours,
        'rich_percentage': round(rich_percentage, 1),
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage': round(highest_earning_country_percentage, 1),
        'top_IN_occupation': top_IN_occupation,
    }