In [1]:
import numpy as np
import pandas as pd
from io import StringIO
import requests


# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (`salary` vs. `salary K$`).

In [2]:
url = "https://raw.githubusercontent.com/juliastetsko/py-adult-data-analysis/master/data/adult.csv"

# Bound the wait and fail loudly on an HTTP error status; otherwise the body of
# an error page would be handed to the CSV parser as if it were the dataset.
response = requests.get(url, verify=True, timeout=30)
response.raise_for_status()

data = pd.read_csv(StringIO(response.text))

# Data cleaning: turn every "?" cell into a missing value, then drop each row
# that contains at least one missing value.
cleaned_data = data.replace("?", pd.NA).dropna()

# Consistency check between the categorical "salary" label and the numeric
# "salary K$" column: rows labelled ">50K" must not have a value <= 50, and
# rows labelled "<=50K" must not have a value > 50. Both frames are expected
# to be empty when the two columns agree.
high_income_low_salary_data = cleaned_data[(cleaned_data["salary"] == ">50K") & (cleaned_data["salary K$"] <= 50)]
low_income_high_salary_data = cleaned_data[(cleaned_data["salary"] == "<=50K") & (cleaned_data["salary K$"] > 50)]
high_income_low_salary_data, low_income_high_salary_data

(Empty DataFrame
 Columns: [Unnamed: 0, age, workclass, education, marital-status, occupation, relationship, race, sex, hours-per-week, native-country, salary, salary K$]
 Index: [],
 Empty DataFrame
 Columns: [Unnamed: 0, age, workclass, education, marital-status, occupation, relationship, race, sex, hours-per-week, native-country, salary, salary K$]
 Index: [])

# Task 1
Print the count of men and women in the dataset.

In [None]:
# Tally the rows per distinct value of the "sex" column.
gender_counts = cleaned_data["sex"].value_counts()

# Pull both tallies out by label in one step (order: Female, then Male).
female_count, male_count = gender_counts.loc[["Female", "Male"]]
female_count, male_count

# Task 2
Find the average age of men in the dataset.

In [None]:
# Keep only the rows whose "sex" is "Male"; a later cell reuses this frame.
men_data = cleaned_data.loc[cleaned_data["sex"].eq("Male")]

# Average age across those rows.
men_data["age"].mean()

# Task 3
Get the percentage of people from Poland (native-country)

In [None]:
# Number of rows whose native country is Poland.
poland_count = len(cleaned_data.loc[cleaned_data["native-country"].eq("Poland")])

# Number of rows in the cleaned dataset overall.
total_count = len(cleaned_data["native-country"])

# Share of Poland-born people, expressed in percent.
percentage_poland = poland_count / total_count * 100
percentage_poland

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [None]:
# Split the cleaned rows by their salary label.
high_income_data = cleaned_data.loc[cleaned_data["salary"].eq(">50K")]
low_income_data = cleaned_data.loc[cleaned_data["salary"].eq("<=50K")]

# Mean and standard deviation of age inside each salary group.
mean_age_high_income, std_age_high_income = (
    high_income_data["age"].mean(),
    high_income_data["age"].std(),
)
mean_age_low_income, std_age_low_income = (
    low_income_data["age"].mean(),
    low_income_data["age"].std(),
)

# Displayed as ((mean, std) for >50K, (mean, std) for <=50K).
(mean_age_high_income, std_age_high_income), (mean_age_low_income, std_age_low_income)

# Task 5
Check whether there are people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) but with a salary > 50K.

In [None]:
# Education values that the task counts as "higher education".
higher_education_levels = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

# Row-wise conditions, named so the final filter reads like the question.
earns_over_50k = cleaned_data["salary"] == ">50K"
has_higher_education = cleaned_data["education"].isin(higher_education_levels)

# People labelled ">50K" whose education is NOT in the list above.
high_income_low_education_data = cleaned_data[earns_over_50k & ~has_higher_education]
high_income_low_education_data

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# Group the "age" values by each row's "education" value. Note that this name
# holds the grouped object itself; the statistics come from describe() below.
education_age_stats = cleaned_data["age"].groupby(cleaned_data["education"])

# Summary statistics of age for every education group.
education_age_stats.describe()

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married". All others are not.

In [None]:
# True for men whose marital status begins with "Married"; computed once and
# reused (negated) for the complementary group. `men_data` comes from the
# Task 2 cell.
is_married = men_data["marital-status"].str.startswith("Married")
married_men = men_data[is_married]
not_married_men = men_data[~is_married]

# Average of the numeric "salary K$" column for each group.
mean_salary_married = married_men["salary K$"].mean()
mean_salary_non_married = not_married_men["salary K$"].mean()
(mean_salary_married, mean_salary_non_married)

# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [None]:
# Largest "hours-per-week" value found in the cleaned dataset.
max_hours = cleaned_data["hours-per-week"].max()

# Number of rows whose "hours-per-week" equals that maximum.
num_people = (cleaned_data["hours-per-week"] == max_hours).sum()

# The task asks for both the maximum itself and the head-count, so show both.
max_hours, num_people

# Task 9
Analyze the correlation between the fields in the dataset. Identify which fields are connected, and print and highlight their connections.

In [None]:
# Select every numeric column regardless of bit width, then leave out
# "Unnamed: 0": it is the unnamed first CSV column (presumably a saved row
# index — confirm against the file), so correlating it with real features only
# adds noise. errors="ignore" keeps the cell working if that column is absent.
numeric_columns = (
    cleaned_data
    .select_dtypes(include="number")
    .drop(columns=["Unnamed: 0"], errors="ignore")
)

# Pairwise correlation between the remaining numeric columns.
correlation_matrix = numeric_columns.corr()
correlation_matrix