In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [18]:
df = pd.read_csv(r"C:\Users\Alex\py-adult-data-analysis\data\adult.csv")
df = df[~df.apply(lambda row: row.astype(str).str.contains(r'\?').any(), axis=1)]

def is_salary_correct(row):
    salary_value = row["salary"]
    salary_c_value = row["salary K$"]
    
    if salary_c_value <= 50 and salary_value != "<=50K":
        return "<=50K"
    elif salary_c_value > 50 and salary_value != ">50K":
        return ">50K"
    return salary_value

df["salary"] = df.apply(is_salary_correct, axis=1)
df

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K,36
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173
32558,32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K,40
32559,32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K,38


# Task 1
Print the count of men and women in the dataset.

In [6]:
gender_count = df['sex'].value_counts()
gender_count

sex
Male      20380
Female     9782
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [7]:
men_data = df[df['sex'] == 'Male']
average_age_men = men_data['age'].mean()
average_age_men

39.18400392541707

# Task 3
Get the percentage of people from Poland (native-country)

In [8]:
poland_data = df[df['native-country'] == 'Poland']
poland_count = len(poland_data)
total_count = len(df)
poland_percentage = (poland_count / total_count) * 100
poland_percentage

0.18566408063125786

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [9]:
high_income = df[df['salary'] == '>50K']
mean_age_high = high_income['age'].mean()
std_age_high = high_income['age'].std()
low_income = df[df['salary'] == '<=50K']
mean_age_low = low_income['age'].mean()
std_age_low = low_income['age'].std()
print(mean_age_high)
print(mean_age_low)

43.95911028236548
36.60806038668668


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [None]:
higher_education = ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']
no_higher_education_high_income = df[(~df['education'].isin(higher_education)) & (df['salary'] == '>50K')]
no_higher_education_high_income

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
education_age_stats = df.groupby('education')['age'].describe()
education_age_stats

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [None]:
df['is_married'] = df['marital-status'].str.startswith('Married')
men_df = df[df['sex'] == 'Male']
married_men = men_df[men_df['is_married']]
unmarried_men = men_df[~men_df['is_married']]
married_men_salary = married_men['salary'].value_counts(normalize=True)
unmarried_men_salary = unmarried_men['salary'].value_counts(normalize=True)
print(married_men_salary)
print(unmarried_men_salary)

# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [None]:
max_hours = df['hours-per-week'].max()
max_hours_people = df[df['hours-per-week'] == max_hours]
num_max_hours_people = max_hours_people.shape[0]
num_max_hours_people

# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [None]:
df_encoded = pd.get_dummies(df, drop_first=True)
correlation_matrix = df_encoded.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.show()