In [336]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [337]:
df = pd.read_csv("H:/projects/python advance/py-adult-data-analysis/data/adult.csv")
df = df[~df.isin(["?"]).any(axis=1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      30162 non-null  int64 
 1   age             30162 non-null  int64 
 2   workclass       30162 non-null  object
 3   education       30162 non-null  object
 4   marital-status  30162 non-null  object
 5   occupation      30162 non-null  object
 6   relationship    30162 non-null  object
 7   race            30162 non-null  object
 8   sex             30162 non-null  object
 9   hours-per-week  30162 non-null  int64 
 10  native-country  30162 non-null  object
 11  salary          30162 non-null  object
 12  salary K$       30162 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 3.2+ MB


In [338]:
valid_salaries = ["<=50K", ">50K"]
invalid_salary_rows = df[~df["salary"].isin(valid_salaries)]
df = df[~df.index.isin(invalid_salary_rows.index)]
condition_1 = (df["salary"] == "<=50K") & (df["salary K$"] <= 50)
condition_2 = (df["salary"] == ">50K") & (df["salary K$"] > 50)
df = df[condition_1 | condition_2]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      30162 non-null  int64 
 1   age             30162 non-null  int64 
 2   workclass       30162 non-null  object
 3   education       30162 non-null  object
 4   marital-status  30162 non-null  object
 5   occupation      30162 non-null  object
 6   relationship    30162 non-null  object
 7   race            30162 non-null  object
 8   sex             30162 non-null  object
 9   hours-per-week  30162 non-null  int64 
 10  native-country  30162 non-null  object
 11  salary          30162 non-null  object
 12  salary K$       30162 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 3.2+ MB


# Task 1
Print the count of men and women in the dataset.

In [339]:
count = df["sex"].value_counts()
print("Count of Men: ", count["Male"])
print("Count of Female: ", count["Female"])

Count of Men:  20380
Count of Female:  9782


# Task 2
Find the average age of men in dataset

In [340]:
df_male = df[df["sex"] == "Male"]
men_average_age = df_male["age"].mean()
print(men_average_age)

39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [341]:
polish_people = df[df["native-country"] == "Poland"]
polish_people_percentage = polish_people.shape[0] / df.shape[0] * 100
print("Polish people: ", f"{polish_people_percentage} %")

Polish people:  0.18566408063125786 %


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [342]:
low_mean_deviation = df[df["salary"] == "<=50K"]["age"].mean()
high_mean_deviation = df[df["salary"] == ">50K"]["age"].mean()

low_std_deviation = df[df["salary"] == "<=50K"]["age"].std()
high_std_deviation = df[df["salary"] == ">50K"]["age"].std()

print("Low salary - age mean: ", low_mean_deviation)
print("High salary - age mean: ", high_mean_deviation)
print("Low salary - age std: ", low_std_deviation)
print("High salary - age std: ", high_std_deviation)

Low salary - age mean:  36.60806038668668
High salary - age mean:  43.95911028236548
Low salary - age std:  13.464631257161633
High salary - age std:  10.269632835673852


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [343]:
df_higher_education = df[df["education"].isin(["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"])]
df_higher_education_high_salary = df_higher_education[df_higher_education["salary"] == ">50K"]
print("People without higher education, but with > 50K salary: ", df_higher_education_high_salary.shape[0])

People without higher education, but with > 50K salary:  4330


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [344]:
age_education = df.groupby("education")["age"].describe()
print(age_education)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [345]:
men = df[df["sex"] == "Male"]
married_average_salary = men[men["marital-status"].str.startswith("Married")]["salary K$"].mean()
non_married_average_salary = men[men["marital-status"].str.startswith("Married") == False]["salary K$"].mean()
print("Married men average salary: ",married_average_salary)
print("Non married men average salary: ", non_married_average_salary)
if non_married_average_salary > married_average_salary:
    print("Non married men earn more.")
else:
    print("Married men earn more.")

Married men average salary:  107.49455968688845
Non married men average salary:  46.59723865877712
Married men earn more.


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [346]:
max_hours_per_week = df["hours-per-week"].max()
print("Max hours per week: ", f"{max_hours_per_week} hours")
people_with_max_hours = df[df["hours-per-week"] == max_hours_per_week]
print("People with max hours per week: ", people_with_max_hours.shape[0])

Max hours per week:  99 hours
People with max hours per week:  78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [347]:
df.corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$
Unnamed: 0,1.0,-0.001126,-0.00189,0.000129
age,-0.001126,1.0,0.101599,0.208203
hours-per-week,-0.00189,0.101599,1.0,0.196378
salary K$,0.000129,0.208203,0.196378,1.0


In [348]:
print("Age and salary K$ correlation: ", df["age"].corr(df["salary K$"]))
print("Age and hours per week correlation: ", df["age"].corr(df["hours-per-week"]))

Age and salary K$ correlation:  0.20820286434202276
Age and hours per week correlation:  0.10159875929549507
