In [3]:
import pandas as pd
import pd as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [4]:
data = pd.read_csv("/Users/artemkazakov/PycharmProjects/py-adult-data-analysis/data/adult.csv")
invalid_value = data.isin(["?"]).any(axis=1)
data = data[~invalid_value]
data["salary"] = data["salary K$"].apply(lambda x: "<=50K" if x <= 50 else ">50K")

# Task 1
Print the count of men and women in the dataset.

In [5]:
data["sex"].value_counts()

Male      20380
Female     9782
Name: sex, dtype: int64

# Task 2
Find the average age of men in dataset

In [34]:
mean_age = pd.Series(data["age"].mean(), index=["average age of men"])
mean_age

average age of men    38.437902
dtype: float64

# Task 3
Get the percentage of people from Poland (native-country)

In [30]:
percentage = (data["native-country"].value_counts(normalize=True)) * 100
poland_percentage = pd.Series(percentage.loc["Poland"], index=["percentage of people from Poland"])
poland_percentage

percentage of people from Poland    0.185664
dtype: float64

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [38]:
rich_data = data[data["salary"] == ">50K"]
poor_data = data[data["salary"] == "<=50K"]
df = pd.DataFrame({"mean_age": [rich_data["age"].mean(), poor_data["age"].mean()], "std_age": [rich_data["age"].std(), poor_data["age"].std()]}, index=["> 50K", "<=50K"])
df

Unnamed: 0,mean_age,std_age
> 50K,43.95911,10.269633
<=50K,36.60806,13.464631


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [9]:
without_higher_education = data[~data.education.isin(["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"])]
without_higher_education[without_higher_education["salary"] == ">50K"]

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
7,7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K,307
10,10,37,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K,116
55,55,43,Private,Some-college,Married-civ-spouse,Tech-support,Husband,White,Male,40,United-States,>50K,341
67,67,53,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,Female,40,United-States,>50K,225
68,68,49,Self-emp-inc,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,50,United-States,>50K,194
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32462,32462,48,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,50,United-States,>50K,343
32518,32518,57,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,>50K,116
32519,32519,46,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,Male,48,United-States,>50K,239
32557,32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K,173


# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [10]:
data.groupby("education")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10th,820.0,37.897561,16.225795,17.0,23.0,36.0,52.0,90.0
11th,1048.0,32.36355,15.089307,17.0,18.0,28.5,43.0,90.0
12th,377.0,32.013263,14.37371,17.0,19.0,28.0,41.0,79.0
1st-4th,151.0,44.622517,14.929051,19.0,33.0,44.0,56.0,81.0
5th-6th,288.0,41.649306,14.754622,17.0,28.0,41.0,53.0,82.0
7th-8th,557.0,47.631957,15.737479,17.0,34.0,49.0,60.0,90.0
9th,455.0,40.303297,15.335754,17.0,28.0,38.0,53.0,90.0
Assoc-acdm,1008.0,37.286706,10.509755,19.0,29.0,36.0,44.0,90.0
Assoc-voc,1307.0,38.246366,11.181253,19.0,30.0,37.0,45.0,84.0
Bachelors,5044.0,38.641554,11.577566,19.0,29.0,37.0,46.0,90.0


# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [58]:
married = data[(data["sex"] == "Male") & (data["marital-status"].str.startswith("Married"))]
not_married = data[(data["sex"] == "Male") & (~data["marital-status"].str.startswith("Married"))]
married_rich_ratio = married[married["salary"] == ">50K"].count() / married["salary"].count()
not_married_rich_ratio = not_married[not_married["salary"] == ">50K"].count() / not_married["salary"].count()

d = {"the part of the rich men in their group": [married_rich_ratio.iloc[1], not_married_rich_ratio.iloc[1]]}
df = pd.DataFrame(d, index=["married_men", "not_married_men"])
df

Unnamed: 0,the part of the rich men in their group
married_men,0.447984
not_married_men,0.088494


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [78]:
max_hours_per_week = data["hours-per-week"].max()
pd.Series([max_hours_per_week], index=["max hours per week some person works"])

max hours per week some person works    99
dtype: int64

In [79]:
hours_indx = data["hours-per-week"].value_counts().index
hours_count = data["hours-per-week"].value_counts().values
df = pd.DataFrame({"hours": hours_indx, "amount": hours_count})
df

Unnamed: 0,hours,amount
0,40,14251
1,50,2718
2,45,1753
3,60,1405
4,35,1184
...,...,...
89,82,1
90,94,1
91,92,1
92,87,1


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [6]:
data.corr()

Unnamed: 0.1,Unnamed: 0,age,hours-per-week,salary K$
Unnamed: 0,1.0,-0.001126,-0.00189,0.000129
age,-0.001126,1.0,0.101599,0.208203
hours-per-week,-0.00189,0.101599,1.0,0.196378
salary K$,0.000129,0.208203,0.196378,1.0
