In [2]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [6]:
df = pd.read_csv("../data/adult.csv")
df = df[df != "?"].dropna()
df = df[(df["salary"] == ">50K") | (df["salary"] == "<=50K")]
df["salary K$"] = df["salary K$"].astype(float)
df = df[(df["salary K$"] >= 0.0) & (df["salary K$"] <= 999.0)]
print(df.dtypes)
print(df.head)

Unnamed: 0          int64
age                 int64
workclass          object
education          object
marital-status     object
occupation         object
relationship       object
race               object
sex                object
hours-per-week      int64
native-country     object
salary             object
salary K$         float64
dtype: object
<bound method NDFrame.head of        Unnamed: 0  age         workclass   education      marital-status   
0               0   39         State-gov   Bachelors       Never-married  \
1               1   50  Self-emp-not-inc   Bachelors  Married-civ-spouse   
2               2   38           Private     HS-grad            Divorced   
3               3   53           Private        11th  Married-civ-spouse   
4               4   28           Private   Bachelors  Married-civ-spouse   
...           ...  ...               ...         ...                 ...   
32556       32556   27           Private  Assoc-acdm  Married-civ-spouse   
32557     

# Task 1
Print the count of men and women in the dataset.

In [7]:
sex_counts = df["sex"].value_counts()
print("Number of men: ", sex_counts["Male"])
print("Number of women: ", sex_counts["Female"])

Number of men:  20380
Number of women:  9782


# Task 2
Find the average age of men in dataset

In [8]:
avg_age_men = df.loc[df['sex'] == 'Male', 'age'].mean()

print("Average age of men in the dataset: ", avg_age_men)

Average age of men in the dataset:  39.18400392541707


# Task 3
Get the percentage of people from Poland (native-country)

In [9]:
country_counts = df['native-country'].value_counts()

percent_from_poland = 100 * country_counts['Poland'] / country_counts.sum()

print("Percentage of people from Poland: ", percent_from_poland)

Percentage of people from Poland:  0.18566408063125786


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [10]:
more_50 = df.loc[df['salary'] == '>50K', 'age'].agg(['mean', 'std'])

less_50 = df.loc[df['salary'] == '<=50K', 'age'].agg(['mean', 'std'])

print("Age statistics for people who earn >50K per year: ")
print("Mean: ", more_50['mean'])
print("Standard deviation: ", more_50['std'])
print("Age statistics for people who earn <=50K per year: ")
print("Mean: ", less_50['mean'])
print("Standard deviation: ", less_50['std'])

Age statistics for people who earn >50K per year: 
Mean:  43.95911028236548
Standard deviation:  10.269632835673852
Age statistics for people who earn <=50K per year: 
Mean:  36.60806038668668
Standard deviation:  13.464631257161633


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [11]:
no_higher_ed = df.loc[(df['salary'] == '>50K') & (~df['education'].isin(['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate'])), :]


if no_higher_ed.empty:
    print("There are no people with >50K salary and no higher education.")
else:
    print("There are people with >50K salary and no higher education.")
    print(no_higher_ed)

There are people with >50K salary and no higher education.
       Unnamed: 0  age         workclass     education      marital-status   
7               7   52  Self-emp-not-inc       HS-grad  Married-civ-spouse  \
10             10   37           Private  Some-college  Married-civ-spouse   
55             55   43           Private  Some-college  Married-civ-spouse   
67             67   53           Private       HS-grad  Married-civ-spouse   
68             68   49      Self-emp-inc  Some-college  Married-civ-spouse   
...           ...  ...               ...           ...                 ...   
32462       32462   48      Self-emp-inc       HS-grad  Married-civ-spouse   
32518       32518   57         Local-gov       HS-grad  Married-civ-spouse   
32519       32519   46           Private  Some-college  Married-civ-spouse   
32557       32557   40           Private       HS-grad  Married-civ-spouse   
32560       32560   52      Self-emp-inc       HS-grad  Married-civ-spouse   

    

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [12]:
ed_age_stats = df.groupby('education')['age'].describe()

print(ed_age_stats)

               count       mean        std   min   25%   50%   75%   max
education                                                               
10th           820.0  37.897561  16.225795  17.0  23.0  36.0  52.0  90.0
11th          1048.0  32.363550  15.089307  17.0  18.0  28.5  43.0  90.0
12th           377.0  32.013263  14.373710  17.0  19.0  28.0  41.0  79.0
1st-4th        151.0  44.622517  14.929051  19.0  33.0  44.0  56.0  81.0
5th-6th        288.0  41.649306  14.754622  17.0  28.0  41.0  53.0  82.0
7th-8th        557.0  47.631957  15.737479  17.0  34.0  49.0  60.0  90.0
9th            455.0  40.303297  15.335754  17.0  28.0  38.0  53.0  90.0
Assoc-acdm    1008.0  37.286706  10.509755  19.0  29.0  36.0  44.0  90.0
Assoc-voc     1307.0  38.246366  11.181253  19.0  30.0  37.0  45.0  84.0
Bachelors     5044.0  38.641554  11.577566  19.0  29.0  37.0  46.0  90.0
Doctorate      375.0  47.130667  11.471727  24.0  39.0  47.0  54.0  80.0
HS-grad       9840.0  38.640955  13.067730  17.0  2

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [16]:
male_df = df[df['sex'] == 'Male']

male_df.loc[male_df['marital-status'].str.startswith('Married'), 'marital_status_group'] = 'Married'
male_df.loc[male_df['marital-status'].str.startswith('Married'), 'marital_status_group'] = 'Not Married'

salary_counts = male_df.groupby(['marital_status_group', 'salary'])['salary'].count()

print(salary_counts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  male_df.loc[male_df['marital-status'].str.startswith('Married'), 'marital_status_group'] = 'Married'


marital_status_group  salary
Not Married           <=50K     7052
                      >50K      5723
Name: salary, dtype: int64


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [17]:
max_hours = df['hours-per-week'].max()

num_people = df[df['hours-per-week'] == max_hours]['hours-per-week'].count()

print('Max hours per week:', max_hours)
print('Number of people who work this amount of hours per week:', num_people)

Max hours per week: 99
Number of people who work this amount of hours per week: 78


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [18]:
df_cor = df.select_dtypes(include=["int", "float"])
df_cor.iloc[:, 1:].corr()

Unnamed: 0,age,hours-per-week,salary K$
age,1.0,0.101599,0.208203
hours-per-week,0.101599,1.0,0.196378
salary K$,0.208203,0.196378,1.0
