In [2]:
import numpy as np
import pandas as pd
data = pd.read_csv("../data/adult.csv")

# Task 0
Read the dataset from csv file & perform data cleaning - remove all rows, which contains `?` in some columns.
Also check for data correctness (salary & salary $K).

In [3]:
excluded_data = data[data.apply(lambda x: "?" in x.values, axis=1)]
cleaned_data = data.drop(excluded_data.index)

cleaned_data = cleaned_data[(cleaned_data["salary"] == ">50K") | (cleaned_data["salary"] == "<=50K")]
cleaned_data["salary K$"] = pd.to_numeric(cleaned_data['salary K$'], errors='coerce')

cleaned_data.to_csv("../data/adults_cl.csv", index=False)
excluded_data.to_csv("../data/adults_ex.csv", index=False)

print("Both files (adults_cl.csv and adults_ex.csv) created successfully.")
print("Number of records in adults_cl.csv:", cleaned_data.shape[0])
print("Number of records in adults_ex.csv:", excluded_data.shape[0])

Both files (adults_cl.csv and adults_ex.csv) created successfully.
Number of records in adults_cl.csv: 30162
Number of records in adults_ex.csv: 2399


# Task 1
Print the count of men and women in the dataset.

In [4]:
gender_counts = data["sex"].value_counts()
gender_counts

sex
Male      21790
Female    10771
Name: count, dtype: int64

# Task 2
Find the average age of men in dataset

In [5]:
men_data = data[data["sex"] == "Male"]
average_age_men = men_data["age"].mean()

print("Average age of men:", average_age_men)

Average age of men: 39.43354749885268


# Task 3
Get the percentage of people from Poland (native-country)

In [6]:
poland_count = data[data["native-country"] == "Poland"].shape[0]
total_count = data.shape[0]

percentage_poland = (poland_count / total_count) * 100

print("Percentage of people from Poland: {:.2f}%".format(percentage_poland))

Percentage of people from Poland: 0.18%


# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [7]:
high_income_data = data[data["salary"] == ">50K"]
mean_age_high_income = high_income_data["age"].mean()
std_age_high_income = high_income_data["age"].std()

low_income_data = data[data["salary"] == "<=50K"]
mean_age_low_income = low_income_data["age"].mean()
std_age_low_income = low_income_data["age"].std()

print("Mean age for people earning > 50K:", mean_age_high_income)
print("Standard deviation of age for people earning > 50K:", std_age_high_income)
print("Mean age for people earning <= 50K:", mean_age_low_income)
print("Standard deviation of age for people earning <= 50K:", std_age_low_income)

Mean age for people earning > 50K: 44.24984058155847
Standard deviation of age for people earning > 50K: 10.519027719851826
Mean age for people earning <= 50K: 36.78373786407767
Standard deviation of age for people earning <= 50K: 14.02008849082488


# Task 5
Check, if there are some people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate), but with > 50K salary

In [8]:
higher_education = ["Bachelors", "Prof-school", "Assoc-acdm", "Assoc-voc", "Masters", "Doctorate"]

high_income_no_higher_edu = data[(data["salary"] == ">50K") & (~data["education"].isin(higher_education))]

if high_income_no_higher_edu.empty:
    print("There are no people with > 50K salary and without higher education.")
else:
    print("There are people with > 50K salary and without higher education:")
    print(high_income_no_higher_edu)

There are people with > 50K salary and without higher education:
       Unnamed: 0  age         workclass     education      marital-status  \
7               7   52  Self-emp-not-inc       HS-grad  Married-civ-spouse   
10             10   37           Private  Some-college  Married-civ-spouse   
27             27   54                 ?  Some-college  Married-civ-spouse   
38             38   31           Private  Some-college  Married-civ-spouse   
55             55   43           Private  Some-college  Married-civ-spouse   
...           ...  ...               ...           ...                 ...   
32510       32510   39           Private       HS-grad  Married-civ-spouse   
32518       32518   57         Local-gov       HS-grad  Married-civ-spouse   
32519       32519   46           Private  Some-college  Married-civ-spouse   
32557       32557   40           Private       HS-grad  Married-civ-spouse   
32560       32560   52      Self-emp-inc       HS-grad  Married-civ-spouse   

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [9]:
education_age_stats = data.groupby("education")["age"].describe()

print(education_age_stats)

                count       mean        std   min    25%   50%   75%   max
education                                                                 
10th            933.0  37.429796  16.720713  17.0  22.00  34.0  52.0  90.0
11th           1175.0  32.355745  15.545485  17.0  18.00  28.0  43.0  90.0
12th            433.0  32.000000  14.334625  17.0  19.00  28.0  41.0  79.0
1st-4th         168.0  46.142857  15.615625  19.0  33.00  46.0  57.0  90.0
5th-6th         333.0  42.885886  15.557285  17.0  29.00  42.0  54.0  84.0
7th-8th         646.0  48.445820  16.092350  17.0  34.25  50.0  61.0  90.0
9th             514.0  41.060311  15.946862  17.0  28.00  39.0  54.0  90.0
Assoc-acdm     1067.0  37.381443  11.095177  19.0  29.00  36.0  44.0  90.0
Assoc-voc      1382.0  38.553546  11.631300  19.0  30.00  37.0  46.0  84.0
Bachelors      5355.0  38.904949  11.912210  19.0  29.00  37.0  46.0  90.0
Doctorate       413.0  47.702179  11.784716  24.0  39.00  47.0  55.0  80.0
HS-grad       10501.0  38

# Task 7
Compare the married and non-married men salaries. Who earns more? (>50K or <=50K)
Married men are those, whom `marital-status` starts with "Married". Others are not.

In [10]:
married_men = data[(data["sex"] == "Male") & (data["marital-status"].str.startswith("Married"))]
non_married_men = data[(data["sex"] == "Male") & (~data["marital-status"].str.startswith("Married"))]

married_high_income_count = married_men[married_men["salary"] == ">50K"].shape[0]
non_married_high_income_count = non_married_men[non_married_men["salary"] == ">50K"].shape[0]

if married_high_income_count > non_married_high_income_count:
    print("Married men earn more.")
elif married_high_income_count < non_married_high_income_count:
    print("Non-married men earn more.")
else:
    print("Married and non-married men earn the same.")

Married men earn more.


# Task 8
Get the max hours per week some person works. How many people works the same amount of hours per week?

In [11]:
max_hours = data["hours-per-week"].max()

people_same_hours = data[data["hours-per-week"] == max_hours]
num_people_same_hours = people_same_hours.shape[0]

print("Maximum hours per week:", max_hours)
print("Number of people working the same amount of hours per week:", num_people_same_hours)

Maximum hours per week: 99
Number of people working the same amount of hours per week: 85


# Task 9
Analyze the correlation between data in dataset. Understand connected fields in it and print highlight thier connection.

In [12]:
numerical_columns = data.select_dtypes(include="number")
correlation_matrix = numerical_columns.corr()

print("Correlation Matrix:")
print(correlation_matrix)

correlation_threshold = 0.5
strong_connections = (correlation_matrix.abs() >= correlation_threshold) & (correlation_matrix.abs() < 1)

print("\nHighlighted Connections:")
for column in strong_connections:
    correlated_columns = strong_connections[strong_connections[column]].index.tolist()
    if len(correlated_columns) > 1:
        print(f"{column}: {', '.join(correlated_columns)}")

Correlation Matrix:
                Unnamed: 0       age  hours-per-week  salary K$
Unnamed: 0        1.000000  0.001286        0.000607  -0.001666
age               0.001286  1.000000        0.068756   0.201774
hours-per-week    0.000607  0.068756        1.000000   0.196916
salary K$        -0.001666  0.201774        0.196916   1.000000

Highlighted Connections:
