In [6]:
import pandas as pd

In [7]:
# Read the raw dataset from "adult.csv" (path is relative to the working directory).
adults_data = pd.read_csv(filepath_or_buffer="adult.csv")

In [8]:
# Preview the first five rows of the freshly loaded frame.
adults_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary,salary K$
0,0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,39
1,1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,35
2,2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,27
3,3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,43
4,4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,25


In [9]:
# Summarise column names, non-null counts and dtypes of the raw frame.
adults_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      32561 non-null  int64 
 1   age             32561 non-null  int64 
 2   workclass       32561 non-null  object
 3   education       32561 non-null  object
 4   marital-status  32561 non-null  object
 5   occupation      32561 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   hours-per-week  32561 non-null  int64 
 10  native-country  32561 non-null  object
 11  salary          32561 non-null  object
 12  salary K$       32561 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 3.2+ MB


## Data preparation

In [10]:
# Drop the leftover "Unnamed: 0" column.
# The result is re-assigned (no inplace=True) and errors="ignore" is passed so
# that the cell can be re-run safely: a second run no longer raises KeyError
# because the column is already gone.
adults_data = adults_data.drop(columns=["Unnamed: 0"], errors="ignore")

#### Remove rows with a `?` value in any column. This reduces the number of rows from **32561** to **30162**.

In [11]:
# Number of rows in which at least one column contains a literal "?".
# DataFrame.apply hands the lambda one column at a time here; the row-wise
# reduction happens afterwards in .any(axis=1).
(
    adults_data.astype(str)
    .apply(lambda column: column.str.contains("\\?"))
    .any(axis=1)
    .sum()
)

np.int64(2399)

In [12]:
# Row mask: True where any column, compared as text, contains a literal "?".
mask_queation_mark = adults_data.astype(str).apply(
    lambda column: column.str.contains("\\?")
).any(axis=1)

In [13]:
# Keep only the rows without a "?" value.
# The explicit .copy() gives adults_data its own data, so the in-place edits
# and column assignments in later cells operate on an independent frame rather
# than on a slice of the original (the situation pandas can report as
# SettingWithCopyWarning).
adults_data = adults_data[~mask_queation_mark].copy()

In [14]:
# (rows, columns) after filtering out the rows that contain "?".
adults_data.shape

(30162, 12)

In [15]:
# Re-count the rows that still contain a literal "?" after filtering; the
# expected result is 0.
# The extra .any() that used to precede .sum() was removed: it collapsed the
# per-row mask to a single boolean, so the cell could only ever report 0 or 1
# instead of the number of offending rows.
(
    adults_data.astype(str)
    .apply(lambda column: column.str.contains("\\?"))
    .any(axis=1)
    .sum()
)

np.int64(0)

#### Columns with salary info preparation

In [16]:
# Count missing values in the salary column.
adults_data["salary"].isna().sum()

np.int64(0)

In [17]:
# Frequency of each distinct value in the salary column.
adults_data["salary"].value_counts()

salary
<=50K    22654
>50K      7508
Name: count, dtype: int64

* set a more suitable name for the `salary K$` column

In [18]:
# Rename "salary K$" to the identifier-friendly "salary_USD_k".
# The result is re-assigned instead of using inplace=True, which keeps the
# step an explicit transformation of the frame.
adults_data = adults_data.rename(columns={"salary K$": "salary_USD_k"})

In [19]:
# Confirm that a column named "salary_USD_k" is present.
(adults_data.columns == "salary_USD_k").any()

np.True_

* add a **more_than_50k** column with `bool` dtype to replace the **salary** column, which is dropped afterwards

In [20]:
# Boolean flag: True for every salary value that does not start with "<=".
# The vectorised .str accessor replaces the element-wise apply with a
# `False if ... else True` lambda; the test itself is unchanged.
adults_data["more_than_50k"] = ~(
    adults_data["salary"].astype(str).str.startswith("<=")
)

In [21]:
# Frequency of each value of the new boolean flag.
adults_data["more_than_50k"].value_counts()

more_than_50k
False    22654
True      7508
Name: count, dtype: int64

In [22]:
# Drop the original "salary" column.
# Re-assigning with errors="ignore" (instead of inplace=True) keeps the cell
# safe to re-run once the column has already been removed.
adults_data = adults_data.drop(columns=["salary"], errors="ignore")

In [23]:
# Preview the first five rows after the salary columns were reworked.
adults_data.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary_USD_k,more_than_50k
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,39,False
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,35,False
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,27,False
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,43,False
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,25,False


* reset index after dropping rows

In [24]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index instead of keeping it as a column. The result is re-assigned rather
# than modified with inplace=True.
adults_data = adults_data.reset_index(drop=True)

In [25]:
# Display the index to confirm the reset.
adults_data.index

RangeIndex(start=0, stop=30162, step=1)

* validating data in `salary_USD_k` and `more_than_50k` columns

In [26]:
# Select rows where the boolean flag contradicts the numeric salary:
# flagged True with salary_USD_k <= 50, or flagged False with salary_USD_k > 50.
# An empty result means the two columns agree.
# The boolean column is used directly (and negated with ~) instead of being
# compared against True / False.
adults_data[
    (adults_data["more_than_50k"] & (adults_data["salary_USD_k"] <= 50))
    | (~adults_data["more_than_50k"] & (adults_data["salary_USD_k"] > 50))
]

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary_USD_k,more_than_50k


* converting columns to the **categorical** dtype

In [27]:
# Convert the listed text columns to the pandas "category" dtype in a single
# astype call. One mapping replaces seven repeated statements and avoids
# assigning columns through attribute access.
adults_data = adults_data.astype(
    {
        "workclass": "category",
        "relationship": "category",
        "sex": "category",
        "race": "category",
        "education": "category",
        "occupation": "category",
        "marital-status": "category",
    }
)

In [28]:
# Re-inspect dtypes and memory usage after the categorical conversion.
adults_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30162 entries, 0 to 30161
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             30162 non-null  int64   
 1   workclass       30162 non-null  category
 2   education       30162 non-null  category
 3   marital-status  30162 non-null  category
 4   occupation      30162 non-null  category
 5   relationship    30162 non-null  category
 6   race            30162 non-null  category
 7   sex             30162 non-null  category
 8   hours-per-week  30162 non-null  int64   
 9   native-country  30162 non-null  object  
 10  salary_USD_k    30162 non-null  int64   
 11  more_than_50k   30162 non-null  bool    
dtypes: bool(1), category(7), int64(3), object(1)
memory usage: 1.2+ MB


## Analysis

In [32]:
# Standard deviation and mean of age for each value of more_than_50k.
# Named aggregation yields flat "std_age" / "mean_age" columns directly,
# replacing the pivot_table + rename combination that left an empty second
# level in the column MultiIndex.
adults_data.groupby("more_than_50k")["age"].agg(std_age="std", mean_age="mean")

Unnamed: 0_level_0,std_age,mean_age
more_than_50k,Unnamed: 1_level_1,Unnamed: 2_level_1
False,13.464631,36.60806
True,10.269633,43.95911


📊 **Text Analysis of the Correlation Matrix** (the matrix itself is computed by the `corr()` cell below):

### 💰 **more_than_50k (earns >50K)**

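- **Salary (salary_USD_k) ↔ more_than_50k** → **0.85**  
  Strong positive correlation. This is expected: `more_than_50k` was derived from the original `salary` category (`<=50K` / `>50K`), and the validation step found no rows where it disagrees with `salary_USD_k`.  
  People with higher salaries almost always have `more_than_50k = True`.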

- **Age (age) ↔ more_than_50k** → **0.24**  
  Weak positive correlation.  
  Older people tend to earn more than 50K, but it's not a guarantee.

- **Hours worked (hours-per-week) ↔ more_than_50k** → **0.23**  
  Also a weak positive correlation.  
  Those who work more hours are more likely to have a higher salary.

### 💼 **Other relationships:**

- **Age ↔ Salary** → **0.21**  
  Slightly older individuals earn slightly more.

- **Hours ↔ Salary** → **0.20**  
  People who work more hours earn slightly more.

- **Age ↔ Hours** → **0.10**  
  Almost no correlation — age has little impact on the number of hours worked.

### 🧠 **Conclusions:**

- **The strongest relationship** is between salary in $ and the `more_than_50k` feature, which confirms the validity of the variable.

- Other relationships are weak but logical:
  - Older individuals and those working more hours have a higher chance of earning over 50K, but not always.
  - There is no strong link between age and hours worked, meaning younger people can work as many hours as older individuals.

In [62]:
# Pairwise Pearson correlations between age, weekly hours, salary and the
# boolean more_than_50k flag.
adults_data[
    ["age", "hours-per-week", "salary_USD_k", "more_than_50k"]
].corr(method="pearson")

Unnamed: 0,age,hours-per-week,salary_USD_k,more_than_50k
age,1.0,0.101599,0.208203,0.241998
hours-per-week,0.101599,1.0,0.196378,0.22948
salary_USD_k,0.208203,0.196378,1.0,0.853894
more_than_50k,0.241998,0.22948,0.853894,1.0
