# Data Quality Analysis on Adult Dataset
#### Using YData Profiling

# Note : Displaying the report at the end to avoid interrupting the notebook flow.

##### **Importing necessary lib and dataset**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import gc
from IPython.core.display import display, HTML
warnings.filterwarnings( "ignore",    message=".*should_run_async.*",category=DeprecationWarning)
warnings.filterwarnings("ignore")
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/profiling-report-html/profiling_report.html
/kaggle/input/adult/adult.data
/kaggle/input/adult/adult.names
/kaggle/input/adult/Index
/kaggle/input/adult/old.adult.names
/kaggle/input/adult/adult.test


In [2]:
from ydata_profiling import ProfileReport

## Data loading and basic data cleaning like data type changing 

In [3]:
column_names = [
    'age', 'workclass', "fnlwgt",'education', 'education-num', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 
    'capital-loss', 'hours-per-week', 'native-country', 'income'
]

adult_data = pd.read_csv("/kaggle/input/adult/adult.data", names=column_names)
adult_test = pd.read_csv("/kaggle/input/adult/adult.test", names=column_names)
adult_test.drop(index=0,inplace=True)

adult_data_full = pd.concat([adult_data, adult_test], axis=0)

adult_data_full.index = adult_data_full.index.astype(str)

int_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
for col in int_columns:
    adult_data_full[col] = adult_data_full[col].astype(int)

In [4]:
adult_data_full.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
14224,51,Private,279337,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,<=50K.
20135,38,Private,217349,Assoc-voc,11,Divorced,Prof-specialty,Not-in-family,White,Female,14344,0,40,United-States,>50K
9939,17,Private,184025,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,16,United-States,<=50K.
7064,31,Private,108322,Some-college,10,Married-AF-spouse,Craft-repair,Husband,White,Male,0,0,28,United-States,<=50K.
25386,26,Private,98155,HS-grad,9,Married-AF-spouse,Sales,Husband,White,Male,0,0,55,United-States,<=50K


In [5]:
adult_data_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 16281
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48842 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 6.0+ MB


In [6]:
string_columns = adult_data_full.select_dtypes(include=['object']).columns

for col in string_columns:
    adult_data[col] = adult_data[col].str.strip()

In [7]:
adult_data_full.shape

(48842, 15)

In [8]:
profile_adult_data = ProfileReport(adult_data_full, title="Profiling Report")

## Data Profiling Using YData Profiling

In [9]:
# profile_adult_data

In [10]:
profile_adult_data.to_file("profiling_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Reading it after saving due to some time notebook get freeze

In [11]:
# with open("/kaggle/working/profiling_report.html", "r", encoding="utf-8") as f:
#     report = f.read()

# display(HTML(report))

**column-wise summary** based on Y Data profiling report. 
---

### 🧾 **Column-Wise Summary**

| Column Name       | Data Type | Missing (%) | Unique Values | Notes / Issues | Possible Insights |
|-------------------|-----------|-------------|----------------|----------------|----------------|
| **age**           | float64   | 0%       | 74       |NO  | Can be Fillid with mean/median if needed |
| **sex**        | object    | 0%          | 2              | Categorical (Male/Female) | Encode using LabelEncoder or one-hot |
| **workclass**          | object    | 0%      | 9         | dominated by private also question mark(?) is there | ? can be replaced from NA |
| **education**        | object   | 0%         | 16     | Dominated by HS-grad |  |
| **education-num**         | int64     | 0%          | 16  | Good distribution peak at 9 |  |
| **marital-status**   | object    | 0%          | 7   | Looks good|  |
| **occupation**     | Categorical      | 0%          | 15  | ? is there as Catagory | |
| **relationship**      | relationship    | 0%        | 6 || |
| **race**      | Categorical    | 0%          | 6     | White Dominated     |  |
| **capital-gain** | float64   | 0%        | 123         | 44807 zeroes are there| Impute or consider dropping if not informative |
| **capital-loss**        | float64    | 0%      |99  | 46560 zeros | Fill  or drop if not relevant |
| **hours-per-week**  | float64       | 0%          | 96            | distribution between 1 to 99 |  |
| **native-country**  | Categorical       | 0%          | 42            | Dominated by US |  |
| **income**  | Categorical       | 0%          | 4            | looks like some typing mistake should only be 2 catagires but here is <=50K	<=50K.	>50K	>50K.| Make them 2 |

---



# Improving the Data quality

In [12]:
import pandas as pd
import numpy as np

In [13]:
# 1. Normalize income labels (fix variations like '>50K.', '<=50K ', etc.)
adult_data_full['income'] = adult_data_full['income'].str.strip().replace({'>50K.': '>50K', '<=50K.': '<=50K'})

# 2. Replace '?' with NaN
adult_data_full.replace('?', np.nan, inplace=True)

# 3. Remove duplicate rows
adult_data_full = adult_data_full.drop_duplicates()

In [14]:
adult_data_full.shape

(48790, 15)

In [15]:
adult_data_full['income'].unique()

# Count of duplicate rows
num_duplicates = adult_data_full.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

# Number of NaN (missing) values per column
nan_counts = adult_data_full.isna().sum()
print("\nNumber of NaN values per column:")
print(nan_counts)

Number of duplicate rows: 0

Number of NaN values per column:
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


# Showing the report and the end due to the report is taking part in the notebook

In [16]:
profile_adult_data

