In [1]:
import pandas as pd
import numpy as np

In [None]:
# Notebook configuration: input file and the name of the label column.
DATA_PATH = "hcvdat0.csv"   # relative path of the raw CSV passed to pd.read_csv
# Column holding the diagnosis label; raw values look like "0=Blood Donor".
CATEGORY_COL = "Category"

In [3]:
# Read the raw CSV into the working frame and report its dimensions.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
initial_shape = df.shape
print("Initial shape:", initial_shape)

Initial shape: (615, 14)


In [28]:
# Remove the leading column when it is not one of the dataset's real fields
# (presumably a row-index column saved with the CSV -- confirm against the
# raw file).
#
# The guard lists every data column rather than only the first few, so a
# genuine feature can never be dropped if the file's column order changes,
# and re-running this cell after the extra column is gone is a no-op.
KNOWN_COLUMNS = {
    "Category", "Age", "Sex", "ALB", "ALP", "ALT", "AST",
    "BIL", "CHE", "CHOL", "CREA", "GGT", "PROT",
}
first_col = df.columns[0]
if first_col not in KNOWN_COLUMNS:
    df = df.drop(columns=first_col)
print("After removing first column:", df.shape)

After removing first column: (615, 13)


In [29]:
# Convert "0=Blood Donor", "1=Hepatitis", "2=Fibrosis", "3=Cirrhosis" → 0,1,2,3
def clean_category(x):
    """Reduce a raw label such as "0=Blood Donor" to the code before "=".

    Strings that contain "=" are cut at the first "=" and the leading part is
    returned with surrounding whitespace removed. Any other value (a string
    without "=", or a non-string such as NaN) is returned unchanged.
    """
    if not isinstance(x, str):
        return x
    # partition() yields an empty separator when "=" is absent.
    code, separator, _ = x.partition("=")
    return code.strip() if separator else x

# Replace the raw labels with their short codes and store them as a categorical.
category_codes = df[CATEGORY_COL].apply(clean_category)
df[CATEGORY_COL] = category_codes.astype("category")
print("Unique cleaned categories:", df[CATEGORY_COL].unique())

Unique cleaned categories: ['0', '0s', '1', '2', '3']
Categories (5, object): ['0', '0s', '1', '2', '3']


In [30]:
# Store the Sex column with pandas' category dtype.
sex_categorical = df["Sex"].astype("category")
df["Sex"] = sex_categorical

In [31]:
# Exclude the suspected blood donors (label "0s") and remove that label from
# the categorical's category list.
is_suspect_donor = df[CATEGORY_COL] == "0s"
df = df.loc[~is_suspect_donor].copy()
df[CATEGORY_COL] = df[CATEGORY_COL].cat.remove_unused_categories()
print("Unique categories after dropping '0s':", df[CATEGORY_COL].unique())

# Keep only rows in which every column has a value.
df_clean = df.dropna(axis=0, how="any").copy()
print("After dropping rows with any missing values:", df_clean.shape)

# Drop exact duplicate rows and report how many were discarded.
before = len(df_clean)
df_clean = df_clean.drop_duplicates(keep="first")
after = len(df_clean)
print(f"Duplicates removed: {before - after}")

Unique categories after dropping '0s': ['0', '1', '2', '3']
Categories (4, object): ['0', '1', '2', '3']
After dropping rows with any missing values: (582, 13)
Duplicates removed: 0


In [23]:
# Split the cleaned frame's columns by dtype: numeric vs. object/category.
numeric_frame = df_clean.select_dtypes(include=[np.number])
categorical_frame = df_clean.select_dtypes(include=["object", "category"])
numeric_cols = list(numeric_frame.columns)
cat_cols = list(categorical_frame.columns)
print("Numeric columns:", numeric_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
Categorical columns: ['Category', 'Sex']


In [None]:
# Sanity check: summarise any missing values left in the cleaned frame,
# as a count and as a percentage of rows per column.
null_mask = df_clean.isnull()
missing_count = null_mask.sum()
missing_pct = null_mask.mean().mul(100).round(2)

missing_df = pd.DataFrame(
    {"missing_count": missing_count, "missing_pct": missing_pct}
)
# Columns with the highest share of missing values come first.
missing_df = missing_df.sort_values("missing_pct", ascending=False)

print("\n=== Missing values (cleaned) ===")
print(missing_df)



=== Missing values (cleaned) ===
          missing_count  missing_pct
Category              0          0.0
Age                   0          0.0
Sex                   0          0.0
ALB                   0          0.0
ALP                   0          0.0
ALT                   0          0.0
AST                   0          0.0
BIL                   0          0.0
CHE                   0          0.0
CHOL                  0          0.0
CREA                  0          0.0
GGT                   0          0.0
PROT                  0          0.0


In [None]:
# Write the cleaned frame to a new CSV file, omitting the row index.
CLEAN_DATA_PATH = "hcv_cleaned.csv"
df_clean.to_csv(CLEAN_DATA_PATH, index=False)