In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/Users/akshay/Desktop/dsbda_practical/newdata/hepatitis_csv.csv", header=None)

# Assign column names based on dataset info
df.columns = [
    "Class", "AGE", "SEX", "STEROID", "ANTIVIRALS", "FATIGUE", "MALAISE",
    "ANOREXIA", "LIVER_BIG", "LIVER_FIRM", "SPLEEN_PALPABLE", "SPIDERS",
    "ASCITES", "VARICES", "BILIRUBIN", "ALK_PHOSPHATE", "SGOT", "ALBUMIN",
    "PROTIME", "HISTOLOGY"
]

# Replace missing values indicated by "?" with NaN
df.replace("?", pd.NA, inplace=True)

# Convert columns to appropriate data types
df = df.apply(pd.to_numeric, errors='ignore')

# a. Create data subsets for different sex
male_df = df[df['SEX'] == 1]
female_df = df[df['SEX'] == 2]

# b. Merge two subsets (concatenate back)
merged_df = pd.concat([male_df, female_df])

# c. Sort data by AGE, SGOT, PROTIME
sorted_df = df.sort_values(by=["AGE", "SGOT", "PROTIME"], ascending=[True, True, True])

# d. Transposing data
transposed_df = df.transpose()

# e. Melting Data (long format)
melted_df = pd.melt(df, id_vars=["Class", "AGE", "SEX"], 
                    value_vars=["SGOT", "PROTIME", "ALBUMIN"],
                    var_name="Measurement", value_name="Value")

# f. Casting data back to wide format
wide_df = melted_df.pivot_table(index=["Class", "AGE", "SEX"], 
                                columns="Measurement", values="Value", aggfunc='first').reset_index()

# Output previews
print("Subset - Male:\n", male_df.head())
print("\nMerged Subset:\n", merged_df.head())
print("\nSorted Data:\n", sorted_df.head())
print("\nTransposed Data:\n", transposed_df.head())
print("\nMelted Data:\n", melted_df.head())
print("\nWide Format (Pivoted Data):\n", wide_df.head())

Subset - Male:
 Empty DataFrame
Columns: [Class, AGE, SEX, STEROID, ANTIVIRALS, FATIGUE, MALAISE, ANOREXIA, LIVER_BIG, LIVER_FIRM, SPLEEN_PALPABLE, SPIDERS, ASCITES, VARICES, BILIRUBIN, ALK_PHOSPHATE, SGOT, ALBUMIN, PROTIME, HISTOLOGY]
Index: []

Merged Subset:
 Empty DataFrame
Columns: [Class, AGE, SEX, STEROID, ANTIVIRALS, FATIGUE, MALAISE, ANOREXIA, LIVER_BIG, LIVER_FIRM, SPLEEN_PALPABLE, SPIDERS, ASCITES, VARICES, BILIRUBIN, ALK_PHOSPHATE, SGOT, ALBUMIN, PROTIME, HISTOLOGY]
Index: []

Sorted Data:
     Class     AGE    SEX STEROID ANTIVIRALS FATIGUE MALAISE ANOREXIA  \
105    47  female   True   False       True    True   False     True   
132    31  female  False   False       True    True    True     True   
128    50  female   True   False       True   False   False     True   
145    45  female   True   False       True    True    True     True   
68     57  female   True   False       True    True    True     True   

    LIVER_BIG LIVER_FIRM SPLEEN_PALPABLE SPIDERS ASCITES VA

  df = df.apply(pd.to_numeric, errors='ignore')
