In [2]:
#p1
import pandas as pd
# Step 1: Collect data from a CSV file
# Raw string: the backslashes in this Windows path must be taken literally
# rather than parsed as escape sequences. The resulting value is unchanged.
file_path = r"N:\CS2225 DS\Datasets\p1.csv"
try:
    df = pd.read_csv(file_path)
    print("Original Data:\n", df.head())
# Each failure prints a message and then stops with status 1.
# SystemExit is raised explicitly instead of calling exit(): exit is a helper
# intended for interactive sessions, and raising makes the stop unambiguous.
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found. Please check the file path.")
    raise SystemExit(1) from None
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or invalid.")
    raise SystemExit(1) from None
except Exception as e:
    print(f"Error reading CSV file: {e}")
    raise SystemExit(1) from None


# Step 2: Inspect data for issues
# Column dtypes and non-null counts (df.info() prints its own report).
print("\nData Info:\n")
df.info()

# Number of missing cells in each column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)

# Number of rows that exactly repeat an earlier row.
duplicate_row_count = df.duplicated().sum()
print("\nDuplicate Rows:", duplicate_row_count)


# Step 3: Clean data
# Remove duplicates
df = df.drop_duplicates()
# Rename columns to match expected names
df = df.rename(columns={'Review Text': 'review_text', 'Rating': 'rating'})
# Check if required columns exist
required_columns = ['rating', 'review_text']
if not all(col in df.columns for col in required_columns):
    print(f"Error: Required columns {required_columns} not found. Available columns: {list(df.columns)}")
    # Raise SystemExit explicitly instead of calling the interactive exit() helper.
    raise SystemExit(1)
# Handle missing values in 'rating' (numeric) with median.
# Non-numeric entries are coerced to NaN first, so they are filled as well.
try:
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df['rating'] = df['rating'].fillna(df['rating'].median())
except Exception as e:
    print(f"Error handling 'rating' column: {e}")
    raise SystemExit(1) from None
# Handle missing values in 'review_text' (text) with placeholder.
# Fill BEFORE casting to str: casting first can turn NaN into the literal text
# 'nan', after which fillna has nothing left to replace.
df['review_text'] = df['review_text'].fillna("No review provided").astype(str)
# Standardize text: convert 'review_text' to lowercase (the placeholder too)
df['review_text'] = df['review_text'].str.lower()
# Remove invalid ratings (e.g., outside 1-5)
df = df[df['rating'].between(1, 5, inclusive='both')]


# Step 4: Verify cleaned data
# Show the first rows, then the remaining missing-value and duplicate counts.
cleaned_preview = df.head()
print("\nCleaned Data:\n", cleaned_preview)

remaining_missing = df.isnull().sum()
print("\nMissing Values After Cleaning:\n", remaining_missing)

remaining_duplicates = df.duplicated().sum()
print("\nDuplicate Rows After Cleaning:", remaining_duplicates)


# Save cleaned data
# Best effort: a failure to write is reported but does not stop execution.
try:
    df.to_csv("cleaned_reviews.csv", index=False)
    print("\nCleaned data saved to 'cleaned_reviews.csv'")
except Exception as save_error:
    print(f"Error saving cleaned data: {save_error}")


Original Data:
    Unnamed: 0  Clothing ID  Age                    Title  \
0           0          767   33                      NaN   
1           1         1080   34                      NaN   
2           2         1077   60  Some major design flaws   
3           3         1049   50         My favorite buy!   
4           4          847   47         Flattering shirt   

                                         Review Text  Rating  Recommended IND  \
0  Absolutely wonderful - silky and sexy and comf...       4                1   
1  Love this dress!  it's sooo pretty.  i happene...       5                1   
2  I had such high hopes for this dress and reall...       3                0   
3  I love, love, love this jumpsuit. it's fun, fl...       5                1   
4  This shirt is very flattering to all due to th...       5                1   

   Positive Feedback Count   Division Name Department Name Class Name  
0                        0       Initmates        Intimate  Inti

In [None]:
#ex1
import pandas as pd

# Raw string: the backslashes in this Windows path must be taken literally
# rather than parsed as escape sequences. The resulting value is unchanged.
file_path = r"N:\CS2225 DS\Datasets\p1hc.csv"
try:
    df = pd.read_csv(file_path)
    print("Original Data:\n", df.head())
# Each failure prints a message and then stops with status 1.
# SystemExit is raised explicitly instead of calling exit(): exit is a helper
# intended for interactive sessions, and raising makes the stop unambiguous.
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found. Please check the file path.")
    raise SystemExit(1) from None
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or invalid.")
    raise SystemExit(1) from None
except Exception as e:
    print(f"Error reading CSV file: {e}")
    raise SystemExit(1) from None

# Inspect the loaded frame: dtypes/non-null counts, then missing cells per column.
print("\nData Info:\n")
df.info()
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)


print("\n--- Cleaning Data ---")

# Create a normalized name column to identify duplicates:
# trimmed, lowercased, and with runs of whitespace collapsed to one space.
# NOTE(review): astype(str) may turn a missing Name into the text 'nan', in
# which case all unnamed rows share one key and only the first survives the
# de-duplication below. Confirm that is acceptable for this dataset.
df["Name_clean"] = (
    df["Name"].astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

# Remove duplicates based on Name_clean (the first occurrence is kept)
duplicates = df.duplicated(subset=["Name_clean"]).sum()
print(f"\nDuplicate Rows before cleaning (by Name_clean): {duplicates}")
df = df.drop_duplicates(subset=["Name_clean"], keep="first")

# Handle missing Age: convert to numeric and fill with mean.
# Non-numeric entries are coerced to NaN first, so they are filled as well.
try:
    df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
    mean_age = df["Age"].mean()
    df["Age"] = df["Age"].fillna(mean_age)
except Exception as e:
    print(f"Error handling 'Age' column: {e}")
    # Raise SystemExit explicitly instead of calling the interactive exit() helper.
    raise SystemExit(1) from None

# Handle missing Gender with "Unknown"
df["Gender"] = df["Gender"].fillna("Unknown")

# Standardize gender entries
def clean_gender(val):
    """Map a raw gender entry to "Male", "Female" or "Unknown".

    String values are compared after stripping surrounding whitespace and
    lowercasing: "m"/"male" give "Male" and "f"/"female" give "Female".
    Any other string, and any non-string value, gives "Unknown".
    """
    # Non-strings (e.g. NaN or numbers) cannot be normalized.
    if not isinstance(val, str):
        return "Unknown"

    canonical_labels = {
        "m": "Male",
        "male": "Male",
        "f": "Female",
        "female": "Female",
    }
    normalized = val.strip().lower()
    # Unrecognized text, including "unknown" itself, falls back to "Unknown".
    return canonical_labels.get(normalized, "Unknown")

# Replace every Gender entry with its standardized label.
standardized_gender = df["Gender"].apply(clean_gender)
df["Gender"] = standardized_gender

# Verify the result: preview, remaining missing cells, remaining duplicate names.
cleaned_preview = df.head()
print("\nCleaned Data:\n", cleaned_preview)

remaining_missing = df.isnull().sum()
print("\nMissing Values After Cleaning:\n", remaining_missing)

remaining_duplicates = df.duplicated(subset=["Name_clean"]).sum()
print("\nDuplicate Rows After Cleaning (by Name_clean):", remaining_duplicates)


# Best effort: a failure to write is reported but does not stop execution.
try:
    df.to_csv("cleaned_patients.csv", index=False)
    print("\nCleaned data saved to 'cleaned_patients.csv'")
except Exception as save_error:
    print(f"Error saving cleaned data: {save_error}")


Original Data:
             Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    