In [None]:
#p2
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
# Raw string literal: "\C", "\D" and "\p" are not valid escape sequences, so a
# normal string literal triggers an invalid-escape warning on current Python.
# The resulting path value is unchanged.
file_path = r"N:\CS2225 DS\Datasets\p2.csv"
try:
    df = pd.read_csv(file_path, encoding='latin1')
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found. Please check the file path.")
    # Re-raise instead of calling exit(): the interactive exit helper is not a
    # reliable way to abort a notebook cell, and later steps need `df`.
    raise
except pd.errors.EmptyDataError:
    print(f"Error: The file '{file_path}' is empty or invalid.")
    raise
except Exception as e:
    print(f"Error reading CSV file: {e}")
    raise
else:
    # Preview only after a successful read.
    print("Original Data:\n", df.head())


# Step 2: Check if required columns exist
# The later steps rename and transform these three columns, so stop here with a
# clear error if any of them is absent from the loaded frame.
required_columns = ['ORDERDATE', 'TERRITORY', 'SALES']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Error: Required columns {required_columns} not found. Available columns: {list(df.columns)}")
    # Raise rather than exit(): the interactive exit helper is not a reliable
    # way to abort a notebook cell, and the remaining steps would then fail on
    # the missing columns anyway.
    raise KeyError(f"Missing required columns: {missing_columns}")


# Step 3: Rename columns to match expected names
# Source column -> name used by the wrangling steps below.
column_aliases = {
    'ORDERDATE': 'order_date',
    'TERRITORY': 'region',
    'SALES': 'order_amount',
}
df = df.rename(columns=column_aliases)


# Step 4: Data Wrangling
# Handle missing values

# Fill BEFORE casting to str: astype(str) turns NaN into the literal string
# 'nan', after which fillna() has nothing left to fill.
df['region'] = df['region'].fillna('Unknown').astype(str)

# Coerce non-numeric amounts to NaN, then impute them with the column mean.
try:
    df['order_amount'] = pd.to_numeric(df['order_amount'], errors='coerce')
    df['order_amount'] = df['order_amount'].fillna(df['order_amount'].mean())
except Exception as e:
    print(f"Error handling 'order_amount' column: {e}")
    # Re-raise instead of exit(): the interactive exit helper is not a reliable
    # way to abort a notebook cell.
    raise

# Standardize date format
# The raw values look like '2/24/2003 0:00', so forcing format='%Y-%m-%d' with
# errors='coerce' silently turned every real date into NaT. Let pandas infer
# the format, and only afterwards replace missing/unparseable dates with the
# default date (filling with a string first would mix two formats).
try:
    df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
    df['order_date'] = df['order_date'].fillna(pd.to_datetime('2023-01-01'))
except Exception as e:
    print(f"Error converting 'order_date' to datetime: {e}")
    raise

# Standardize categorical variables (region to title case)
df['region'] = df['region'].str.title()

# Encode categorical variable (region) to numerical
# Code 0 is reserved for 'Unknown'; every other region is numbered from 1 in
# order of first appearance. Enumerating all regions from 0 and then forcing
# 'Unknown' to 0 made 'Unknown' share a code with the first real region.
known_regions = [region for region in df['region'].unique() if region != 'Unknown']
region_mapping = {'Unknown': 0}
region_mapping.update({region: idx for idx, region in enumerate(known_regions, start=1)})
df['region_encoded'] = df['region'].map(region_mapping)
# Normalize numerical data (order_amount)
# Rescales order_amount to the [0, 1] range. Missing amounts were already
# imputed with the column mean in the missing-value step, so no second fillna
# is needed here.
try:
    scaler = MinMaxScaler()
    # fit_transform returns an (n, 1) array; take its single column so a flat
    # array is assigned to the new DataFrame column.
    df['order_amount_normalized'] = scaler.fit_transform(df[['order_amount']])[:, 0]
except Exception as e:
    print(f"Error normalizing 'order_amount': {e}")
    # Re-raise instead of exit(): the interactive exit helper is not a reliable
    # way to abort a notebook cell.
    raise


# Step 5: Display transformed data
# Each entry is (heading, value) and is printed in this order.
report_sections = (
    ("\nTransformed Data:\n", df.head()),
    ("\nMissing Values:\n", df.isnull().sum()),
    ("\nRegion Mapping:", region_mapping),
)
for heading, content in report_sections:
    print(heading, content)


# Step 6: Save transformed data
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    df.to_csv("transformed_sales_data.csv", index=False)
except Exception as save_error:
    print(f"Error saving transformed data: {save_error}")
else:
    print("\nTransformed data saved to 'transformed_sales_data.csv'")


Original Data:
    ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
0        10107               30      95.70                2  2871.00   
1        10121               34      81.35                5  2765.90   
2        10134               41      94.74                2  3884.34   
3        10145               45      83.26                6  3746.70   
4        10159               49     100.00               14  5205.27   

         ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
0   2/24/2003 0:00  Shipped       1         2     2003  ...   
1    5/7/2003 0:00  Shipped       2         5     2003  ...   
2    7/1/2003 0:00  Shipped       3         7     2003  ...   
3   8/25/2003 0:00  Shipped       3         8     2003  ...   
4  10/10/2003 0:00  Shipped       4        10     2003  ...   

                    ADDRESSLINE1  ADDRESSLINE2           CITY STATE  \
0        897 Long Airport Avenue           NaN            NYC    NY   
1             59 rue de l'Abba

In [4]:
#ex2
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Raw string literal: "\C", "\D" and "\p" are not valid escape sequences, so a
# normal string literal triggers an invalid-escape warning on current Python.
# The resulting path value is unchanged.
file_path = r"N:\CS2225 DS\Datasets\p2.csv"

# Try each candidate encoding in turn; `df` stays None if none of them works.
# Note: latin1 maps every possible byte, so it cannot fail with a decode error;
# cp1252 is effectively only a placeholder after it.
df = None
for enc in ["utf-8", "latin1", "cp1252"]:
    try:
        df = pd.read_csv(file_path, encoding=enc)
    except UnicodeDecodeError as e:
        # Wrong encoding for this file: move on to the next candidate.
        print(f"Failed with encoding {enc}: {e}")
        continue
    except Exception as e:
        # Not an encoding problem (missing file, empty file, parse error, ...):
        # another encoding cannot fix it, so report once and stop retrying.
        print(f"Error reading CSV file: {e}")
        break
    print(f"File read successfully with encoding: {enc}")
    print("Original Data:\n", df.head())
    break

if df is not None:
    # NOTE(review): the preview printed for p2.csv shows sales columns
    # (ORDERNUMBER, SALES, ORDERDATE, ...). If 'delivery_date' and
    # 'destination' are absent, both branches below are skipped and the data is
    # saved unchanged - confirm this is the intended dataset.

    # Normalise delivery dates to ISO 'YYYY-MM-DD' strings; values that cannot
    # be parsed become missing.
    if 'delivery_date' in df.columns:
        parsed_dates = pd.to_datetime(df['delivery_date'], errors='coerce', dayfirst=False)
        df['delivery_date'] = parsed_dates.dt.strftime('%Y-%m-%d')

    if 'destination' in df.columns:
        # Fill before casting: astype(str) would turn NaN into the literal
        # 'nan', which title-casing then turns into a bogus 'Nan' destination.
        df['destination'] = df['destination'].fillna('Unknown').astype(str).str.title()

        # Label-encode the cleaned destination names.
        le = LabelEncoder()
        df['destination_encoded'] = le.fit_transform(df['destination'])

        # classes_ is sorted and each label's code is its index in it, so the
        # mapping can be built directly with plain str keys and int values.
        destination_mapping = {str(label): code for code, label in enumerate(le.classes_)}
        print("\nDestination Mapping:", destination_mapping)

    output_file = "cleaned_shipments.csv"
    df.to_csv(output_file, index=False)
    print(f"\nTransformed data saved to '{output_file}'")

    print("\nTransformed Data (first 5 rows):\n", df.head())

else:
    print("Could not read file with any tested encoding.")


Failed with encoding utf-8: 'utf-8' codec can't decode byte 0x84 in position 8: invalid start byte
File read successfully with encoding: latin1
Original Data:
    ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
0        10107               30      95.70                2  2871.00   
1        10121               34      81.35                5  2765.90   
2        10134               41      94.74                2  3884.34   
3        10145               45      83.26                6  3746.70   
4        10159               49     100.00               14  5205.27   

         ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
0   2/24/2003 0:00  Shipped       1         2     2003  ...   
1    5/7/2003 0:00  Shipped       2         5     2003  ...   
2    7/1/2003 0:00  Shipped       3         7     2003  ...   
3   8/25/2003 0:00  Shipped       3         8     2003  ...   
4  10/10/2003 0:00  Shipped       4        10     2003  ...   

                    ADDRESSL