In [None]:
#p4
import pandas as pd
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

file_path = "Datasets/Program4_sales_data.csv" 
try:
    df = pd.read_csv(file_path, encoding='latin1')  
    logging.info("Original Sales Data extracted successfully")
    print("Original Data:\n", df.head())
except FileNotFoundError:
    logging.error(f"File '{file_path}' not found. Please check the file path.")
    exit(1)
except pd.errors.EmptyDataError:
    logging.error(f"File '{file_path}' is empty or invalid.")
    exit(1)
except Exception as e:
    logging.error(f"Error reading CSV file: {e}")
    exit(1)

required_columns = ['amount', 'customer_name', 'product_name']
if not all(col in df.columns for col in required_columns):
    logging.error(f"Required columns {required_columns} not found. Available columns: {list(df.columns)}")
    exit(1)


print("\nData Info:\n")
df.info()
print("\nMissing Values:\n", df.isnull().sum())
print("\nDuplicate Rows:", df.duplicated().sum())


try:
    df = df.drop_duplicates()
    df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
    df['amount'] = df['amount'].fillna(df['amount'].median())
    df['customer_name'] = df['customer_name'].astype(str).fillna("Unknown Customer")
    df['product_name'] = df['product_name'].astype(str).str.strip().str.lower()
    df = df[df['amount'] > 0]  # Remove invalid transactions
    logging.info("Data cleaning completed successfully")
except Exception as e:
    logging.error(f"Data cleaning failed: {e}")
    exit(1)


print("\nCleaned Data:\n", df.head())
try:
    df.to_csv("cleaned_sales_data.csv", index=False)
    logging.info("Cleaned data saved to 'cleaned_sales_data.csv'")
except Exception as e:
    logging.error(f"Error saving cleaned data: {e}")
    exit(1)


2025-08-21 22:02:35,641 - INFO - Original Sales Data extracted successfully
2025-08-21 22:02:35,652 - ERROR - Required columns ['amount', 'customer_name', 'product_name'] not found. Available columns: ['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER', 'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE', 'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME', 'DEALSIZE']
2025-08-21 22:02:35,668 - ERROR - Data cleaning failed: 'amount'
2025-08-21 22:02:35,693 - INFO - Cleaned data saved to 'cleaned_sales_data.csv'


Original Data:
    ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
0        10107               30      95.70                2  2871.00   
1        10121               34      81.35                5  2765.90   
2        10134               41      94.74                2  3884.34   
3        10145               45      83.26                6  3746.70   
4        10159               49     100.00               14  5205.27   

         ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
0   2/24/2003 0:00  Shipped       1         2     2003  ...   
1    5/7/2003 0:00  Shipped       2         5     2003  ...   
2    7/1/2003 0:00  Shipped       3         7     2003  ...   
3   8/25/2003 0:00  Shipped       3         8     2003  ...   
4  10/10/2003 0:00  Shipped       4        10     2003  ...   

                    ADDRESSLINE1  ADDRESSLINE2           CITY STATE  \
0        897 Long Airport Avenue           NaN            NYC    NY   
1             59 rue de l'Abba

In [2]:
#ex4
import pandas as pd
import sqlite3
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

file_path = "N:\CS2225 DS\Datasets\Program4_sales_data.csv"

try:
    df = pd.read_csv(file_path, encoding='latin1')
    logging.info("Original Sales Data extracted successfully")
except FileNotFoundError:
    logging.error(f"File '{file_path}' not found. Please check the file path.")
    raise
except pd.errors.EmptyDataError:
    logging.error(f"File '{file_path}' is empty or invalid.")
    raise
except Exception as e:
    logging.error(f"Error reading CSV file: {e}")
    raise

#  Map required columns
df['amount'] = df['SALES']
df['customer_name'] = df['CUSTOMERNAME']
df['product_name'] = df['PRODUCTLINE']

# Initial Info
info_before = {
    "shape": df.shape,
    "missing": df.isnull().sum().to_dict(),
    "duplicates": int(df.duplicated().sum())
}
print("Before Cleaning:\n", info_before)

# Clean data
df = df.drop_duplicates()
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
df['amount'] = df['amount'].fillna(df['amount'].median())
df['customer_name'] = df['customer_name'].astype(str).str.strip().str.title()
df['customer_name'] = df['customer_name'].replace(r'^\s*$', "Unknown Customer", regex=True)
df['product_name'] = df['product_name'].astype(str).str.strip().str.lower()
df = df[df['amount'] > 0]

# Outlier removal
q1 = df['amount'].quantile(0.25)
q3 = df['amount'].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
df = df[(df['amount'] >= lower) & (df['amount'] <= upper)]

# Final Info
info_after = {
    "shape": df.shape,
    "missing": df.isnull().sum().to_dict(),
    "duplicates": int(df.duplicated().sum()),
    "summary": df['amount'].describe().to_dict()
}
print("\nAfter Cleaning:\n", info_after)


db_path = "Datasets/sales_cleaned.db"
conn = sqlite3.connect(db_path)
df[['amount', 'customer_name', 'product_name']].to_sql("sales", conn, if_exists="replace", index=False)
conn.close()

logging.info(f"Cleaned data successfully saved to {db_path}")



2025-09-04 17:54:17,333 - INFO - Original Sales Data extracted successfully
2025-09-04 17:54:17,372 - INFO - Cleaned data successfully saved to Datasets/sales_cleaned.db


Before Cleaning:
 {'shape': (2823, 28), 'missing': {'ORDERNUMBER': 0, 'QUANTITYORDERED': 0, 'PRICEEACH': 0, 'ORDERLINENUMBER': 0, 'SALES': 0, 'ORDERDATE': 0, 'STATUS': 0, 'QTR_ID': 0, 'MONTH_ID': 0, 'YEAR_ID': 0, 'PRODUCTLINE': 0, 'MSRP': 0, 'PRODUCTCODE': 0, 'CUSTOMERNAME': 0, 'PHONE': 0, 'ADDRESSLINE1': 0, 'ADDRESSLINE2': 2521, 'CITY': 0, 'STATE': 1486, 'POSTALCODE': 76, 'COUNTRY': 0, 'TERRITORY': 1074, 'CONTACTLASTNAME': 0, 'CONTACTFIRSTNAME': 0, 'DEALSIZE': 0, 'amount': 0, 'customer_name': 0, 'product_name': 0}, 'duplicates': 0}

After Cleaning:
 {'shape': (2742, 28), 'missing': {'ORDERNUMBER': 0, 'QUANTITYORDERED': 0, 'PRICEEACH': 0, 'ORDERLINENUMBER': 0, 'SALES': 0, 'ORDERDATE': 0, 'STATUS': 0, 'QTR_ID': 0, 'MONTH_ID': 0, 'YEAR_ID': 0, 'PRODUCTLINE': 0, 'MSRP': 0, 'PRODUCTCODE': 0, 'CUSTOMERNAME': 0, 'PHONE': 0, 'ADDRESSLINE1': 0, 'ADDRESSLINE2': 2448, 'CITY': 0, 'STATE': 1443, 'POSTALCODE': 72, 'COUNTRY': 0, 'TERRITORY': 1043, 'CONTACTLASTNAME': 0, 'CONTACTFIRSTNAME': 0, 'DEALSI