In [9]:
from calendar import month
import pandas as pd
import numpy as np
from scipy.stats import zscore, ks_2samp, ttest_ind

import re

# Set seed for reproducibility
np.random.seed(42)

# ============================
# 1. Missing Data
# ============================

# Task 1: Identify missing customer emails
data_missing_email = {
    'CustomerID': [1, 2, 3, 4, 5],
    'Email': ['a@domain.com', None, 'c@domain.com', None, 'e@domain.com']
}
df_missing_email = pd.DataFrame(data_missing_email)
missing_emails = df_missing_email['Email'].isnull().sum()
print(f"Task 1 - Missing emails: {missing_emails}\n")

# Task 2: Examine missing transaction dates
data_missing_date = {
    'TransactionID': [101, 102, 103, 104, 105],
    'TransactionDate': ['2025-01-01', None, '2025-01-03', None, '2025-01-05']
}
df_sales = pd.DataFrame(data_missing_date)
missing_dates_percentage = df_sales['TransactionDate'].isnull().mean() * 100
print(f"Task 2 - Percentage of missing transaction dates: {missing_dates_percentage:.2f}%\n")

# Task 3: Identify missing department information
data_missing_department = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Department': ['HR', None, 'IT', None, 'Sales']
}
df_employees = pd.DataFrame(data_missing_department)
missing_departments = df_employees['Department'].isnull().sum()
print(f"Task 3 - Missing departments: {missing_departments}\n")

# ============================
# 2. Duplicate Data
# ============================

# Task 1: Identify duplicate customer records
data_duplicates = {
    'CustomerID': [1, 2, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David']
}
df_customers = pd.DataFrame(data_duplicates)
duplicate_records = df_customers[df_customers.duplicated()]
print(f"Task 1 - Duplicate records:\n{duplicate_records}\n")

# Task 2: Review phone numbers with varying formats
data_phone_numbers = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'PhoneNumber': ['123-456-7890', '(123) 456-7890', '123.456.7890']
}
df_contacts = pd.DataFrame(data_phone_numbers)
df_contacts['PhoneNumber'] = df_contacts['PhoneNumber'].replace(
    regex=r'(\d{3})[-.\)]*(\d{3})[-.\)]*(\d{4})', 
    value=r'(\1)-\2-\3'
)
print(f"Task 2 - Standardized Phone Numbers:\n{df_contacts}\n")

# Task 3: Review state abbreviations for discrepancies
data_states = {
    'Address': ['123 Main St', '456 Oak St', '789 Pine St'],
    'State': ['CA', 'Calif.', 'CA']
}
df_addresses = pd.DataFrame(data_states)
state_mapping = {
    'Calif.': 'CA'
}
df_addresses['State'] = df_addresses['State'].replace(state_mapping)
print(f"Task 3 - Standardized State Abbreviations:\n{df_addresses}\n")

# ============================
# 3. Inconsistent Data Formats
# ============================

# Task 1: Identify inconsistencies in date formats
data_dates = {
    'EventID': [1, 2, 3],
    'EventDate': ['2025-01-01', '01/02/2025', '03/04/2025']
}
df_dates = pd.DataFrame(data_dates)
df_dates['EventDate'] = pd.to_datetime(df_dates['EventDate'], errors='coerce')
print(f"Task 1 - Standardized Date Formats:\n{df_dates}\n")

# ============================
# 4. Data Drift
# ============================

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
revenue = [1000, 1100, 1050, 1200, 1250, 1300]  # Original revenue data
df_monthly_revenue = pd.DataFrame({'Month': months, 'Revenue': revenue})

# Generate second dataset with drift (revenue spike in June)
revenue_with_drift = [1000, 1100, 1050, 1200, 1250, 1500]  # Spike in June
df_monthly_revenue_drift = pd.DataFrame({'Month': months, 'Revenue': revenue_with_drift})

# Perform T-test to detect if there's a significant difference (data drift)
t_stat, p_value = ttest_ind(revenue, revenue_with_drift)
print(f"Task 1 - T-Test for Revenue Drift - Statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

# Task 2: Analyze user engagement metrics over different quarters
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
engagement = [5000, 6000, 5500, 7000]  # User engagement data
df_user_engagement = pd.DataFrame({'Quarter': quarters, 'Engagement': engagement})

# Introduce drift (increased engagement in Q4)
engagement_with_drift = [5000, 6000, 5500, 9000]  # Drift in Q4
t_stat_engagement, p_value_engagement = ttest_ind(engagement, engagement_with_drift)
print(f"Task 2 - T-Test for Engagement Drift - Statistic: {t_stat_engagement:.4f}, p-value: {p_value_engagement:.4f}")

# Task 3: Review stock prices for anomalies
months_stock = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']  # 12 months
stock_prices = [100, 105, 110, 115, 120, 125, 130, 125, 135, 140, 145, 150]  # Original stock prices
df_stock_prices = pd.DataFrame({'Month': months_stock, 'StockPrice': stock_prices})

# Introduce anomalies (outlier in June)
stock_prices_with_anomalies = [100, 105, 110, 115, 120, 1000, 130, 125, 135, 140, 145, 150]  # Anomaly in June
t_stat_stock, p_value_stock = ttest_ind(stock_prices, stock_prices_with_anomalies)
print(f"Task 3 - T-Test for Stock Price Anomalies - Statistic: {t_stat_stock:.4f}, p-value: {p_value_stock:.4f}")


# Task 1: Compare monthly revenues over six months




# Generate second dataset wmonths = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']  # 12 months
 # Correct length, 12 values

# Now both lists have the same length


# Introduce anomalies in stock prices (adding a spike in June)
  # Anomaly at June

# Perform KS test for anomaly detection (comparing original vs anomalous data)


# Output the results

 # Increased revenue in June


# Perform KS test for drift



# Task 2: Analyze user engagement metrics over different quarters

 # Example user engagement


# Generate second dataset with drift
 # Drift in Q4



# Task 3: Review stock prices for anomalies
 # Example stock prices


# Introduce anomalies in stock prices




Task 1 - Missing emails: 2

Task 2 - Percentage of missing transaction dates: 40.00%

Task 3 - Missing departments: 2

Task 1 - Duplicate records:
   CustomerID Name
2           2  Bob

Task 2 - Standardized Phone Numbers:
      Name     PhoneNumber
0    Alice  (123)-456-7890
1      Bob  (123) 456-7890
2  Charlie  (123)-456-7890

Task 3 - Standardized State Abbreviations:
       Address State
0  123 Main St    CA
1   456 Oak St    CA
2  789 Pine St    CA

Task 1 - Standardized Date Formats:
   EventID  EventDate
0        1 2025-01-01
1        2        NaT
2        3        NaT

Task 1 - T-Test for Revenue Drift - Statistic: -0.3780, p-value: 0.7134
Task 2 - T-Test for Engagement Drift - Statistic: -0.5026, p-value: 0.6331
Task 3 - T-Test for Stock Price Anomalies - Statistic: -0.9961, p-value: 0.3300
