## Data Exploration

In this notebook, we use Python to check whether the data has quality issues and whether any fields are challenging to understand.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def set_csv(input_type):
    """Return the relative path of the raw CSV file for the given dataset name.

    Parameters
    ----------
    input_type : str
        Name of the dataset: "users", "transactions" or "products".

    Returns
    -------
    str
        Relative path of the matching CSV file.

    Raises
    ------
    ValueError
        If ``input_type`` is not one of the recognised dataset names.
    """
    valid_types = {
        "users": "../raw-data/USER_TAKEHOME.csv",
        "transactions": "../raw-data/TRANSACTION_TAKEHOME.csv",
        "products": "../raw-data/PRODUCTS_TAKEHOME.csv",
    }

    # Every mapped value is a non-empty string, so None can only mean "unknown name".
    csv_path = valid_types.get(input_type)
    if csv_path is None:
        raise ValueError(f"Invalid type '{input_type}'. Expected one of {list(valid_types.keys())}")

    return csv_path

In [None]:
# DO THIS: Change csv_type input to either "users", "transactions", or "products"
csv_type = "transactions"

# Deliberately no try/except here: if csv_type is invalid (ValueError from set_csv)
# or the file cannot be read, the error must stop the notebook at this cell.
# Swallowing it would leave `df` undefined -- or, on a re-run, still holding the
# previously loaded file -- so later cells would fail or analyse the wrong data.
df = pd.read_csv(set_csv(csv_type))

# confirm the kind of data we're working with
print(f"Successfully read {csv_type.upper()} CSV file. Continuing with analysis...")

In [None]:
# Display the data type pandas inferred for each column, as loaded from the CSV
# (to be compared to what we know the types should be).
# Left as the cell's last expression, like the later cells, so the notebook renders it.
df.dtypes

In [None]:
# Display the first five rows of the dataframe.
# Left as the cell's last expression so the notebook renders it as a table;
# print() would fall back to a plain-text dump that can wrap or truncate columns.
df.head()

In [None]:
# Bring the columns to the data types they are supposed to have.
# Every step is guarded by the column's presence, so this cell works with any of the files.

# Columns with 'DATE' in their name (case-insensitive) become timezone-aware UTC datetimes;
# values that cannot be parsed are coerced to NaT.
date_columns = [name for name in df.columns if 'DATE' in name.upper()]
df[date_columns] = df[date_columns].apply(pd.to_datetime, errors='coerce', utc=True)

# 'BARCODE' becomes a nullable integer: unparseable values are coerced to NaN first,
# and the Int64 dtype keeps them as missing values.
if 'BARCODE' in df.columns:
    df['BARCODE'] = pd.to_numeric(df['BARCODE'], errors='coerce').astype('Int64')

# 'FINAL_QUANTITY' and 'FINAL_SALE' become numeric; unparseable entries are coerced to NaN.
for numeric_col in ('FINAL_QUANTITY', 'FINAL_SALE'):
    if numeric_col in df.columns:
        df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [None]:
# Show a summary of the dataframe: each column's non-null count and dtype, plus memory usage.
# Run after the conversions above, so the dtypes listed here are the converted ones.
df.info()

In [None]:
# Show floating-point numbers with two decimal places.
# The display option is only changed when the dataframe has at least one float column.
if len(df.select_dtypes(include=['float64', 'float32']).columns) > 0:
    pd.set_option('display.float_format', lambda value: '%.2f' % value)

In [None]:
# Display the first 20 rows of the dataframe.
# This cell comes after the dtype conversions and the float display setting,
# so the preview reflects both.
df.head(20)

In [None]:
# Count the missing values (NaN / NaT / <NA>) in each column
df.isna().sum()

In [None]:
# Count the distinct values in each column
# (nunique() leaves missing values out of the count by default)
df.nunique()

In [None]:
# Plot the pairwise correlations between all numeric columns as an annotated heatmap
corr_matrix = df.corr(numeric_only=True)

if corr_matrix.empty:
    # A file with no numeric columns yields an empty correlation matrix, which
    # sns.heatmap cannot draw; report that instead of raising.
    print("No numeric columns found; skipping the correlation heatmap.")
else:
    fig, ax = plt.subplots()
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", ax=ax)
    ax.set_title("Correlation between numeric columns")
    plt.show()