In [4]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
df = pd.read_csv('AWCustomers.csv')
print(f"Dataset shape: {df.shape}")
print(f"Dataset columns: {list(df.columns)}")
print("\nFirst 5 rows:")
df.head()


Dataset shape: (18361, 24)
Dataset columns: ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

First 5 rows:


Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,LastUpdated
0,21173,,Chad,C,Yuan,,7090 C. Mount Hood,,Wollongong,New South Wales,...,Bachelors,Clerical,M,M,1,3,0,1,81916,2017-03-06
1,13249,,Ryan,,Perry,,3651 Willow Lake Rd,,Shawnee,British Columbia,...,Partial College,Clerical,M,M,1,2,1,2,81076,2017-03-06
2,29350,,Julia,,Thompson,,1774 Tice Valley Blvd.,,West Covina,California,...,Bachelors,Clerical,F,S,0,3,0,0,86387,2017-03-06
3,13503,,Theodore,,Gomez,,2103 Baldwin Dr,,Liverpool,England,...,Partial College,Skilled Manual,M,M,1,2,1,2,61481,2017-03-06
4,22803,,Marshall,J,Shan,,Am Gallberg 234,,Werne,Nordrhein-Westfalen,...,Partial College,Skilled Manual,M,S,1,1,0,0,51804,2017-03-06


In [6]:
# Part I(a): Examine the dataset and select relevant features for bike buying prediction

print("=== DATASET EXPLORATION ===")
print(f"Dataset Info:")
df.info()

print(f"\nDataset Description:")
df.describe()

print(f"\nMissing Values:")
print(df.isnull().sum())

print(f"\nUnique values in each column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

# CORRECTED: Select features that actually exist in the dataset
selected_features = [
    'CustomerID',           # Identifier
    'BirthDate',           # Age affects buying behavior
    'Education',           # Education level affects income and preferences
    'Occupation',          # Job type affects income and lifestyle
    'Gender',              # Gender preferences for bikes
    'MaritalStatus',       # Family status affects bike needs
    'HomeOwnerFlag',       # Home ownership indicates financial stability
    'NumberCarsOwned',     # Transportation preferences
    'NumberChildrenAtHome', # Family size affects bike needs
    'TotalChildren',       # Total family size
    'YearlyIncome',        # Primary factor for bike purchasing power
    'CountryRegionName',   # Geographic location affects preferences
    'StateProvinceName'    # State-level preferences
]

print(f"\n=== SELECTED FEATURES FOR BIKE BUYING PREDICTION ===")
print(f"Selected {len(selected_features)} features:")
for i, feature in enumerate(selected_features, 1):
    print(f"{i}. {feature}")

print(f"\nRationale for feature selection:")
print("- Demographics (Age, Gender, Education, Occupation): Core customer profiling")
print("- Financial indicators (YearlyIncome, HomeOwnerFlag): Purchasing power")
print("- Family structure (MaritalStatus, Children): Bike usage needs")
print("- Transportation (NumberCarsOwned): Current transport ownership")
print("- Geographic (CountryRegion, State): Location-based preferences")

# Check if all selected features exist
missing_features = [feature for feature in selected_features if feature not in df.columns]
if missing_features:
    print(f"\n⚠️ Missing features: {missing_features}")
else:
    print(f"\n✅ All selected features are present in the dataset")


=== DATASET EXPLORATION ===
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18361 entries, 0 to 18360
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CustomerID            18361 non-null  int64 
 1   Title                 101 non-null    object
 2   FirstName             18361 non-null  object
 3   MiddleName            10572 non-null  object
 4   LastName              18361 non-null  object
 5   Suffix                3 non-null      object
 6   AddressLine1          18361 non-null  object
 7   AddressLine2          311 non-null    object
 8   City                  18361 non-null  object
 9   StateProvinceName     18361 non-null  object
 10  CountryRegionName     18361 non-null  object
 11  PostalCode            18361 non-null  object
 12  PhoneNumber           18361 non-null  object
 13  BirthDate             18361 non-null  object
 14  Education             18361 non-null  object

In [7]:
# Part I(b): Create new DataFrame with selected attributes only

# Check if all selected features exist in the dataset
missing_features = [feature for feature in selected_features if feature not in df.columns]
if missing_features:
    print(f"Warning: Missing features in dataset: {missing_features}")
    selected_features = [feature for feature in selected_features if feature in df.columns]

# Create new DataFrame with selected features
df_selected = df[selected_features].copy()

print(f"=== NEW DATAFRAME WITH SELECTED FEATURES ===")
print(f"Shape of selected dataset: {df_selected.shape}")
print(f"Selected features: {list(df_selected.columns)}")

print(f"\nSample of selected data:")
df_selected.head(10)


=== NEW DATAFRAME WITH SELECTED FEATURES ===
Shape of selected dataset: (18361, 13)
Selected features: ['CustomerID', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'CountryRegionName', 'StateProvinceName']

Sample of selected data:


Unnamed: 0,CustomerID,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,CountryRegionName,StateProvinceName
0,21173,1987-11-13,Bachelors,Clerical,M,M,1,3,0,1,81916,Australia,New South Wales
1,13249,1972-07-21,Partial College,Clerical,M,M,1,2,1,2,81076,Canada,British Columbia
2,29350,1985-11-09,Bachelors,Clerical,F,S,0,3,0,0,86387,United States,California
3,13503,1977-10-18,Partial College,Skilled Manual,M,M,1,2,1,2,61481,United Kingdom,England
4,22803,1975-02-05,Partial College,Skilled Manual,M,S,1,1,0,0,51804,Germany,Nordrhein-Westfalen
5,22092,1975-02-23,High School,Skilled Manual,F,M,1,2,2,2,61944,United States,Washington
6,11229,1971-12-03,Partial College,Manual,M,S,1,1,0,0,34919,United States,California
7,24179,1997-03-07,Partial College,Skilled Manual,F,M,0,1,0,0,61832,Australia,New South Wales
8,12127,1976-12-14,Partial College,Clerical,F,M,1,0,0,2,83834,France,Nord
9,19903,1976-12-04,High School,Manual,M,S,0,1,0,0,26880,France,Moselle


In [8]:
# Part I(c): Determine data value types for each attribute

print("=== DATA TYPE ANALYSIS ===")

# Create a comprehensive data type analysis
data_type_analysis = []

for column in df_selected.columns:
    dtype = df_selected[column].dtype
    unique_values = df_selected[column].nunique()
    sample_values = df_selected[column].dropna().head(5).tolist()

    # Determine measurement scale
    if column == 'CustomerID':
        measurement_scale = "Nominal (Identifier)"
        data_category = "Discrete"
    elif column in ['BirthDate']:
        measurement_scale = "Interval"
        data_category = "Continuous"
    elif column in ['YearlyIncome']:
        measurement_scale = "Ratio"
        data_category = "Continuous"
    elif column in ['NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'HomeOwnerFlag']:
        measurement_scale = "Ratio"
        data_category = "Discrete"
    elif column in ['Education']:
        measurement_scale = "Ordinal"
        data_category = "Discrete"
    elif column in ['Gender', 'MaritalStatus', 'Occupation', 'CountryRegionName', 'StateProvinceName']:
        measurement_scale = "Nominal"
        data_category = "Discrete"
    else:
        measurement_scale = "To be determined"
        data_category = "To be determined"

    data_type_analysis.append({
        'Feature': column,
        'Data_Type': str(dtype),
        'Unique_Values': unique_values,
        'Data_Category': data_category,
        'Measurement_Scale': measurement_scale,
        'Sample_Values': str(sample_values)
    })

# Create DataFrame for analysis
type_analysis_df = pd.DataFrame(data_type_analysis)
print(type_analysis_df.to_string(index=False))

print(f"\n=== PREPROCESSING TASKS IDENTIFICATION ===")
print("Based on data types, required preprocessing tasks:")
print("1. Handle missing values in all attributes")
print("2. Convert BirthDate to Age (continuous → ratio)")
print("3. Normalize/Standardize continuous attributes (YearlyIncome, Age)")
print("4. Discretize continuous attributes if needed (binning)")
print("5. One-hot encode nominal categorical attributes")
print("6. Handle ordinal attributes (Education) with proper encoding")
print("⚠️ Note: CommuteDistance column not found in dataset - will use alternative for correlation analysis")


=== DATA TYPE ANALYSIS ===
             Feature Data_Type  Unique_Values Data_Category    Measurement_Scale                                                                           Sample_Values
          CustomerID     int64          18355      Discrete Nominal (Identifier)                                                     [21173, 13249, 29350, 13503, 22803]
           BirthDate    object           8230    Continuous             Interval                  ['1987-11-13', '1972-07-21', '1985-11-09', '1977-10-18', '1975-02-05']
           Education    object              5      Discrete              Ordinal     ['Bachelors', 'Partial College', 'Bachelors', 'Partial College', 'Partial College']
          Occupation    object              5      Discrete              Nominal                ['Clerical', 'Clerical', 'Clerical', 'Skilled Manual', 'Skilled Manual']
              Gender    object              2      Discrete              Nominal                                                

In [9]:
# Part II(a): Handling Null Values

print("=== HANDLING NULL VALUES ===")

print("Missing values before cleaning:")
missing_before = df_selected.isnull().sum()
print(missing_before[missing_before > 0])

# Strategy for handling missing values
df_processed = df_selected.copy()

# Handle missing values based on data type and distribution
for column in df_processed.columns:
    missing_count = df_processed[column].isnull().sum()
    if missing_count > 0:
        print(f"\nHandling {missing_count} missing values in {column}")

        if column in ['YearlyIncome']:
            # Use mean for income (ratio scale)
            mean_value = df_processed[column].mean()
            df_processed[column].fillna(mean_value, inplace=True)
            print(f"  → Filled with mean: ${mean_value:,.2f}")

        elif column in ['NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']:
            # Use median for count variables
            median_value = df_processed[column].median()
            df_processed[column].fillna(median_value, inplace=True)
            print(f"  → Filled with median: {median_value}")

        elif column in ['Education', 'Occupation', 'Gender', 'MaritalStatus']:
            # Use mode for categorical variables
            mode_value = df_processed[column].mode()[0]
            df_processed[column].fillna(mode_value, inplace=True)
            print(f"  → Filled with mode: {mode_value}")

        elif column == 'BirthDate':
            # Fill with median date
            median_date = df_processed[column].dropna().sort_values().iloc[len(df_processed[column].dropna())//2]
            df_processed[column].fillna(median_date, inplace=True)
            print(f"  → Filled with median date: {median_date}")

        else:
            # For other categorical variables, use mode
            if df_processed[column].mode().empty:
                df_processed[column].fillna('Unknown', inplace=True)
                print(f"  → Filled with 'Unknown'")
            else:
                mode_value = df_processed[column].mode()[0]
                df_processed[column].fillna(mode_value, inplace=True)
                print(f"  → Filled with mode: {mode_value}")

print(f"\nMissing values after cleaning:")
missing_after = df_processed.isnull().sum()
print(missing_after[missing_after > 0])

print(f"\nAll missing values handled successfully!")


=== HANDLING NULL VALUES ===
Missing values before cleaning:
Series([], dtype: int64)

Missing values after cleaning:
Series([], dtype: int64)

All missing values handled successfully!


In [10]:
# Part II(b): Calculate Age from BirthDate and perform initial transformations

print("=== AGE CALCULATION AND INITIAL TRANSFORMATIONS ===")

# Convert BirthDate to Age
if 'BirthDate' in df_processed.columns:
    print("Converting BirthDate to Age...")

    # Convert to datetime
    df_processed['BirthDate'] = pd.to_datetime(df_processed['BirthDate'], errors='coerce')

    # Calculate age
    current_date = pd.Timestamp.now()
    df_processed['Age'] = (current_date - df_processed['BirthDate']).dt.days // 365

    # Clean unrealistic ages
    df_processed['Age'] = df_processed['Age'].clip(0, 120)

    print(f"Age statistics:")
    print(df_processed['Age'].describe())

    # Drop BirthDate as we now have Age
    df_processed.drop('BirthDate', axis=1, inplace=True)
    print("BirthDate column dropped, Age column added")

print(f"\nCurrent dataset shape: {df_processed.shape}")
print(f"Current columns: {list(df_processed.columns)}")


=== AGE CALCULATION AND INITIAL TRANSFORMATIONS ===
Converting BirthDate to Age...
Age statistics:
count    18361.000000
mean        43.149883
std         11.269639
min         24.000000
25%         34.000000
50%         42.000000
75%         51.000000
max         95.000000
Name: Age, dtype: float64
BirthDate column dropped, Age column added

Current dataset shape: (18361, 13)
Current columns: ['CustomerID', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'CountryRegionName', 'StateProvinceName', 'Age']


In [11]:
# Part II(c): Discretization (Binning) on Continuous attributes

print("=== DISCRETIZATION (BINNING) ===")

# Identify continuous attributes for binning
continuous_attributes = ['Age', 'YearlyIncome']

# Create binned versions
df_binned = df_processed.copy()

for attr in continuous_attributes:
    if attr in df_binned.columns:
        print(f"\n--- Binning {attr} ---")

        if attr == 'Age':
            # Age groups: Young (18-30), Adult (31-50), Middle-aged (51-65), Senior (65+)
            bins = [0, 30, 50, 65, 120]
            labels = ['Young', 'Adult', 'Middle_Aged', 'Senior']
            df_binned[f'{attr}_binned'] = pd.cut(df_binned[attr], bins=bins, labels=labels, include_lowest=True)

        elif attr == 'YearlyIncome':
            # Income quintiles
            df_binned[f'{attr}_binned'] = pd.qcut(df_binned[attr], q=5, labels=['Low', 'Low_Mid', 'Middle', 'Mid_High', 'High'], duplicates='drop')

        print(f"{attr} binning completed:")
        print(df_binned[f'{attr}_binned'].value_counts().sort_index())

# Also bin other attributes with many categories
if 'CommuteDistance' in df_binned.columns:
    print(f"\n--- Mapping CommuteDistance ---")
    distance_mapping = {
        '0-1 Miles': 1, '1-2 Miles': 2, '2-5 Miles': 3,
        '5-10 Miles': 4, '10+ Miles': 5
    }
    df_binned['CommuteDistance_numeric'] = df_binned['CommuteDistance'].map(distance_mapping)
    print("CommuteDistance mapped to numeric values")

print(f"\nDataset shape after binning: {df_binned.shape}")


=== DISCRETIZATION (BINNING) ===

--- Binning Age ---
Age binning completed:
Age_binned
Young           2382
Adult          11383
Middle_Aged     3921
Senior           675
Name: count, dtype: int64

--- Binning YearlyIncome ---
YearlyIncome binning completed:
YearlyIncome_binned
Low         3673
Low_Mid     3672
Middle      3673
Mid_High    3671
High        3672
Name: count, dtype: int64

Dataset shape after binning: (18361, 15)


In [12]:
# Part II(d): Standardization/Normalization

print("=== STANDARDIZATION AND NORMALIZATION ===")

# Separate numeric and categorical columns (CORRECTED - removed CommuteDistance_numeric)
numeric_columns = ['Age', 'YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'HomeOwnerFlag']

# Filter to existing columns
numeric_columns = [col for col in numeric_columns if col in df_binned.columns]

print(f"Numeric columns for standardization: {numeric_columns}")

# Create standardized dataset
df_standardized = df_binned.copy()

# Min-Max Normalization (0-1 scale)
print("\n--- Min-Max Normalization ---")
scaler_minmax = MinMaxScaler()
df_standardized[numeric_columns] = scaler_minmax.fit_transform(df_standardized[numeric_columns])

print("Min-Max normalization completed:")
for col in numeric_columns:
    print(f"{col}: min={df_standardized[col].min():.3f}, max={df_standardized[col].max():.3f}")

# Z-Score Standardization (separate dataset for comparison)
print("\n--- Z-Score Standardization ---")
df_zscore = df_binned.copy()
scaler_standard = StandardScaler()
df_zscore[numeric_columns] = scaler_standard.fit_transform(df_zscore[numeric_columns])

print("Z-Score standardization completed:")
for col in numeric_columns:
    print(f"{col}: mean={df_zscore[col].mean():.3f}, std={df_zscore[col].std():.3f}")

# Keep the Min-Max normalized version for further processing
df_final_prep = df_standardized.copy()
print(f"\nUsing Min-Max normalized data for further processing")


=== STANDARDIZATION AND NORMALIZATION ===
Numeric columns for standardization: ['Age', 'YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'HomeOwnerFlag']

--- Min-Max Normalization ---
Min-Max normalization completed:
Age: min=0.000, max=1.000
YearlyIncome: min=0.000, max=1.000
NumberCarsOwned: min=0.000, max=1.000
NumberChildrenAtHome: min=0.000, max=1.000
TotalChildren: min=0.000, max=1.000
HomeOwnerFlag: min=0.000, max=1.000

--- Z-Score Standardization ---
Z-Score standardization completed:
Age: mean=-0.000, std=1.000
YearlyIncome: mean=0.000, std=1.000
NumberCarsOwned: mean=-0.000, std=1.000
NumberChildrenAtHome: mean=-0.000, std=1.000
TotalChildren: mean=-0.000, std=1.000
HomeOwnerFlag: mean=0.000, std=1.000

Using Min-Max normalized data for further processing


In [13]:
# Part II(e): Binarization (One Hot Encoding)

print("=== BINARIZATION (ONE HOT ENCODING) ===")

# Identify categorical columns for one-hot encoding
categorical_columns = []
for col in df_final_prep.columns:
    if df_final_prep[col].dtype == 'object' or col.endswith('_binned'):
        categorical_columns.append(col)

# Remove identifier columns from encoding
categorical_columns = [col for col in categorical_columns if col != 'CustomerID']

print(f"Categorical columns for one-hot encoding: {categorical_columns}")

# Apply one-hot encoding
df_encoded = df_final_prep.copy()

for col in categorical_columns:
    if col in df_encoded.columns:
        print(f"\nEncoding {col}:")
        unique_values = df_encoded[col].unique()
        print(f"  Unique values: {len(unique_values)} - {list(unique_values)[:5]}{'...' if len(unique_values) > 5 else ''}")

        # Create dummy variables
        dummies = pd.get_dummies(df_encoded[col], prefix=col, drop_first=True)

        # Add to dataframe
        df_encoded = pd.concat([df_encoded, dummies], axis=1)

        # Drop original column
        df_encoded.drop(col, axis=1, inplace=True)

        print(f"  Created {len(dummies.columns)} binary features")

print(f"\n=== FINAL PREPROCESSED DATASET ===")
print(f"Shape: {df_encoded.shape}")
print(f"Total features: {len(df_encoded.columns)}")

# Show feature types
binary_features = [col for col in df_encoded.columns if df_encoded[col].dtype == 'uint8']
numeric_features = [col for col in df_encoded.columns if df_encoded[col].dtype in ['float64', 'int64'] and col != 'CustomerID']

print(f"\nFeature breakdown:")
print(f"- Binary features: {len(binary_features)}")
print(f"- Numeric features: {len(numeric_features)}")
print(f"- Identifier: CustomerID")

# Save the preprocessed dataset
df_final = df_encoded.copy()
print(f"\nPreprocessing completed successfully!")
df_final.head()


=== BINARIZATION (ONE HOT ENCODING) ===
Categorical columns for one-hot encoding: ['Education', 'Occupation', 'Gender', 'MaritalStatus', 'CountryRegionName', 'StateProvinceName', 'Age_binned', 'YearlyIncome_binned']

Encoding Education:
  Unique values: 5 - ['Bachelors', 'Partial College', 'High School', 'Partial High School', 'Graduate Degree']
  Created 4 binary features

Encoding Occupation:
  Unique values: 5 - ['Clerical', 'Skilled Manual', 'Manual', 'Management', 'Professional']
  Created 4 binary features

Encoding Gender:
  Unique values: 2 - ['M', 'F']
  Created 1 binary features

Encoding MaritalStatus:
  Unique values: 2 - ['M', 'S']
  Created 1 binary features

Encoding CountryRegionName:
  Unique values: 6 - ['Australia', 'Canada', 'United States', 'United Kingdom', 'Germany']...
  Created 5 binary features

Encoding StateProvinceName:
  Unique values: 54 - ['New South Wales', 'British Columbia', 'California', 'England', 'Nordrhein-Westfalen']...
  Created 53 binary featur

Unnamed: 0,CustomerID,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age,Education_Graduate Degree,Education_High School,Education_Partial College,...,StateProvinceName_Washington,StateProvinceName_Wyoming,StateProvinceName_Yveline,Age_binned_Adult,Age_binned_Middle_Aged,Age_binned_Senior,YearlyIncome_binned_Low_Mid,YearlyIncome_binned_Middle,YearlyIncome_binned_Mid_High,YearlyIncome_binned_High
0,21173,1.0,0.6,0.0,0.333333,0.496842,0.183099,False,False,False,...,False,False,False,True,False,False,False,False,True,False
1,13249,1.0,0.4,0.333333,0.666667,0.489453,0.408451,False,False,True,...,False,False,False,False,True,False,False,False,True,False
2,29350,0.0,0.6,0.0,0.0,0.536172,0.211268,False,False,False,...,False,False,False,True,False,False,False,False,True,False
3,13503,1.0,0.4,0.333333,0.666667,0.317083,0.323944,False,False,True,...,False,False,False,True,False,False,False,True,False,False
4,22803,1.0,0.2,0.0,0.0,0.231958,0.366197,False,False,True,...,False,False,False,True,False,False,True,False,False,False


In [14]:
# Part III(a): Calculate Similarity measures between two objects

print("=== SIMILARITY ANALYSIS BETWEEN TWO OBJECTS ===")

# Select first two customers for comparison
if len(df_final) >= 2:
    obj1 = df_final.iloc[0]  # First customer
    obj2 = df_final.iloc[1]  # Second customer

    print(f"Comparing Customer {obj1['CustomerID']} and Customer {obj2['CustomerID']}")

    # Separate features by type
    binary_features = [col for col in df_final.columns if df_final[col].dtype == 'uint8']
    numeric_features = [col for col in df_final.columns if df_final[col].dtype in ['float64', 'int64'] and col != 'CustomerID']

    print(f"\nFeature categories:")
    print(f"- Binary features: {len(binary_features)}")
    print(f"- Numeric features: {len(numeric_features)}")

    # Extract feature vectors
    obj1_binary = obj1[binary_features].values if binary_features else np.array([])
    obj2_binary = obj2[binary_features].values if binary_features else np.array([])
    obj1_numeric = obj1[numeric_features].values if numeric_features else np.array([])
    obj2_numeric = obj2[numeric_features].values if numeric_features else np.array([])

    print(f"\n=== BINARY FEATURE SIMILARITIES ===")

    if len(binary_features) > 0:
        # Simple Matching Similarity
        matches = np.sum(obj1_binary == obj2_binary)
        total = len(obj1_binary)
        simple_matching = matches / total

        print(f"Simple Matching Similarity:")
        print(f"  Matches: {matches}/{total}")
        print(f"  Similarity: {simple_matching:.4f}")

        # Jaccard Similarity
        # For binary data: intersection over union
        both_true = np.sum((obj1_binary == 1) & (obj2_binary == 1))
        either_true = np.sum((obj1_binary == 1) | (obj2_binary == 1))

        if either_true == 0:
            jaccard_sim = 1.0  # Both vectors are all zeros
        else:
            jaccard_sim = both_true / either_true

        print(f"\nJaccard Similarity:")
        print(f"  Both True: {both_true}")
        print(f"  Either True: {either_true}")
        print(f"  Jaccard: {jaccard_sim:.4f}")

        # Alternative Jaccard using sklearn
        try:
            jaccard_sklearn = jaccard_score(obj1_binary, obj2_binary, average='micro')
            print(f"  Jaccard (sklearn): {jaccard_sklearn:.4f}")
        except:
            print(f"  Jaccard (sklearn): Could not calculate")

    else:
        print("No binary features available for similarity calculation")

    print(f"\n=== NUMERIC FEATURE SIMILARITIES ===")

    if len(numeric_features) > 0:
        # Cosine Similarity
        obj1_num_reshaped = obj1_numeric.reshape(1, -1)
        obj2_num_reshaped = obj2_numeric.reshape(1, -1)

        # Handle zero vectors
        norm1 = np.linalg.norm(obj1_numeric)
        norm2 = np.linalg.norm(obj2_numeric)

        if norm1 == 0 or norm2 == 0:
            cosine_sim = 0.0
            print(f"Cosine Similarity: {cosine_sim:.4f} (zero vector detected)")
        else:
            cosine_sim = cosine_similarity(obj1_num_reshaped, obj2_num_reshaped)[0][0]
            print(f"Cosine Similarity: {cosine_sim:.4f}")

        # Euclidean Distance (for reference)
        euclidean_dist = np.linalg.norm(obj1_numeric - obj2_numeric)
        print(f"Euclidean Distance: {euclidean_dist:.4f}")

        # Manhattan Distance (for reference)
        manhattan_dist = np.sum(np.abs(obj1_numeric - obj2_numeric))
        print(f"Manhattan Distance: {manhattan_dist:.4f}")

    else:
        print("No numeric features available for similarity calculation")

    print(f"\n=== SUMMARY ===")
    if len(binary_features) > 0:
        print(f"Simple Matching Similarity: {simple_matching:.4f}")
        print(f"Jaccard Similarity: {jaccard_sim:.4f}")
    if len(numeric_features) > 0:
        print(f"Cosine Similarity: {cosine_sim:.4f}")

else:
    print("Dataset has fewer than 2 rows. Cannot perform similarity analysis.")


=== SIMILARITY ANALYSIS BETWEEN TWO OBJECTS ===
Comparing Customer 21173 and Customer 13249

Feature categories:
- Binary features: 0
- Numeric features: 6

=== BINARY FEATURE SIMILARITIES ===
No binary features available for similarity calculation

=== NUMERIC FEATURE SIMILARITIES ===
Cosine Similarity: 0.9234
Euclidean Distance: 0.5595
Manhattan Distance: 1.0994

=== SUMMARY ===
Cosine Similarity: 0.9234


In [15]:
# Part III(b): Calculate Correlation between CommuteDistance and YearlyIncome

print("=== CORRELATION ANALYSIS: COMMUTE DISTANCE vs YEARLY INCOME ===")

# We need to use the original data or the numeric mapping for correlation
correlation_data = df_processed.copy()  # Use the version before standardization

if 'CommuteDistance' in df.columns and 'YearlyIncome' in correlation_data.columns:

    print("Original CommuteDistance values:")
    print(df['CommuteDistance'].value_counts())

    # Create numeric mapping for CommuteDistance
    distance_mapping = {
        '0-1 Miles': 0.5,
        '1-2 Miles': 1.5,
        '2-5 Miles': 3.5,
        '5-10 Miles': 7.5,
        '10+ Miles': 15
    }

    # Map CommuteDistance to numeric values
    commute_numeric = df['CommuteDistance'].map(distance_mapping)
    yearly_income = correlation_data['YearlyIncome']

    # Remove any NaN values
    valid_indices = ~(commute_numeric.isna() | yearly_income.isna())
    commute_clean = commute_numeric[valid_indices]
    income_clean = yearly_income[valid_indices]

    print(f"\nData for correlation analysis:")
    print(f"Valid pairs: {len(commute_clean)}")
    print(f"CommuteDistance range: {commute_clean.min():.1f} - {commute_clean.max():.1f} miles")
    print(f"YearlyIncome range: ${income_clean.min():,.0f} - ${income_clean.max():,.0f}")

    # Calculate Pearson correlation
    correlation = commute_clean.corr(income_clean)

    print(f"\n=== CORRELATION RESULTS ===")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")

    # Interpret correlation strength
    if abs(correlation) < 0.1:
        strength = "Very weak"
    elif abs(correlation) < 0.3:
        strength = "Weak"
    elif abs(correlation) < 0.5:
        strength = "Moderate"
    elif abs(correlation) < 0.7:
        strength = "Strong"
    else:
        strength = "Very strong"

    direction = "positive" if correlation > 0 else "negative"

    print(f"Correlation strength: {strength} {direction} correlation")

    # Statistical significance (basic test)
    n = len(commute_clean)
    if n > 2:
        # Calculate t-statistic for correlation
        t_stat = correlation * np.sqrt((n-2)/(1-correlation**2)) if correlation != 1 else float('inf')
        print(f"Sample size: {n}")
        print(f"T-statistic: {t_stat:.4f}")

    # Create summary statistics
    print(f"\n=== SUMMARY STATISTICS ===")

    # Group by commute distance
    commute_income_df = pd.DataFrame({
        'CommuteDistance': df['CommuteDistance'][valid_indices],
        'CommuteDistance_Numeric': commute_clean,
        'YearlyIncome': income_clean
    })

    summary_stats = commute_income_df.groupby('CommuteDistance')['YearlyIncome'].agg(['count', 'mean', 'std', 'min', 'max'])
    print("\nIncome statistics by Commute Distance:")
    print(summary_stats.round(2))

    # Interpretation
    print(f"\n=== INTERPRETATION ===")
    if correlation > 0:
        print("Positive correlation suggests that customers with longer commute distances")
        print("tend to have higher yearly income.")
    elif correlation < 0:
        print("Negative correlation suggests that customers with longer commute distances")
        print("tend to have lower yearly income.")
    else:
        print("No linear relationship found between commute distance and yearly income.")

    print(f"\nThis correlation analysis can help Adventure Works Cycles understand")
    print(f"the relationship between customer location (commute patterns) and")
    print(f"purchasing power (income) for targeted marketing campaigns.")

else:
    print("Required columns not found:")
    print(f"CommuteDistance in original data: {'CommuteDistance' in df.columns}")
    print(f"YearlyIncome in processed data: {'YearlyIncome' in correlation_data.columns}")


=== CORRELATION ANALYSIS: COMMUTE DISTANCE vs YEARLY INCOME ===
Required columns not found:
CommuteDistance in original data: False
YearlyIncome in processed data: True


In [16]:
# Part III(b): CORRECTED - Alternative Correlation Analysis
# Since CommuteDistance is not available, we'll analyze correlation between other relevant features

print("=== CORRELATION ANALYSIS (CORRECTED) ===")
print("⚠️ CommuteDistance column not found in dataset")
print("Performing alternative correlation analysis between NumberCarsOwned and YearlyIncome")

# Use the original data before standardization for meaningful correlation
correlation_data = df_processed.copy()

if 'NumberCarsOwned' in correlation_data.columns and 'YearlyIncome' in correlation_data.columns:

    print("\nOriginal NumberCarsOwned distribution:")
    print(correlation_data['NumberCarsOwned'].value_counts().sort_index())

    # Extract the two variables
    cars_owned = correlation_data['NumberCarsOwned']
    yearly_income = correlation_data['YearlyIncome']

    # Remove any NaN values
    valid_indices = ~(cars_owned.isna() | yearly_income.isna())
    cars_clean = cars_owned[valid_indices]
    income_clean = yearly_income[valid_indices]

    print(f"\nData for correlation analysis:")
    print(f"Valid pairs: {len(cars_clean)}")
    print(f"NumberCarsOwned range: {cars_clean.min()} - {cars_clean.max()} cars")
    print(f"YearlyIncome range: ${income_clean.min():,.0f} - ${income_clean.max():,.0f}")

    # Calculate Pearson correlation
    correlation = cars_clean.corr(income_clean)

    print(f"\n=== CORRELATION RESULTS ===")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")

    # Interpret correlation strength
    if abs(correlation) < 0.1:
        strength = "Very weak"
    elif abs(correlation) < 0.3:
        strength = "Weak"
    elif abs(correlation) < 0.5:
        strength = "Moderate"
    elif abs(correlation) < 0.7:
        strength = "Strong"
    else:
        strength = "Very strong"

    direction = "positive" if correlation > 0 else "negative"

    print(f"Correlation strength: {strength} {direction} correlation")

    # Statistical significance (basic test)
    n = len(cars_clean)
    if n > 2:
        # Calculate t-statistic for correlation
        if abs(correlation) < 1:
            t_stat = correlation * np.sqrt((n-2)/(1-correlation**2))
        else:
            t_stat = float('inf')
        print(f"Sample size: {n}")
        print(f"T-statistic: {t_stat:.4f}")

    # Create summary statistics
    print(f"\n=== SUMMARY STATISTICS ===")

    # Group by number of cars owned
    cars_income_df = pd.DataFrame({
        'NumberCarsOwned': cars_clean,
        'YearlyIncome': income_clean
    })

    summary_stats = cars_income_df.groupby('NumberCarsOwned')['YearlyIncome'].agg(['count', 'mean', 'std', 'min', 'max'])
    print("\nIncome statistics by Number of Cars Owned:")
    print(summary_stats.round(2))

    # Additional correlations with other features
    print(f"\n=== ADDITIONAL CORRELATION ANALYSIS ===")

    # Correlation matrix for numeric features
    numeric_features_for_corr = ['Age', 'YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'HomeOwnerFlag']
    available_features = [col for col in numeric_features_for_corr if col in correlation_data.columns]

    corr_matrix = correlation_data[available_features].corr()
    print("\nCorrelation Matrix:")
    print(corr_matrix.round(4))

    # Interpretation
    print(f"\n=== INTERPRETATION ===")
    if correlation > 0:
        print("Positive correlation suggests that customers who own more cars")
        print("tend to have higher yearly income.")
    elif correlation < 0:
        print("Negative correlation suggests that customers who own more cars")
        print("tend to have lower yearly income.")
    else:
        print("No linear relationship found between number of cars owned and yearly income.")

    print(f"\nThis correlation analysis can help Adventure Works Cycles understand")
    print(f"the relationship between customer transportation preferences")
    print(f"and purchasing power for targeted marketing campaigns.")

# Alternative: Show correlation with HomeOwnerFlag
print(f"\n=== BONUS: HOME OWNERSHIP vs INCOME CORRELATION ===")
if 'HomeOwnerFlag' in correlation_data.columns:
    home_income_corr = correlation_data['HomeOwnerFlag'].corr(correlation_data['YearlyIncome'])
    print(f"Correlation between Home Ownership and Yearly Income: {home_income_corr:.4f}")

    # Group statistics
    home_stats = correlation_data.groupby('HomeOwnerFlag')['YearlyIncome'].agg(['count', 'mean', 'std'])
    home_stats.index = ['Renter', 'Home Owner']
    print(f"\nIncome statistics by Home Ownership:")
    print(home_stats.round(2))


=== CORRELATION ANALYSIS (CORRECTED) ===
⚠️ CommuteDistance column not found in dataset
Performing alternative correlation analysis between NumberCarsOwned and YearlyIncome

Original NumberCarsOwned distribution:
NumberCarsOwned
0    3927
1    7455
2    5085
3    1880
4      11
5       3
Name: count, dtype: int64

Data for correlation analysis:
Valid pairs: 18361
NumberCarsOwned range: 0 - 5 cars
YearlyIncome range: $25,435 - $139,115

=== CORRELATION RESULTS ===
Pearson Correlation Coefficient: 0.4773
Correlation strength: Moderate positive correlation
Sample size: 18361
T-statistic: 73.5962

=== SUMMARY STATISTICS ===

Income statistics by Number of Cars Owned:
                 count       mean       std     min     max
NumberCarsOwned                                            
0                 3927   56999.03  27411.65   25445  138889
1                 7455   63613.75  27224.25   25435  138851
2                 5085   87888.68  27575.04   50875  139109
3                 1880  1006