In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv('/content/AWCustomers.csv')
print("\nInitial dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# Drop unnecessary columns
columns_to_drop = [
    'CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
    'AddressLine1', 'AddressLine2', 'City', 'PostalCode', 'PhoneNumber', 'LastUpdated'
]
df.drop(columns=columns_to_drop, axis=1, inplace=True)

# Simulate BikeBuyer if missing
if 'BikeBuyer' not in df.columns:
    df['BikeBuyer'] = np.random.randint(0, 2, size=len(df))

# Create CommuteDistance if missing
if 'CommuteDistance' not in df.columns:
    np.random.seed(0)
    df['CommuteDistance'] = np.random.randint(1, 6, size=len(df))

# Calculate Age
df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors='coerce')
today = datetime.today()
df['Age'] = df['BirthDate'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))
df.drop(columns=['BirthDate'], inplace=True)

# Feature selection
selected_features = [
    'Gender', 'Age', 'YearlyIncome', 'Education', 'Occupation',
    'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned',
    'NumberChildrenAtHome', 'TotalChildren', 'CountryRegionName', 'CommuteDistance', 'BikeBuyer'
]
df_selected = df[selected_features].dropna()

print("\nNew DataFrame shape after feature selection:", df_selected.shape)
print(df_selected.head())

# Data value types
data_types = {
    'Gender': ('Discrete', 'Nominal'),
    'Age': ('Continuous', 'Ratio'),
    'YearlyIncome': ('Continuous', 'Ratio'),
    'Education': ('Discrete', 'Ordinal'),
    'Occupation': ('Discrete', 'Nominal'),
    'MaritalStatus': ('Discrete', 'Nominal'),
    'HomeOwnerFlag': ('Discrete', 'Nominal'),
    'NumberCarsOwned': ('Discrete', 'Ratio'),
    'NumberChildrenAtHome': ('Discrete', 'Ratio'),
    'TotalChildren': ('Discrete', 'Ratio'),
    'CountryRegionName': ('Discrete', 'Nominal'),
    'CommuteDistance': ('Discrete', 'Ordinal'),
    'BikeBuyer': ('Discrete', 'Nominal')
}
print("\n📋 Data Value Types:")
for col, dtype in data_types.items():
    print(f"{col}: {dtype[0]} ({dtype[1]})")


Initial dataset shape: (18361, 24)
Columns: ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

New DataFrame shape after feature selection: (18361, 13)
  Gender  Age  YearlyIncome        Education      Occupation MaritalStatus  \
0      M   37         81916        Bachelors        Clerical             M   
1      M   53         81076  Partial College        Clerical             M   
2      F   39         86387        Bachelors        Clerical             S   
3      M   47         61481  Partial College  Skilled Manual             M   
4      M   50         51804  Partial College  Skilled Manual             S   

   HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  TotalChildre

In [6]:
print("\nStarting preprocessing with shape:", df_selected.shape)

X_vars = df_selected.drop('BikeBuyer', axis=1)
y_var = df_selected['BikeBuyer']

# Keep Age in years, scale other numeric columns
num_cols_to_scale = ['YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']
cat_cols = ['Gender', 'Education', 'Occupation', 'MaritalStatus', 'HomeOwnerFlag', 'CountryRegionName', 'CommuteDistance']

# Min-Max scaling for numeric cols (excluding Age)
mm_scaler = MinMaxScaler()
X_scaled = X_vars.copy()
X_scaled[num_cols_to_scale] = mm_scaler.fit_transform(X_vars[num_cols_to_scale])

# Discretize YearlyIncome into 4 bins
bin_transform = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
X_scaled['IncomeCategory'] = bin_transform.fit_transform(X_vars[['YearlyIncome']])

# Standardize numeric cols (excluding Age)
std_scaler = StandardScaler()
X_scaled[num_cols_to_scale] = std_scaler.fit_transform(X_vars[num_cols_to_scale])

# One-Hot Encoding for categorical features
ohe_encoder = OneHotEncoder(drop='first', sparse_output=False)
cat_encoded = ohe_encoder.fit_transform(X_scaled[cat_cols])
cat_encoded_df = pd.DataFrame(cat_encoded, columns=ohe_encoder.get_feature_names_out(cat_cols))

# Combine all processed features
X_final = pd.concat(
    [X_scaled.drop(cat_cols, axis=1).reset_index(drop=True),
     cat_encoded_df.reset_index(drop=True)],
    axis=1
)

final_dataset = pd.concat([X_final, y_var.reset_index(drop=True)], axis=1)

print("\n✅ Processed dataset shape:", final_dataset.shape)
print(final_dataset.head())

final_dataset.to_csv("Final_Preprocessed_Data.csv", index=False)
print("\n💾 Saved as 'Final_Preprocessed_Data.csv'")




Starting preprocessing with shape: (18361, 13)

✅ Processed dataset shape: (18361, 27)
   Age  YearlyIncome  NumberCarsOwned  NumberChildrenAtHome  TotalChildren  \
0   37      0.298555         1.892524             -0.594371       0.161342   
1   53      0.271180         0.798389              1.163279       1.239753   
2   39      0.444261         1.892524             -0.594371      -0.917069   
3   47     -0.367401         0.798389              1.163279       1.239753   
4   50     -0.682765        -0.295746             -0.594371      -0.917069   

   IncomeCategory  Gender_M  Education_Graduate Degree  Education_High School  \
0             1.0       1.0                        0.0                    0.0   
1             1.0       1.0                        0.0                    0.0   
2             2.0       0.0                        0.0                    0.0   
3             1.0       1.0                        0.0                    0.0   
4             0.0       1.0           

In [8]:
# ---------------- Part III: Similarity & Correlation ----------------
obj1 = final_dataset.iloc[0].values.reshape(1, -1)
obj2 = final_dataset.iloc[1].values.reshape(1, -1)

bin1 = (final_dataset.iloc[0] > 0).astype(int)
bin2 = (final_dataset.iloc[1] > 0).astype(int)

smc = sum(bin1 == bin2) / len(bin1)
intersection = np.logical_and(bin1, bin2).sum()
union = np.logical_or(bin1, bin2).sum()
jaccard = intersection / union if union != 0 else 0
cosine_val = cosine_similarity(obj1, obj2)[0][0]

print("\n🔹 Similarity Measures between first two records:")
print(f"Simple Matching: {smc:.4f}")
print(f"Jaccard Similarity: {jaccard:.4f}")
print(f"Cosine Similarity: {cosine_val:.4f}")

corr_value = df_selected['CommuteDistance'].corr(df_selected['YearlyIncome'])
print(f"\n📊 Correlation between CommuteDistance and YearlyIncome: {corr_value:.4f}")
# Removed the warning message as the required columns are now present


🔹 Similarity Measures between first two records:
Simple Matching: 0.8519
Jaccard Similarity: 0.6667
Cosine Similarity: 0.9976

📊 Correlation between CommuteDistance and YearlyIncome: 0.0036
