In [2]:
import pandas as pd

data = pd.read_csv('AWCustomers.csv')

selected_attributes = ['BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 
                       'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 
                       'TotalChildren', 'YearlyIncome']

bike_buyers_data = data[selected_attributes].copy()

#Deriving 'Age' from 'BirthDate'
bike_buyers_data['Age'] = pd.to_datetime('today').year - pd.to_datetime(bike_buyers_data['BirthDate']).dt.year


bike_buyers_data.drop(columns=['BirthDate'], inplace=True)

print(bike_buyers_data.head())

#Discrete: Education, Occupation, Gender, MaritalStatus, HomeOwnerFlag, NumberCarsOwned, NumberChildrenAtHome, TotalChildren
#Continuous: Age, YearlyIncome
#Nominal: Occupation, Gender, MaritalStatus, HomeOwnerFlag
#Ordinal: Education
#Ratio: Age, NumberCarsOwned, NumberChildrenAtHome, TotalChildren, YearlyIncome

         Education      Occupation Gender MaritalStatus  HomeOwnerFlag  \
0        Bachelors        Clerical      M             M              1   
1  Partial College        Clerical      M             M              1   
2        Bachelors        Clerical      F             S              0   
3  Partial College  Skilled Manual      M             M              1   
4  Partial College  Skilled Manual      M             S              1   

   NumberCarsOwned  NumberChildrenAtHome  TotalChildren  YearlyIncome  Age  
0                3                     0              1         81916   37  
1                2                     1              2         81076   52  
2                3                     0              0         86387   39  
3                2                     1              2         61481   47  
4                1                     0              0         51804   49  


In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer, OneHotEncoder

# (A) Handling NULL Values
# We are putting missing values for categorical columns with the mode and numerical columns with the median
for column in bike_buyers_data.columns:
    if bike_buyers_data[column].dtype == 'object':  # Categorical data
        bike_buyers_data[column].fillna(bike_buyers_data[column].mode()[0], inplace=True)
    else:  # Numerical data
        bike_buyers_data[column].fillna(bike_buyers_data[column].median(), inplace=True)
        
# (B) Normalization
# Normalize YearlyIncome and Age using MinMaxScaler i.e. scaling to [0,1]
scaler = MinMaxScaler()
bike_buyers_data[['YearlyIncome', 'Age']] = scaler.fit_transform(bike_buyers_data[['YearlyIncome', 'Age']])

# (C) Binning
# Discretizing Age and YearlyIncome into 3 bins each
binner = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
bike_buyers_data[['Age', 'YearlyIncome']] = binner.fit_transform(bike_buyers_data[['Age', 'YearlyIncome']])

# (D) Normalization
# Standardize NumberCarsOwned, NumberChildrenAtHome, TotalChildren
standard_scaler = StandardScaler()
bike_buyers_data[['NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']] = standard_scaler.fit_transform(
    bike_buyers_data[['NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']]
)

# (E) One-Hot Encoding
# Apply one-hot encoding to any remaining categorical variables
bike_buyers_data = pd.get_dummies(bike_buyers_data, drop_first=True)


print(bike_buyers_data.head())

# Save the preprocessed data for further use
bike_buyers_data.to_csv('final_preprocessed_bike_buyer_data.csv', index=False)
        

        

   HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  TotalChildren  \
0              1         1.892524             -0.594371       0.161342   
1              1         0.798389              1.163279       1.239753   
2              0         1.892524             -0.594371      -0.917069   
3              1         0.798389              1.163279       1.239753   
4              1        -0.295746             -0.594371      -0.917069   

   YearlyIncome  Age  Education_Graduate Degree  Education_High School  \
0           1.0  0.0                      False                  False   
1           1.0  1.0                      False                  False   
2           1.0  0.0                      False                  False   
3           0.0  0.0                      False                  False   
4           0.0  1.0                      False                  False   

   Education_Partial College  Education_Partial High School  \
0                      False                   



In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr


df = bike_buyers_data.copy()

numeric_cols = ['YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']


scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df = pd.get_dummies(df, drop_first=True)

# Commute Distance is distance calculated in (a) (?)
commute_distance = df['NumberCarsOwned']  # Used NumberCarsOwned
yearly_income = df['YearlyIncome']

# two sample objects 
obj1 = df.iloc[0].values
obj2 = df.iloc[1].values

# (a) Calculate Similarity Measures
def simple_matching_similarity(x, y):
    return np.sum(x == y) / len(x)

simple_matching = simple_matching_similarity(obj1, obj2)

# Jaccard Similarity 
jaccard_sim = 1 - jaccard(obj1, obj2)

# Cosine Similarity
cosine_sim = cosine_similarity([obj1], [obj2])[0][0]

# (b) Calculate Correlation
corr, _ = pearsonr(commute_distance, yearly_income)

print(f"Simple Matching Similarity: {simple_matching}")
print(f"Jaccard Similarity: {jaccard_sim}")
print(f"Cosine Similarity: {cosine_sim}")
print(f"Correlation between NumberCarsOwned and Yearly Income: {corr}")


Simple Matching Similarity: 0.6875
Jaccard Similarity: 0.375
Cosine Similarity: 0.45939350805615
Correlation between NumberCarsOwned and Yearly Income: 0.446881402025338
