In [13]:
import numpy as np
import pandas as pd

# Part I: pick the attributes used by the later parts and save them to disk.

df = pd.read_csv("data.csv")

# NOTE(review): a bare expression is only rendered when it is the last statement
# of a cell, so this preview (and the one further down) shows nothing here.
df.head()

# Attributes to keep. BirthDate is selected here and discarded again below.
selected_features = [
    'BirthDate',
    'Education',
    'Occupation',
    'Gender',
    'MaritalStatus',
    'HomeOwnerFlag',
    'NumberCarsOwned',
    'NumberChildrenAtHome',
    'TotalChildren',
    'YearlyIncome',
]

df_selected = df.loc[:, selected_features]
df_selected.head()

# BirthDate is not carried into the saved file.
df_selected = df_selected.drop(columns=['BirthDate'])

# Written without the index so the file holds only the attribute columns.
df_selected.to_csv("Selected_data.csv", index=False)


# Part II starts in the next cell, which reloads Selected_data.csv.





In [1]:
# Part II


import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer, OneHotEncoder

# Reload the attributes that Part I wrote to disk.
df = pd.read_csv("Selected_data.csv")

# Partition the columns by dtype: numeric ones versus everything else.
numeric_cols = list(df.select_dtypes(include="number").columns)
cat_cols = list(df.select_dtypes(exclude="number").columns)

print("Numeric columns:", numeric_cols)
print("Categorical columns:", cat_cols)


Numeric columns: ['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome']
Categorical columns: ['Education', 'Occupation', 'Gender', 'MaritalStatus']


In [2]:
# Question 2 (a): handling null values
# Numeric attributes are filled with the column median, categorical attributes
# with the most frequent value of the column.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

numeric_filled = num_imputer.fit_transform(df[numeric_cols])
categorical_filled = cat_imputer.fit_transform(df[cat_cols])

# The imputers return plain arrays, so the column labels are re-attached here.
df_numeric_filled = pd.DataFrame(numeric_filled, columns=numeric_cols)
df_categorical_filled = pd.DataFrame(categorical_filled, columns=cat_cols)

# Numeric columns first, then the categorical ones.
df_a = pd.concat([df_numeric_filled, df_categorical_filled], axis="columns")
print("\n(a) Null values handled successfully.")



(a) Null values handled successfully.


In [3]:
# (b) Normalization: rescale the imputed numeric attributes with MinMaxScaler
# (default settings).
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(df_a[numeric_cols])
df_normalized = pd.DataFrame(data=normalized_values, columns=numeric_cols)

# Categorical attributes are carried over unchanged.
df_b = pd.concat([df_normalized, df_a[cat_cols]], axis="columns")
print("\n(b) Normalization applied (0–1 scale).")



(b) Normalization applied (0–1 scale).


In [4]:
# (c) Discretization (binning) of a continuous attribute
# Works on a copy of the normalized frame so df_b itself is left untouched.
df_c = df_b.copy()

# YearlyIncome is cut into quantile-based bins, one per label below.
income_labels = ["Very Low", "Low", "Medium", "High", "Very High"]
if "YearlyIncome" in df_c.columns:
    df_c["YearlyIncome_Bin"] = pd.qcut(
        df_c["YearlyIncome"],
        q=len(income_labels),
        labels=income_labels,
    )

print("\n(c) Discretization applied on continuous attributes (e.g., YearlyIncome).")



(c) Discretization applied on continuous attributes (e.g., YearlyIncome).


In [5]:
# (d) Standardization (z-score) of the numeric attributes
# Starts from the imputed values in df_a, not from the min-max scaled df_b.
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(df_a[numeric_cols])
df_standardized = pd.DataFrame(data=standardized_values, columns=numeric_cols)

# Categorical attributes pass through untouched.
df_d = pd.concat([df_standardized, df_a[cat_cols]], axis="columns")
print("\n(d) Standardization applied (mean=0, std=1).")



(d) Standardization applied (mean=0, std=1).


In [7]:
# (e) Binarization: one-hot encode the imputed categorical attributes
# Dense output is requested so the result can go straight into a DataFrame.
encoder = OneHotEncoder(handle_unknown='ignore', drop=None, sparse_output=False)
encoder.fit(df_a[cat_cols])
encoded_array = encoder.transform(df_a[cat_cols])
encoded_cols = encoder.get_feature_names_out(cat_cols)

df_encoded = pd.DataFrame(data=encoded_array, columns=encoded_cols)

# Imputed numeric attributes first, followed by the indicator columns.
df_e = pd.concat([df_a[numeric_cols], df_encoded], axis="columns")

print("\n(e) Binarization done (One-Hot Encoding applied).")




(e) Binarization done (One-Hot Encoding applied).


In [16]:
# Part III:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

# --- Load the attributes saved by Part I ---
# NOTE(review): this re-reads Selected_data.csv rather than the imputed frame
# built in Part II (a); confirm the columns used below have no missing values.
df = pd.read_csv("Selected_data.csv")

# --- Column roles for this part ---
# HomeOwnerFlag is listed with the categorical attributes here, so it is one-hot
# encoded rather than standardized.
numeric_cols = [
    'NumberCarsOwned',
    'NumberChildrenAtHome',
    'TotalChildren',
    'YearlyIncome',
]
categorical_cols = [
    'Education',
    'Occupation',
    'Gender',
    'MaritalStatus',
    'HomeOwnerFlag',
]

# --- Standardize numeric columns ---
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(df[numeric_cols])
df_numeric = pd.DataFrame(numeric_scaled, columns=numeric_cols)

# --- One-hot encode categorical columns (dense array output) ---
encoder = OneHotEncoder(sparse_output=False)
categorical_encoded = encoder.fit_transform(df[categorical_cols])
df_categorical = pd.DataFrame(
    categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# --- Combined feature matrix: standardized numerics, then indicator columns ---
df_transformed = pd.concat([df_numeric, df_categorical], axis="columns")

# --- Part III (a): Similarity between two objects ---
# The first two rows of the transformed data are the objects being compared.
obj1 = df_transformed.iloc[0].to_numpy()
obj2 = df_transformed.iloc[1].to_numpy()

# Simple matching: share of positions whose values are exactly equal.
# NOTE(review): the vectors include the standardized numeric columns, which only
# count as a match when identical — confirm that is the intended definition.
simple_matching = (obj1 == obj2).mean()

# Jaccard: computed on the one-hot indicator columns only, as
# (positions set in both rows) / (positions set in at least one row).
obj1_bin = df_categorical.iloc[0].to_numpy()
obj2_bin = df_categorical.iloc[1].to_numpy()
both_set = np.logical_and(obj1_bin, obj2_bin).sum()
either_set = np.logical_or(obj1_bin, obj2_bin).sum()
jaccard_similarity = both_set / either_set

# Cosine: uses every transformed attribute; each vector is passed as a 1-row matrix.
cos_sim = cosine_similarity(obj1.reshape(1, -1), obj2.reshape(1, -1))[0, 0]

for label, value in (
    ("Simple Matching Similarity:", simple_matching),
    ("Jaccard Similarity:", jaccard_similarity),
    ("Cosine Similarity:", cos_sim),
):
    print(label, value)

# --- Part III (b): Correlation between two features ---
# Pearson's r needs paired numeric observations, so both columns are coerced to
# numbers and rows missing either value are dropped before pearsonr is called
# (on raw columns it would raise on text values or yield NaN for missing ones).
# NOTE(review): Part I's selected_features list does not include CommuteDistance,
# so with the Selected_data.csv it writes this always takes the "not found"
# branch — confirm which file/column the correlation is meant to use.
if 'CommuteDistance' in df.columns:
    corr_pairs = (
        df[['CommuteDistance', 'YearlyIncome']]
        .apply(pd.to_numeric, errors='coerce')
        .dropna()
    )
    # Fewer than two complete pairs, or a column with a single distinct value,
    # leaves the coefficient undefined.
    if len(corr_pairs) < 2 or (corr_pairs.nunique() < 2).any():
        print("Not enough numeric CommuteDistance/YearlyIncome pairs to compute Pearson correlation")
    else:
        corr, _ = pearsonr(corr_pairs['CommuteDistance'], corr_pairs['YearlyIncome'])
        print("Pearson Correlation between CommuteDistance and YearlyIncome:", corr)
else:
    print("CommuteDistance column not found in dataset")



Simple Matching Similarity: 0.7
Jaccard Similarity: 0.6666666666666666
Cosine Similarity: 0.5781102513793457
CommuteDistance column not found in dataset
