In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard

from google.colab import files
# Ask the user for a CSV through the Colab upload widget.
uploaded = files.upload()

# Fail with a clear message instead of a bare StopIteration when the upload
# dialog is cancelled and nothing comes back.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")

# Only the first uploaded entry is read; any additional uploads are ignored.
data = pd.read_csv(next(iter(uploaded)))

# Columns kept for the rest of the analysis.
selected_columns = [
    "Education", "Occupation", "Gender", "MaritalStatus",
    "HomeOwnerFlag", "NumberCarsOwned", "NumberChildrenAtHome",
    "TotalChildren", "YearlyIncome"
]
# Take an explicit copy: the derived columns added below would otherwise be
# assigned onto a slice of `data`, which triggers SettingWithCopyWarning and
# leaves it ambiguous whether `data` itself is modified.
selected_data = data[selected_columns].copy()

# Report how many values are missing in each selected column.
print("Missing Values:")
print(selected_data.isnull().sum())

# Min-max scaling: rescale income to MinMaxScaler's default [0, 1] range.
scaler_minmax = MinMaxScaler()
selected_data['YearlyIncome_Normalized'] = scaler_minmax.fit_transform(
    selected_data[['YearlyIncome']]
)

# Discretization: an integer `bins` makes pd.cut split the observed income
# range into that many equal-width intervals, labelled from lowest to highest.
selected_data['YearlyIncome_Binned'] = pd.cut(
    selected_data['YearlyIncome'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
)

# Standardization: rescale income to zero mean and unit variance.
scaler_standard = StandardScaler()
selected_data['YearlyIncome_Standardized'] = scaler_standard.fit_transform(
    selected_data[['YearlyIncome']]
)

# One-hot encode the four categorical columns. The binned income label is
# dropped first, so it does not appear in the encoded frame.
# NOTE(review): raw YearlyIncome stays in the frame next to its normalized
# and standardized versions, so income is represented three times in the
# similarity vectors below; if incomes are large relative to the other
# features, cosine similarity will be driven almost entirely by income --
# confirm this is intended.
encoded_data = pd.get_dummies(selected_data.drop(columns=['YearlyIncome_Binned']),
                               columns=["Education", "Occupation", "Gender", "MaritalStatus"])

# The similarity measures below compare the first two rows of the encoded frame.
row1 = encoded_data.iloc[0]
row2 = encoded_data.iloc[1]

# cosine_similarity expects 2-D input, hence the single-row lists; the result
# is a 1x1 matrix that is unpacked when printing.
cosine_sim = cosine_similarity([row1], [row2])

# scipy's `jaccard` returns a dissimilarity, so subtract it from 1.
# NOTE(review): astype(bool) maps every non-zero numeric value (counts,
# income columns) to True, not just the one-hot indicators -- confirm that
# treating those columns as "present" is the intended Jaccard definition.
jaccard_sim = 1 - jaccard(row1.astype(bool), row2.astype(bool))

# Simple matching coefficient: fraction of columns on which the two rows
# hold exactly equal values.
smc = (row1 == row2).mean()

print(f"Cosine Similarity: {cosine_sim[0][0]}")
print(f"Jaccard Similarity: {jaccard_sim}")
print(f"Simple Matching Coefficient: {smc}")

# Series.corr defaults to the Pearson correlation coefficient.
correlation = selected_data['NumberCarsOwned'].corr(selected_data['YearlyIncome'])
print(f"Correlation between NumberCarsOwned and YearlyIncome: {correlation}")

# Trailing bare expression: shown as the cell output when run in a notebook.
selected_data.head()
