<a href="https://colab.research.google.com/github/nitrogoose/MACHINE_LEARNING/blob/main/ASS2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr

# Load the customer dataset from a fixed local path (a Colab-style upload
# location — adjust when running elsewhere).
file_path = '/content/AdventureWorksCustomerData.csv'
data = pd.read_csv(file_path)

print("Dataset Head:\n", data.head())
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own line; passing it to print() would emit a stray
# "None" after the report.
print("\nDataset Info:")
data.info()

# Columns kept for the preprocessing and similarity steps further down.
selected_columns = [
    'Age', 'Gender', 'Education', 'Marital Status',
    'Yearly Income', 'Commute Distance', 'Bike Buyer'
]
# Take an explicit copy: the frame is modified below and by later steps, and
# mutating a slice of `data` triggers SettingWithCopyWarning (and may or may
# not write through, depending on the pandas version).
df_selected = data[selected_columns].copy()

print("\nSelected Features:\n", df_selected.head())

# Impute missing values: numeric columns receive their column mean first,
# then anything still missing is labelled "Unknown".
df_selected = df_selected.fillna(df_selected.mean(numeric_only=True))
df_selected = df_selected.fillna("Unknown")

# Rescale the numeric features with a min-max scaler and write the scaled
# values back over the original columns.
numeric_columns = ['Age', 'Yearly Income']
scaler = MinMaxScaler()
scaled_numeric = scaler.fit_transform(df_selected[numeric_columns])
df_selected[numeric_columns] = scaled_numeric

# Discretise the (already scaled) age into five equal-width, labelled bins.
age_bin_labels = ['Very Young', 'Young', 'Middle-aged', 'Senior', 'Very Senior']
df_selected['Age_binned'] = pd.cut(
    df_selected['Age'],
    bins=5,
    labels=age_bin_labels,
)

# Additionally keep a standardized copy of the income column. It is fitted
# on the min-max scaled values produced above.
standard_scaler = StandardScaler()
income_frame = df_selected[['Yearly Income']]
df_selected['Yearly Income (Standardized)'] = standard_scaler.fit_transform(
    income_frame
)

# One-hot encode the categorical features. drop='first' removes the first
# level of every feature so the indicator columns are not redundant.
categorical_columns = ['Gender', 'Education', 'Marital Status', 'Commute Distance']
try:
    # scikit-learn >= 1.2 calls the dense-output switch `sparse_output`; the
    # old `sparse` keyword was deprecated there and removed in later releases.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    # Older scikit-learn releases only understand the legacy keyword.
    encoder = OneHotEncoder(sparse=False, drop='first')

encoded = pd.DataFrame(
    encoder.fit_transform(df_selected[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    # Reuse the source index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows (and introduce NaNs) whenever
    # df_selected does not carry the default 0..n-1 index.
    index=df_selected.index,
)

# Append the indicator columns and remove the raw categorical ones.
df_final = pd.concat([df_selected, encoded], axis=1)
df_final.drop(columns=categorical_columns, inplace=True)

print("\nTransformed Dataset:\n", df_final.head())

# Compare the first two customers. Only numeric/boolean columns can enter a
# vector-space similarity; df_final still carries the string-labelled
# 'Age_binned' column, which would make cosine_similarity fail.
df_similarity = df_final.select_dtypes(include=['number', 'bool']).astype(float)
row_1 = df_similarity.iloc[0].to_numpy().reshape(1, -1)
row_2 = df_similarity.iloc[1].to_numpy().reshape(1, -1)
cosine_sim = cosine_similarity(row_1, row_2)[0][0]

# SciPy documents jaccard() for boolean 1-D arrays, so it is evaluated on
# the 0/1 indicator columns only, passed as flat boolean vectors rather than
# the (1, n) row matrices used for the cosine similarity.
binary_mask = df_similarity.isin([0, 1]).all(axis=0).to_numpy()
if binary_mask.any():
    first_flags = row_1.ravel()[binary_mask].astype(bool)
    second_flags = row_2.ravel()[binary_mask].astype(bool)
    jaccard_sim = 1 - jaccard(first_flags, second_flags)
else:
    # No binary features available: the Jaccard similarity is undefined.
    jaccard_sim = np.nan

# Pearson correlation needs two numeric series without missing values.
income_numeric = pd.to_numeric(data['Yearly Income'], errors='coerce')
commute_numeric = pd.to_numeric(data['Commute Distance'], errors='coerce')
if commute_numeric.isna().all():
    # 'Commute Distance' is handled as a categorical column elsewhere in this
    # script. NOTE(review): this assumes the labels start with the lower
    # bound of a distance range (e.g. "0-1 Miles", "10+ Miles") and uses that
    # leading number as an ordinal proxy — confirm against the CSV.
    commute_numeric = pd.to_numeric(
        data['Commute Distance'].astype(str).str.extract(
            r'(\d+(?:\.\d+)?)', expand=False
        ),
        errors='coerce',
    )

# Keep only the rows where both values are present.
paired = pd.DataFrame(
    {'income': income_numeric, 'commute': commute_numeric}
).dropna()
if len(paired) >= 2:
    correlation, p_value = pearsonr(paired['income'], paired['commute'])
else:
    # Fewer than two complete observations: the correlation is undefined.
    correlation, p_value = np.nan, np.nan

print("\nCosine Similarity:", cosine_sim)
print("\nJaccard Similarity:", jaccard_sim)
print("\nPearson Correlation (Yearly Income vs Commute Distance):", correlation)
