# Dataset Description - User Info

'pk1': Date data (2310, 2401, 2404, 2407)

'pk2': Gender data (1 for male, 0 for female)

'pk3': Age data (1.0 - 10.0 for early 20s to early 60s)

'pk4': County data (categorical)

'pk5': City data (categorical)

# 'ca': Overall
'ca01': average expenses

# 'cb': Consumer Products

'cb01': average department store expenses

'cb02': average supermarket expenses

'cb03': average convenience store expenses

'cb04': average e-commerce expenses

'cb05': average duty-free expenses

'cb06': average clothing expenses

'cb07': average cosmetics expenses

'cb08': average electronics expenses

'cb09': average daily equipment expenses


# 'cc': Food
'cc01': average restaurant expenses

'cc02': average high-end cafe expenses

'cc03': average low-end coffeehouse expenses

'cc04': average proprietary delivery expenses


# 'cd': Daily Services

'cd01': average long-distance bus expenses

'cd02': average taxi expenses

'cd03': average flight expenses

'cd04': average train expenses

'cd05': average cruise ship expenses

'cd06': average phone expenses

'cd07': average bookstore expenses

'cd08': average glasses expenses

'cd09': average laundry expenses

'cd10': average beauty salon expenses

'cd11': average rental car expenses

'cd12': average gas station expenses

'cd13': average LPG gas expenses

'cd14': average electric car expenses


# 'ce': Professional Services
'ce01': average private education expenses

'ce02': average hospital expenses

'ce03': average dentist expenses

'ce04': average pharmacy expenses

'ce05': average wedding hall expenses

'ce06': average funeral expenses

'ce07': average life insurance expenses

'ce08': average donation expenses


# 'cf': Culture and Leisure
'cf01': average premium hotel expenses

'cf02': average hotel expenses

'cf03': average travel agency expenses

'cf04': average fitness center expenses

'cf05': average swimming pool expenses

'cf06': average billiards club expenses

'cf07': average outdoor golf course expenses

'cf08': average screen golf expenses

'cf09': average tennis court expenses

'cf10': average bowling parlor expenses

'cf11': average amusement park expenses

'cf12': average bike expenses

'cf13': average sports equipment expenses

'cf14': average artwork expenses

'cf15': average florist expenses

'cf16': average aquarium expenses

'cf17': average PC cafe expenses


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
# Load the cleaned card-spending table from the working directory, then
# preview its first rows.
csv_path = './card_data_cleaned.csv'
raw_data = pd.read_csv(csv_path)

raw_data.head()

In [None]:
# Inspect the dtypes of the first six columns (the ones later used as features).
raw_data.dtypes.iloc[:6]

In [None]:
# Replace the three categorical columns with integer codes ahead of the
# correlation step. One encoder is refit for every column, so after the loop
# it only retains the classes of the last column ('region_city').
labEn = LabelEncoder()

for col in ('gender', 'region_county', 'region_city'):
    raw_data[col] = labEn.fit_transform(raw_data[col])

# Re-check the dtypes of the first six columns after encoding.
raw_data.dtypes[:6]

In [None]:
# Pairwise correlations across the columns of the encoded table.
raw_data_corr = raw_data.corr()

# The first six columns are treated as features, everything after as labels.
all_columns = raw_data.columns.tolist()
features, labels = all_columns[:6], all_columns[6:]

# Absolute correlation between each label (columns) and the first six rows
# of the correlation matrix.
corrs_target = raw_data_corr[labels].iloc[:6].abs()

# Same table with every entry that is not above 0.1 masked to NaN.
rel_features = corrs_target.where(corrs_target > 0.1)

In [None]:
# Show the table of absolute feature-to-label correlations.
print(corrs_target)

In [None]:
# For every label column, report the row with the largest absolute correlation.
top_feature_per_label = corrs_target.idxmax()
print(top_feature_per_label)

In [None]:
# For each label, print the two strongest entries among the first six rows of
# the correlation matrix: first ranked by signed value, then by absolute value.

for label in labels:
    signed_top_two = raw_data_corr[label].iloc[:6].nlargest(2)
    print(signed_top_two)
    absolute_top_two = corrs_target[label].nlargest(2)
    print(absolute_top_two)

In [None]:
# Keep only absolute correlations above 0.5; every other cell shows as NaN.
corrs_target.where(corrs_target > 0.5)

In [None]:
# Keep only feature/label correlations above 0.5 (all other cells become NaN).
best_features_df = corrs_target[corrs_target > 0.5]

# Drop the label columns in which no feature cleared the threshold, i.e. the
# columns that contain nothing but NaN.
best_features_df = best_features_df.dropna(axis=1, how='all')

In [None]:
# Korean display names for the label columns that survived the 0.5 threshold.
# NOTE(review): names are paired with columns purely by position and zip()
# stops at the shorter sequence -- confirm this list still matches the columns
# left in best_features_df whenever the data or the threshold changes.
new_cols = [
    "총소비", "편의점", "요식", "고가 커피", "저가 커피",
    "배달앱", "택시", "주유소", "약국", "OTT",
]
cols = list(best_features_df.columns)

cols_dict = {old_name: new_name for old_name, new_name in zip(cols, new_cols)}
best_features_df.rename(columns=cols_dict, inplace=True)

In [None]:
# Display the table of strong (> 0.5) correlations with its renamed columns.
best_features_df

## 나이
cb03_tot: 편의점

cc02_tot: 대형 고가커피

cc03_tot: 소형 커피

cc04_tot: 배달 앱

cd02_tot: 택시

ce04_tot: 약국

cf18_tot: OTT

## 성별
cb03_tot: 편의점

cc01_tot: 식당

cd12_tot: 주유소

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Optional: switch seaborn to its "whitegrid" style for the plots below
sns.set(style="whitegrid")

In [None]:
# Work on an explicit copy of the selected columns: later cells add new columns
# to this frame, and assigning into a bare column subset of raw_data makes
# pandas emit SettingWithCopyWarning.
card_data_truncated = raw_data[
    ['age', 'gender', 'cb03_tot', 'cc01_tot', 'cc02_tot', 'cc03_tot',
     'cc04_tot', 'cd02_tot', 'cd12_tot', 'ce04_tot', 'cf18_tot']
].copy()

In [None]:
# Preview the first rows of the reduced table.
card_data_truncated.head()

In [None]:
# Human-readable gender label: 0 -> Female, 1 -> Male.
# NOTE(review): 'gender' is the LabelEncoder output from earlier in this
# notebook, so this mapping holds only if the raw values sort with female
# first -- verify against the CSV.
gender_names = {0: 'Female', 1: 'Male'}
card_data_truncated['Gender'] = card_data_truncated['gender'].map(gender_names)

# Ten unit-wide, left-closed bins centred on the integer codes 0..9.
# NOTE(review): the dataset notes at the top of this notebook describe age as
# 1.0 - 10.0; if the cleaned data really uses that range, every label is
# shifted by one bracket and 10.0 falls outside the bins -- confirm.
age_bins = [edge - 0.5 for edge in range(11)]
age_labels = [
    'Early 20s', 'Late 20s',
    'Early 30s', 'Late 30s',
    'Early 40s', 'Late 40s',
    'Early 50s', 'Late 50s',
    'Early 60s', 'Late 60s+',
]

# Bucket the numeric age code into the labelled brackets.
card_data_truncated['Age_Group'] = pd.cut(
    card_data_truncated['age'],
    bins=age_bins,
    labels=age_labels,
    right=False,
)

# Show the raw codes next to their derived labels.
print(card_data_truncated[['gender', 'Gender', 'age', 'Age_Group']])

# Correlation heatmap over gender, age and the selected spending columns.
corr_features = [
    'gender', 'age',
    'cb03_tot', 'cc02_tot', 'cc03_tot', 'cc04_tot',
    'cd02_tot', 'ce04_tot', 'cf18_tot', 'cc01_tot', 'cd12_tot',
]
corr_matrix = card_data_truncated.loc[:, corr_features].corr()

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# One box plot per spending column, split by age bracket.
age_spend_vars = ['cb03_tot', 'cc02_tot', 'cc03_tot', 'cc04_tot', 'cd02_tot', 'ce04_tot', 'cf18_tot']
for var in age_spend_vars:
    plt.figure(figsize=(8, 6))
    age_box_ax = sns.boxplot(data=card_data_truncated, x='Age_Group', y=var, palette='Set3')
    age_box_ax.set_title(f'Age Group vs {var}')
    age_box_ax.set_xlabel('Age Group')
    age_box_ax.set_ylabel('Total Percentage of Expenses')
    plt.show()

# One box plot per spending column, split by gender label.
gender_spend_vars = ['cb03_tot', 'cc01_tot', 'cd12_tot']
for var in gender_spend_vars:
    plt.figure(figsize=(6, 4))
    gender_box_ax = sns.boxplot(data=card_data_truncated, x='Gender', y=var, palette='Set2')
    gender_box_ax.set_title(f'Gender vs {var}')
    gender_box_ax.set_xlabel('Gender')
    gender_box_ax.set_ylabel('Total Percentage of Expenses')
    plt.show()

# Encode categorical features for clustering.
# Gender has only two labels, so LabelEncoder's alphabetical codes
# (Female -> 0, Male -> 1) are fine here.
le_gender = LabelEncoder()
card_data_truncated['Gender_Encoded'] = le_gender.fit_transform(card_data_truncated['Gender'])

# Age_Group is the ordered categorical produced by pd.cut. LabelEncoder sorts
# its classes alphabetically ('Early 20s', 'Early 30s', ..., 'Late 20s', ...),
# which scrambles the age order that distance-based clustering depends on.
# The categorical's own codes follow the bin order instead; rows whose age
# fell outside every bin get -1. The label for a code can be recovered with
# card_data_truncated['Age_Group'].cat.categories[code].
card_data_truncated['Age_Group_Encoded'] = (
    card_data_truncated['Age_Group'].cat.codes.astype('int64')
)

# Columns fed to the clustering step.
clustering_features = ['Gender_Encoded', 'Age_Group_Encoded']
X_cluster = card_data_truncated.loc[:, clustering_features]

# Fit the scaler on those columns, then standardise them with it.
scaler = StandardScaler()
scaler.fit(X_cluster)
X_scaled = scaler.transform(X_cluster)

# Elbow method: fit k-means for k = 1..9 and record each model's inertia.
K = range(1, 10)
inertia = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; the bend in the curve suggests a cluster count.
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(K, inertia, 'bo-')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal k')
plt.show()

# Final model with the chosen k (e.g., 3 based on the elbow plot); store each
# row's cluster assignment on the frame.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
clusters = kmeans.labels_
card_data_truncated['Cluster'] = clusters

# Age code against encoded gender, coloured by assigned cluster.
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(
    data=card_data_truncated,
    x='age',
    y='Gender_Encoded',
    hue='Cluster',
    palette='Set1',
    s=100,
)
scatter_ax.set_title('Clustering Based on Gender and Age')
scatter_ax.set_xlabel('Age')
scatter_ax.set_ylabel('Gender (0: Female, 1: Male)')
scatter_ax.legend(title='Cluster')
plt.show()

# Pairwise view of the two clustering inputs, coloured by cluster.
pair_vars = ['age', 'Gender_Encoded']
sns.pairplot(data=card_data_truncated, vars=pair_vars, hue='Cluster', palette='Set2')
# y > 1 lifts the figure-level title above the top row of panels.
plt.suptitle('Pair Plot of Age and Gender Colored by Cluster', y=1.02)
plt.show()

# Number of rows assigned to each cluster.
cluster_sizes = card_data_truncated['Cluster'].value_counts()
print(cluster_sizes)