<a href="https://colab.research.google.com/github/nabomhalang/shinhan/blob/main/Collaborative%20Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
# Install third-party packages into the notebook runtime (IPython shell magics).
# The numpy / pandas / scikit-learn installs are left commented out
# (NOTE(review): presumably because the Colab runtime already ships them — confirm).
# NOTE(review): tensorflow and keras are not referenced by any cell visible in
# this section — confirm they are needed by later cells.
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
!pip install tensorflow
!pip install keras



# Collaborative Filtering(협업 필터링)
- Item-Based Filtering
  - 특정 아이템(Item)을 선택
      - 예시 : 함께 구매한 경우가 많은 상품 - '사이다'와 '콜라'
      1. 특정 Item을 좋아한 사용자들을 찾음
      2. 그 사용자들이 공통적으로 좋아했던 다른 Item을 찾음
      3. 해석 : "A" 상품을 좋아한 사용자는 "B" 상품도 좋아했습니다

In [3]:
import pandas as pd
import random

In [4]:
def _make_customer(customer_id):
    """Build one synthetic customer record that owns 0 to 6 distinct products."""
    record = {
        "CustomerID": customer_id,
        "Name": f"Customer_{customer_id}",
        "Age": random.randint(20, 70),
        "Occupation": random.choice(["Employed", "Self-Employed", "Unemployed", "Student"]),
        "Income": random.randint(20000, 200000),
        "MaritalStatus": random.choice(["Single", "Married"]),
    }
    # Draw the basket size first, then sample that many distinct ProductIDs
    # from the 1..500 catalog range (sampling is without replacement).
    basket_size = random.randint(0, 6)
    record["ProductIDs"] = random.sample(range(1, 501), basket_size)
    return record


def _make_product(product_id):
    """Build one synthetic financial-product record with a random category."""
    return {
        "ProductID": product_id,
        "ProductName": f"Product_{product_id}",
        "Category": random.choice(["Savings", "Credit", "Investment", "Insurance", "Mortgage"]),
        "Description": f"Description of Product {product_id}",
    }


# Customer data: 300 customers with IDs 1..300.
customers = [_make_customer(customer_id) for customer_id in range(1, 301)]
customers_df = pd.DataFrame(customers)

# Financial product data: 500 products with IDs 1..500.
products = [_make_product(product_id) for product_id in range(1, 501)]
products_df = pd.DataFrame(products)

In [5]:
# Preview the first five generated customers (ProductIDs holds a list per row).
customers_df.head(5)

Unnamed: 0,CustomerID,Name,Age,Occupation,Income,MaritalStatus,ProductIDs
0,1,Customer_1,62,Student,100888,Single,"[24, 244, 15, 226, 269]"
1,2,Customer_2,38,Student,113758,Single,"[410, 282]"
2,3,Customer_3,65,Self-Employed,120214,Married,"[345, 254, 229]"
3,4,Customer_4,64,Student,165057,Single,"[455, 12, 265, 411, 474, 267]"
4,5,Customer_5,56,Student,72841,Married,"[206, 269]"


In [6]:
# Preview the first five generated products.
products_df.head(5)

Unnamed: 0,ProductID,ProductName,Category,Description
0,1,Product_1,Credit,Description of Product 1
1,2,Product_2,Insurance,Description of Product 2
2,3,Product_3,Credit,Description of Product 3
3,4,Product_4,Credit,Description of Product 4
4,5,Product_5,Investment,Description of Product 5


In [7]:
# Unroll each customer's ProductIDs list into one record per (customer, product)
# pair; customers with an empty list contribute no records.
expanded_rows = [
    {**row.to_dict(), 'ProductID': purchased_id}
    for _, row in customers_df.iterrows()
    for purchased_id in row['ProductIDs']
]
expanded_customers_df = pd.DataFrame(expanded_rows)

# Customer-product interaction data: every observed pair gets Interaction = 1.
# `assign` returns a new DataFrame, so the selection above is never mutated.
interaction_data = expanded_customers_df[['CustomerID', 'ProductID']].assign(Interaction=1)

# Product x customer matrix (ProductID as the row index); unobserved pairs are
# filled with 0. Only products and customers that appear in at least one
# interaction show up as rows/columns.
pivot_table = pd.pivot_table(
    interaction_data,
    values='Interaction',
    index='ProductID',
    columns='CustomerID',
    fill_value=0,
)

pivot_table.head()

CustomerID,1,2,3,4,5,6,8,9,10,11,...,290,292,293,294,295,296,297,298,299,300
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of pivot_table rows. Rows are ProductIDs,
# so this is an item-item similarity matrix; at this point it is a plain array
# without product labels.
item_based_collabor = cosine_similarity(pivot_table)
item_based_collabor

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [9]:
# Re-wrap the raw similarity array as a DataFrame labelled by ProductID on both
# axes, so similarities can be looked up by product ID rather than by position.
similarity_labels = pivot_table.index
item_based_collabor = pd.DataFrame(
    item_based_collabor,
    index=similarity_labels,
    columns=similarity_labels,
)
item_based_collabor

ProductID,2,3,5,6,7,8,10,12,13,14,...,490,492,493,494,495,496,497,498,499,500
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.316228,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.316228,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288675,...,0.0,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0
498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0
499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# 특정 금융상품과 다른 모든 금융상품들 간의 유사성을 나타냅니다.
1. 피벗 테이블 생성
2. 코사인 유사도 계산
3. 특정 상품과 가장 유사한 상품 찾기
4. 상세 정보 출력

In [27]:
def get_item_based_collabor_details(product_id, item_similarity_df, products_df, top_n=3):
    """Return the products most similar to ``product_id`` with their catalog details.

    :param product_id: ID of the product to find neighbours for.
    :param item_similarity_df: Item-item similarity DataFrame labelled by
        product ID on both the index and the columns.
    :param products_df: Product catalog with ``ProductID``, ``ProductName``,
        ``Category`` and ``Description`` columns.
    :param top_n: Maximum number of similar products to return.
    :return: DataFrame with columns ``ProductID``, ``Similarity`` (a percentage
        string such as ``"57.74%"``), ``ProductName``, ``Category`` and
        ``Description``, ordered from most to least similar. A message string
        is returned instead when ``product_id`` is missing from the similarity
        index or when no candidate products remain.
    """
    if product_id not in item_similarity_df.index:
        return f"ProductID {product_id}는 유사한 상품이 존재하지 않습니다."

    # Remove the query product by label rather than assuming it sorts first:
    # another product with an identical purchase vector also scores 1.0, and a
    # positional slice could then return the query product itself.
    candidates = item_similarity_df[product_id].drop(labels=product_id, errors="ignore")

    # A stable sort keeps tied scores in their original index order, so the
    # result is deterministic. max(..., 0) keeps a negative top_n from
    # selecting rows via head()'s negative-count semantics.
    similar_items = candidates.sort_values(ascending=False, kind="stable").head(max(top_n, 0))
    if similar_items.empty:
        return f"ProductID {product_id}는 유사한 상품이 존재하지 않습니다."

    # Index the catalog once instead of filtering it for every similar product.
    # Keeping the first row per ProductID matches a "first match" lookup.
    catalog = products_df.drop_duplicates(subset="ProductID").set_index("ProductID")

    details = [
        {
            "ProductID": similar_id,
            "Similarity": "{:.2f}%".format(score * 100),
            "ProductName": catalog.at[similar_id, "ProductName"],
            "Category": catalog.at[similar_id, "Category"],
            "Description": catalog.at[similar_id, "Description"],
        }
        for similar_id, score in similar_items.items()
        # Products absent from the catalog are skipped.
        if similar_id in catalog.index
    ]

    # Fix the column schema so the frame has the same columns even when empty.
    return pd.DataFrame(
        details,
        columns=["ProductID", "Similarity", "ProductName", "Category", "Description"],
    )

In [29]:
# Example: look up the (default top_n=3) products most similar to ProductID 240.
similar_products_details = get_item_based_collabor_details(240, item_based_collabor, products_df)
similar_products_details

Unnamed: 0,ProductID,Similarity,ProductName,Category,Description
0,331,57.74%,Product_331,Mortgage,Description of Product 331
1,51,35.36%,Product_51,Investment,Description of Product 51
2,29,35.36%,Product_29,Investment,Description of Product 29
