In [86]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [87]:
import pandas as pd
import numpy as np
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MachineLearningPractice/Colab-Notebooks/FarmKit/foodfile/만개의레시피.csv')
df = df.dropna()
df.shape

(2343, 7)

In [88]:
df.reset_index(inplace=True)
df.drop(['index', 'id'], inplace=True, axis=1)
df.rename(columns={'설명':'description'}, inplace=True)
df.head()

Unnamed: 0,카테고리명,분류기준,식품명,description,ingre,recipe
0,양식,그라탕류,라따뚜이그라탕,"It is a gratin using ratatouille, a traditiona...","Tomato, eggplant, zucchini, penne, tomato sauc...","Prepare tomatoes, eggplant, and zucchini by cu..."
1,간식,토스트류,햄치즈토스트,Sliced ​​ham and cheese are placed on bread an...,"White bread, sliced ​​ham, cheddar cheese, can...",Prepare the cabbage and carrots by slicing the...
2,밑반찬,건어물류,마른미역자반,It is a side dish of dried seaweed fried in oi...,"Dried seaweed, perilla oil, cooking oil, sugar...",Heat perilla oil and cooking oil in a pan. Whe...
3,한식,찌개류,돼지고기된장찌개,It is a stew made by slicing pork and putting ...,"Pork leg meat, green onion, onion, zucchini, r...",Cut the onion and pumpkin into appropriate siz...
4,간식,빵류,고구마치즈빵,A sweet-tasting bread topped with boiled sweet...,"Steamed sweet potato, mayonnaise, honey, mozza...","Mix flour, milk, melted butter and salt in a b..."


In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_description = tfidf.fit_transform(df['description'])
tfidf_matrix_ingre = tfidf.fit_transform(df['ingre'])
tfidf_matrix_recipe = tfidf.fit_transform(df['recipe'])

In [90]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

cosine_sim_description = cosine_similarity(tfidf_matrix_description, tfidf_matrix_description)
cosine_sim_ingre = cosine_similarity(tfidf_matrix_ingre, tfidf_matrix_ingre)
cosine_sim_recipe = cosine_similarity(tfidf_matrix_recipe, tfidf_matrix_recipe)
distance_sim_description = euclidean_distances(tfidf_matrix_description, tfidf_matrix_description)
distance_sim_ingre = euclidean_distances(tfidf_matrix_ingre, tfidf_matrix_ingre)
distance_sim_recipe = euclidean_distances(tfidf_matrix_recipe, tfidf_matrix_recipe)

In [91]:
indices = pd.Series(df.index, index=df['식품명']).drop_duplicates()
# 음식명을 입력받으면 코사인 유사도를 통해서 가장 유사도가 높은 상위 10개의 음식 목록 반환
def get_recommendations(title, sim, reverse):
  
  # 음식명을 통해서 전체 데이터 기준 그 음식의 index 값을 얻기
  idx = indices[title]

  # 유사도 매트릭스 에서 idx 에 해당하는 데이터를 (idx, 유사도) 형태로 얻기
  sim_scores = list(enumerate(sim[idx]))

  # 유사도 기준 정렬
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=reverse)    # 코사인 유사도는 True, 유클리디안 유사도는 False를 넣어줘야함

  # 자기 자신을 제외한 10개의 추천 음식을 슬라이싱
  sim_scores = sim_scores[1:11]

  # 추천 음식목록 10개의 인덱스 정보 추출
  movie_indices = [i[0] for i in sim_scores]

  # 인덱스 정보를 통해 음식명 추출
  return list(df['식품명'].iloc[movie_indices])

In [92]:
def to_dataframe (cos_description, euc_description, cos_ingre, euc_ingre, cos_recipe, euc_recipe):
    df = pd.DataFrame()
    df['cos description'] = cos_description
    df['euc description'] = euc_description
    df['cos ingre'] = cos_ingre
    df['euc ingre'] = euc_ingre
    df['cos recipe'] = cos_recipe
    df['euc recipe'] = euc_recipe
    return df

In [93]:
to_dataframe(get_recommendations('고구마치즈빵', cosine_sim_description, True), get_recommendations('고구마치즈빵', distance_sim_description, False), get_recommendations('고구마치즈빵', cosine_sim_ingre, True), get_recommendations('고구마치즈빵', distance_sim_ingre, False), get_recommendations('고구마치즈빵', cosine_sim_recipe, True), get_recommendations('고구마치즈빵', distance_sim_recipe, False))

Unnamed: 0,cos description,euc description,cos ingre,euc ingre,cos recipe,euc recipe
0,더블치즈토스트,부각,아코디언꿀고구마,아코디언꿀고구마,제티체크쿠키&제티설기,제티체크쿠키&제티설기
1,치즈마늘빵,더블치즈토스트,로즈연유브레드,로즈연유브레드,감자피자,감자피자
2,베이컨치즈토스트,치즈마늘빵,콘치즈와플,콘치즈와플,소금빵,소금빵
3,프로틴빵,베이컨치즈토스트,하이토스트,하이토스트,공갈빵,공갈빵
4,구름빵,프로틴빵,소금빵,소금빵,고구마베이컨말이,고구마베이컨말이
5,리얼고구마빵&리얼 감자빵,구름빵,모닝빵,모닝빵,참깨크래커,참깨크래커
6,하이토스트,리얼고구마빵&리얼 감자빵,더블치즈토스트,더블치즈토스트,시나몬롤,시나몬롤
7,크림통식빵,하이토스트,떠먹는 고구마케이크,떠먹는 고구마케이크,모닝빵,모닝빵
8,고구마치즈볼,크림통식빵,고구마치즈볼,고구마치즈볼,감자찹쌀구이,감자찹쌀구이
9,몬테크리스토 샌드위치,고구마치즈볼,땅콩러스크,땅콩러스크,사과파이,사과파이


In [94]:
!pip install transformers
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [95]:
# [CLS] token을 얻기 위한 함수
def get_cls_token(sent_A):
    model.eval()
    tokenized_sent = tokenizer(
            sent_A,
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            max_length=128
    )
    with torch.no_grad():# 그라디엔트 계산 비활성화
        outputs = model(    # **tokenized_sent
            input_ids=tokenized_sent['input_ids'],
            attention_mask=tokenized_sent['attention_mask'],
            token_type_ids=tokenized_sent['token_type_ids']
            )
    logits = outputs.last_hidden_state[:,0,:].detach().cpu().numpy()
    return logits

In [96]:
def cls_token_array(df, column) :
  dataset_cls_hidden = []
  for i in range(df.shape[0]):
      q_cls = get_cls_token(str(df.loc[i, column])) #column 예: '음식설명', '영문'
      dataset_cls_hidden.append(q_cls)
  dataset_cls_hidden = np.array(dataset_cls_hidden).squeeze(axis=1)
  #print(dataset_cls_hidden)   
  print(dataset_cls_hidden.shape) # column에 대한 [CLS] 토큰 벡터
  return dataset_cls_hidden

column = 'description'
dataset_cls_hidden = cls_token_array(df, column)

(2343, 768)


In [97]:
def bert_food_recommendation(df, dataset_cls_hidden, column, food, similarity): 
  
  index = df[df['식품명'] == food].index[0]
  food_cls_hidden = get_cls_token(str(df.loc[index, column]))

  sim = similarity(food_cls_hidden, dataset_cls_hidden)
  #print(cos_sim)
  if similarity == cosine_similarity:
    top = np.argpartition(sim, -10)[:, -10:] # 상위 10개 인덱스 추출
  else :
    top = np.argpartition(sim, 10)[:, :10]

  return list(df.iloc[top[0]]['식품명'])

In [100]:
def to_dataframe2 (cos_description, euc_description, cos_ingre, euc_ingre, cos_recipe, euc_recipe):
    df = pd.DataFrame()
    df['cos description'] = cos_description
    df['euc description'] = euc_description
    df['cos ingre'] = cos_ingre
    df['euc ingre'] = euc_ingre
    df['cos recipe'] = cos_recipe
    df['euc recipe'] = euc_recipe
    return df

In [101]:
to_dataframe2(bert_food_recommendation(df, dataset_cls_hidden, 'description', '고구마치즈빵', cosine_similarity), bert_food_recommendation(df, dataset_cls_hidden, 'description', '고구마치즈빵', euclidean_distances), bert_food_recommendation(df, dataset_cls_hidden, 'ingre', '고구마치즈빵', cosine_similarity), bert_food_recommendation(df, dataset_cls_hidden, 'ingre', '고구마치즈빵', euclidean_distances), bert_food_recommendation(df, dataset_cls_hidden, 'recipe', '고구마치즈빵', cosine_similarity), bert_food_recommendation(df, dataset_cls_hidden, 'recipe', '고구마치즈빵', euclidean_distances))

Unnamed: 0,cos description,euc description,cos ingre,euc ingre,cos recipe,euc recipe
0,감드위치&포테딕트,고구마치즈빵,진미채무침,관자샐러드,군밤,구겔호프
1,레몬머랭파이,크로크마담,치즈베이컨그라탕,누들샐러드,된장가지구이,명란감자채볶음
2,갸또쇼콜라,옥수수케이크,오징어볶음&소면,땅콩러스크,쉬폰파운드케이크,달걀버거
3,달콤하지롤&레몬너한테피치나,달콤하지롤&레몬너한테피치나,불낙볶음,오징어볶음&소면,구겔호프,찹쌀치즈어묵
4,리스샐러드피자,갸또쇼콜라,관자샐러드,연어치즈말이,봉골레수제비,스파이더맨고추장참치볶음밥
5,꿀땅콩스낵,리스샐러드피자,토마토처트니,불낙볶음,스파이더맨고추장참치볶음밥,봉골레수제비
6,옥수수케이크,감귤샌드,연어치즈말이,베이컨감자구이,명란감자채볶음,쉬폰파운드케이크
7,카프레제샐러드,꿀땅콩스낵,만능볶음밥,만능볶음밥,찹쌀치즈어묵,구운명란치즈주먹밥
8,고구마치즈빵,감드위치&포테딕트,땅콩러스크,막창국수&달걀찜,달걀버거,무스비
9,크로크마담,레몬머랭파이,누들샐러드,구운가지토마토샐러드,구운명란치즈주먹밥,밤양갱
