In [155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize  
from nltk.tokenize import WordPunctTokenizer 
from tensorflow.keras.preprocessing.text import text_to_word_sequence 
from math import log

In [3]:
from nltk.corpus import wordnet # 프린스턴 대학교에서 만든 어휘, 반의어들을 추출하는 에드온이다.

In [7]:
synsets = wordnet.synsets("plan")
synsets # n은 명사 v는 동사를 말한다.
# 단어에 대한 사전적 의미를 확인할 수 있다. 
# plan.n.01이 궁금하다고 하자. 

[Synset('plan.n.01'),
 Synset('design.n.02'),
 Synset('plan.n.03'),
 Synset('plan.v.01'),
 Synset('plan.v.02'),
 Synset('plan.v.03'),
 Synset('design.v.04')]

In [12]:
plan = wordnet.synset("plan.n.01")
plan.definition() 
# '수행해야 할 일련의 단계 또는 달성해야 할 목표'

'a series of steps to be carried out or goals to be accomplished'

In [13]:
planv = wordnet.synset("plan.v.01")
planv.definition() 
# '어떤 행동을 수행할 의지와 의도'

'have the will and intention to carry out some action'

In [14]:
planv.definition() # 유사성 검사이다.

'have the will and intention to carry out some action'

In [16]:
wordnet.synsets('boy')
wordnet.synsets('man')

[Synset('man.n.01'),
 Synset('serviceman.n.01'),
 Synset('man.n.03'),
 Synset('homo.n.02'),
 Synset('man.n.05'),
 Synset('man.n.06'),
 Synset('valet.n.01'),
 Synset('man.n.08'),
 Synset('man.n.09'),
 Synset('man.n.10'),
 Synset('world.n.08'),
 Synset('man.v.01'),
 Synset('man.v.02')]

In [17]:
boy = wordnet.synset('boy.n.02')
man = wordnet.synset('man.n.01')

In [18]:
boy.path_similarity(man)# 둘의 유사도는 0.5이다. 
# 인공지능 연구분야중 단어와 단어 사이의 관계를 추론하는 분야가 있다. 이런 상황에서 많이 쓰인다. 
# https://www.nltk.org/howto/wordnet.html    영어 유사도를 확인하는 사이트이다.

0.5

In [19]:
wordnet.synsets('car')
# car 단어에 5개의 동의어 그룹이 있다. 자동차, 명사, 인덱스 번호로 나눠져있다. 

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [20]:
wordnet.synsets('car')[0]# 0번째 인덱스를 추출한다. 

Synset('car.n.01')

In [22]:
car = wordnet.synset('car.n.01')# 정의를 확인할 수 있다.
car.definition() # 자동차에 대한 설명이 나온다.

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [23]:
car.lemma_names() #표제어에 해당하는 단어를 볼 수 있다.
#

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [24]:
wordnet.synsets('computer')

[Synset('computer.n.01'), Synset('calculator.n.01')]

In [27]:
computer = wordnet.synset('computer.n.01')
computer.definition() # 컴퓨터에 대한 사전적 설명이 나온다. 

'a machine for performing calculations automatically'

In [28]:
computer.lemma_names() # 연관된 내용들이 나온다. 

['computer',
 'computing_machine',
 'computing_device',
 'data_processor',
 'electronic_computer',
 'information_processing_system']

In [2]:
# tfidf기반 영화추천시스템
# 실행과정
# 즐겁게 봤던 영화 제목을 입력하세요. (인풋받고)
# 당신에게 추천하고 싶은 영화 제목은 아래와 같습니다.
# 10개 추천
# ------------------
# 개발과정
# 1) overview열을 추출한다 - > 코퍼스를 구성한다.
# 2) 단어들에 대한 전처리 작업을 수행한다. 또는 단어를 전처리 수행하고 코퍼스를 구성한다.
# 단어 전처리 : 불용어제거, 대소문자,특수문자 처리, 단어통일(wordnet 이용 추천), 정규표현식 사용도 고려하기
# 3) tfidf 행렬을 생성한다.
# 4) 코사인유사도로 영화를 추천한다.
data = pd.read_csv("archive/movies_metadata.csv")
data

  data = pd.read_csv("archive/movies_metadata.csv")


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [None]:
# 갯수는 5000편으로 제한한다. 어제 하던 방식으로 처리하라. 
# 만든 이후 카페에 제출한다.

In [30]:
prac = data.head(5000)
dataset1 = prac['overview']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,False,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",,43715,tt0050294,en,The Deadly Mantis,The calving of an Arctic iceberg releases a gi...,...,1957-05-01,0.0,79.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,This Was the Day That Engulfed the World in Te...,The Deadly Mantis,False,5.3,16.0
4996,False,,60000000,"[{'id': 18, 'name': 'Drama'}]",,10052,tt0259288,en,Dragonfly,A grieving doctor is being contacted by his la...,...,2002-02-22,52322400.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,When someone you love dies... are they gone fo...,Dragonfly,False,6.2,209.0
4997,False,"{'id': 217704, 'name': 'The Vampire Chronicles...",35000000,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",,11979,tt0238546,en,Queen of the Damned,Lestat de Lioncourt is awakened from his slumb...,...,2002-02-10,45479110.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,This time there are no interviews.,Queen of the Damned,False,5.5,247.0
4998,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,75151,tt0260746,en,Big Bad Love,Vietnam veteran Leon Barlow is struggling as a...,...,2001-10-11,0.0,111.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Big Bad Love,False,6.5,4.0


In [None]:
from nltk.tokenize import word_tokenize # 영어로 된 글자를 토큰화시킨다. 
from nltk.tokenize import WordPunctTokenizer # 위랑 비슷하다. 다만 점을 분리하는 방식이 다르다. 
from tensorflow.keras.preprocessing.text import text_to_word_sequence 

In [35]:
# 1차 단어토큰화
print('단어 토큰화1 :',word_tokenize(dataset1[0]))

단어 토큰화1 : ['Led', 'by', 'Woody', ',', 'Andy', "'s", 'toys', 'live', 'happily', 'in', 'his', 'room', 'until', 'Andy', "'s", 'birthday', 'brings', 'Buzz', 'Lightyear', 'onto', 'the', 'scene', '.', 'Afraid', 'of', 'losing', 'his', 'place', 'in', 'Andy', "'s", 'heart', ',', 'Woody', 'plots', 'against', 'Buzz', '.', 'But', 'when', 'circumstances', 'separate', 'Buzz', 'and', 'Woody', 'from', 'their', 'owner', ',', 'the', 'duo', 'eventually', 'learns', 'to', 'put', 'aside', 'their', 'differences', '.']


In [38]:
# 2차 단어 토큰화
print('단어 토큰화2 :',WordPunctTokenizer().tokenize(dataset1[0]))

단어 토큰화2 : ['Led', 'by', 'Woody', ',', 'Andy', "'", 's', 'toys', 'live', 'happily', 'in', 'his', 'room', 'until', 'Andy', "'", 's', 'birthday', 'brings', 'Buzz', 'Lightyear', 'onto', 'the', 'scene', '.', 'Afraid', 'of', 'losing', 'his', 'place', 'in', 'Andy', "'", 's', 'heart', ',', 'Woody', 'plots', 'against', 'Buzz', '.', 'But', 'when', 'circumstances', 'separate', 'Buzz', 'and', 'Woody', 'from', 'their', 'owner', ',', 'the', 'duo', 'eventually', 'learns', 'to', 'put', 'aside', 'their', 'differences', '.']


In [39]:
# 3차 단어 토큰화
print('단어 토큰화3 :',text_to_word_sequence(dataset1[0]))

단어 토큰화3 : ['led', 'by', 'woody', "andy's", 'toys', 'live', 'happily', 'in', 'his', 'room', 'until', "andy's", 'birthday', 'brings', 'buzz', 'lightyear', 'onto', 'the', 'scene', 'afraid', 'of', 'losing', 'his', 'place', 'in', "andy's", 'heart', 'woody', 'plots', 'against', 'buzz', 'but', 'when', 'circumstances', 'separate', 'buzz', 'and', 'woody', 'from', 'their', 'owner', 'the', 'duo', 'eventually', 'learns', 'to', 'put', 'aside', 'their', 'differences']


In [None]:
# 전체 추출한 결과. 2번을 쓸 것이다. 콤마가 명확하게 나눠져 있기에 가공이 더 원할하게 수행될 것으로 판단된다. 

In [47]:
test_token1= WordPunctTokenizer().tokenize(dataset1[0])
test_token1

['Led',
 'by',
 'Woody',
 ',',
 'Andy',
 "'",
 's',
 'toys',
 'live',
 'happily',
 'in',
 'his',
 'room',
 'until',
 'Andy',
 "'",
 's',
 'birthday',
 'brings',
 'Buzz',
 'Lightyear',
 'onto',
 'the',
 'scene',
 '.',
 'Afraid',
 'of',
 'losing',
 'his',
 'place',
 'in',
 'Andy',
 "'",
 's',
 'heart',
 ',',
 'Woody',
 'plots',
 'against',
 'Buzz',
 '.',
 'But',
 'when',
 'circumstances',
 'separate',
 'Buzz',
 'and',
 'Woody',
 'from',
 'their',
 'owner',
 ',',
 'the',
 'duo',
 'eventually',
 'learns',
 'to',
 'put',
 'aside',
 'their',
 'differences',
 '.']

In [43]:
import re
text = dataset1[0]

# 길이가 1~2인 단어들을 정규 표현식을 이용하여 삭제
shortword = re.compile(r'\W*\b\w{1,2}\b')
test_token2 = shortword.sub('', text) 
test_token2

'Led Woody, Andy toys live happily his room until Andy birthday brings Buzz Lightyear onto the scene. Afraid losing his place Andy heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns put aside their differences.'

In [51]:
test_token3= WordPunctTokenizer().tokenize(test_token2)
test_token3

['Led',
 'Woody',
 ',',
 'Andy',
 'toys',
 'live',
 'happily',
 'his',
 'room',
 'until',
 'Andy',
 'birthday',
 'brings',
 'Buzz',
 'Lightyear',
 'onto',
 'the',
 'scene',
 '.',
 'Afraid',
 'losing',
 'his',
 'place',
 'Andy',
 'heart',
 ',',
 'Woody',
 'plots',
 'against',
 'Buzz',
 '.',
 'But',
 'when',
 'circumstances',
 'separate',
 'Buzz',
 'and',
 'Woody',
 'from',
 'their',
 'owner',
 ',',
 'the',
 'duo',
 'eventually',
 'learns',
 'put',
 'aside',
 'their',
 'differences',
 '.']

In [52]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print('표제어 추출 전 :',test_token3)
print('표제어 추출 후 :',[lemmatizer.lemmatize(word) for word in test_token3])

표제어 추출 전 : ['Led', 'Woody', ',', 'Andy', 'toys', 'live', 'happily', 'his', 'room', 'until', 'Andy', 'birthday', 'brings', 'Buzz', 'Lightyear', 'onto', 'the', 'scene', '.', 'Afraid', 'losing', 'his', 'place', 'Andy', 'heart', ',', 'Woody', 'plots', 'against', 'Buzz', '.', 'But', 'when', 'circumstances', 'separate', 'Buzz', 'and', 'Woody', 'from', 'their', 'owner', ',', 'the', 'duo', 'eventually', 'learns', 'put', 'aside', 'their', 'differences', '.']
표제어 추출 후 : ['Led', 'Woody', ',', 'Andy', 'toy', 'live', 'happily', 'his', 'room', 'until', 'Andy', 'birthday', 'brings', 'Buzz', 'Lightyear', 'onto', 'the', 'scene', '.', 'Afraid', 'losing', 'his', 'place', 'Andy', 'heart', ',', 'Woody', 'plot', 'against', 'Buzz', '.', 'But', 'when', 'circumstance', 'separate', 'Buzz', 'and', 'Woody', 'from', 'their', 'owner', ',', 'the', 'duo', 'eventually', 'learns', 'put', 'aside', 'their', 'difference', '.']


In [55]:
from nltk.corpus import stopwords

In [74]:
dots = [",","."]
user_setting = list(stopwords.words('english'))
stop_words = set(user_setting) 
result = []
for word in test_token3: 
    if word not in stop_words: 
        result.append(word)# 단어가 만약 불용어사전에 없으면 담는다. 
    if word in dots: 
        result.pop()
test_token4 = result
test_token4

['Led',
 'Woody',
 'Andy',
 'toys',
 'live',
 'happily',
 'room',
 'Andy',
 'birthday',
 'brings',
 'Buzz',
 'Lightyear',
 'onto',
 'scene',
 'Afraid',
 'losing',
 'place',
 'Andy',
 'heart',
 'Woody',
 'plots',
 'Buzz',
 'But',
 'circumstances',
 'separate',
 'Buzz',
 'Woody',
 'owner',
 'duo',
 'eventually',
 'learns',
 'put',
 'aside',
 'differences']

In [None]:
N = len(docs) # 전체 문서의 길이

def tf(t, d): # 단어와 문서를 전달받는다. 
  return d.count(t) # t라는 단어의 빈도수를 본다. 

def idf(t):
  df = 0
  for doc in docs: 
    df += t in doc # 문서의 내부의 값이 있으면 df에 1씩 더한다. 
  return log(N/(df+1)) # 문서 전체를 로그로 씌운 후 전체문서 길이에 df+1을 나눈다. 

def tfidf(t, d):
  return tf(t,d)* idf(t) # tf와 idf를 곱하는 것이 tfidf이다. 

In [112]:
# 토탈리스트 제작 
total_list = []
for i in range(len(dataset1)):
    shortword = re.compile(r'\W*\b\w{1,2}\b')
    test_token2 = shortword.sub('', str(dataset1[i])) 
    test_token3= WordPunctTokenizer().tokenize(test_token2)
    test_token4 = []
    for i in test_token3:
        shortword2 = re.compile(r'\W')
        test_to = shortword2.sub('', i)
        test_token4.append(test_to)
    stop_words = set(stopwords.words('english'))
    result = []
    for word in test_token4: 
        if word not in stop_words: 
            result.append(word)
        if word in '': 
            result.pop()
    total_list.append(result)
print(total_list)

[['Led', 'Woody', 'Andy', 'toys', 'live', 'happily', 'room', 'Andy', 'birthday', 'brings', 'Buzz', 'Lightyear', 'onto', 'scene', 'Afraid', 'losing', 'place', 'Andy', 'heart', 'Woody', 'plots', 'Buzz', 'But', 'circumstances', 'separate', 'Buzz', 'Woody', 'owner', 'duo', 'eventually', 'learns', 'put', 'aside', 'differences'], ['When', 'siblings', 'Judy', 'Peter', 'discover', 'enchanted', 'board', 'game', 'opens', 'door', 'magical', 'world', 'unwittingly', 'invite', 'Alan', 'adult', 'trapped', 'inside', 'game', 'years', 'living', 'room', 'Alan', 'hope', 'freedom', 'finish', 'game', 'proves', 'risky', 'three', 'find', 'running', 'giant', 'rhinoceroses', 'evil', 'monkeys', 'terrifying', 'creatures'], ['family', 'wedding', 'reignites', 'ancient', 'feud', 'next', 'door', 'neighbors', 'fishing', 'buddies', 'John', 'Max', 'Meanwhile', 'sultry', 'Italian', 'divorcée', 'opens', 'restaurant', 'local', 'bait', 'shop', 'alarming', 'locals', 'worry', 'scare', 'fish', 'away', 'But', 'less', 'intereste

In [117]:
# 세트리스트 제작
set_list = set()
for i in total_list:
    for j in i:
        set_list.add(j)
set_list = list(set_list)
set_list

['missing',
 'Rath',
 'free',
 'together',
 'summer',
 'give',
 'tenth',
 'made',
 'able',
 'teens',
 'Croft',
 'wayward',
 'pursues',
 'burn',
 'steal',
 'Hanna',
 'Shepherd',
 'Dora',
 'confidants',
 'travels',
 'fact',
 'uncle',
 'desperate',
 'former',
 'whose',
 'switch',
 'Waxing',
 'running',
 'Bond',
 'action',
 'hockey',
 'process',
 'predicaments',
 'take',
 'execution',
 'protect',
 'Cleary',
 'almost',
 'sample',
 'Depicts',
 'place',
 'leader',
 'persuaded',
 'decorum',
 'separate',
 'gives',
 'Judy',
 'friends',
 'tribe',
 'sent',
 'wild',
 'GoldenEye',
 'remains',
 'Susan',
 'outrageous',
 'Former',
 'lord',
 'partners',
 'dollars',
 'guests',
 'leaving',
 'rules',
 'warrior',
 'pecking',
 'His',
 'mysterious',
 'business',
 'hopelessly',
 'provided',
 'York',
 'memory',
 'proves',
 'gambling',
 'throw',
 'orientation',
 'pirate',
 'company',
 'Zimm',
 'Chrissy',
 'bullies',
 'overcome',
 'treasure',
 'aware',
 'epic',
 'Bones',
 'children',
 'progress',
 'save',
 'attra

In [144]:
# 리스트 내 이름 유사도 체크 함수
def name_similarity(set_list):
    set_list2 =[]
    for i in set_list:
        if wordnet.synsets(i) == []:
            continue
        elif wordnet.synsets(i) !=[]:
            name_tag = wordnet.synsets(i)[0]
            searcher =name_tag.lemma_names()
            if i in searcher:
                set_list2.append(i)
            else:
                set_list2.append(searcher[0])
    return set_list2
            

In [148]:
set_list2 =list(set(name_similarity(set_list)))

In [146]:
N = len(total_list) # 전체 문서의 길이

def tf(t, d): # 단어와 문서를 전달받는다. 
    return d.count(t) # t라는 단어의 빈도수를 본다. 

def idf(t):
    df = 0
    for doc in total_list: 
        df += t in doc # 문서의 내부의 값이 있으면 df에 1씩 더한다. 
    return log(N/(df+1)) # 문서 전체를 로그로 씌운 후 전체문서 길이에 df+1을 나눈다. 

def tfidf(t, d):
    return tf(t,d)* idf(t) # tf와 idf를 곱하는 것이 tfidf이다. 

In [152]:
N = len(total_list) # 전체 문서의 길이

def tf(t, d): # 단어와 문서를 전달받는다. 
    return d.count(t) # t라는 단어의 빈도수를 본다. 

def idf(t):
    df = 0
    for doc in total_list: 
        if t in doc:
            df += 1 # 문서의 내부의 값이 있으면 df에 1씩 더한다. 
    return log(N/(df+1)) # 문서 전체를 로그로 씌운 후 전체문서 길이에 df+1을 나눈다. 

def tfidf(t, d):
    return tf(t,d)* idf(t) # tf와 idf를 곱하는 것이 tfidf이다. 

In [156]:
result = []

# 각 문서에 대해서 아래 연산을 반복
for i in range(N): # 전체 문서의 길이만큼 반복하라. 
    result.append([])# 결과에 비어있는 리스트를 대입하라.
    d = total_list[i] # i번째 문서를 d로 정의하라
    for j in range(len(set_list2)): # vocab의 길이만큼 확인하라
        t = set_list2[j] # vocap의 단어를 t에 대입해라.
        result[-1].append(tfidf(t, d)) # 과일의 라는 단어를 tf함수에 전달한다.
                                # tf(원하는 단어, 문서 전체)순으로 정렬하여 문서에 몇번 나왔는지 카운트한다. 
                                # 얘들이 들어가있다. 문서별 단어의 빈도수이다. 이것으로 데이터프레임을 만들었다. 
tfidf_ = pd.DataFrame(result, columns = set_list2)
tfidf_

Unnamed: 0,May,free,keep,background,summer,together,tenth,verbal,address,able,...,photograph,drinking,ashram,agoraphobic,half,recognize,miss,see,coerce,kind
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
cosine_similarity(tfidf_).shape

(50, 50)

In [177]:
ans = cosine_similarity(tfidf_)
ans

array([[1.        , 0.02859766, 0.        , ..., 0.        , 0.        ,
        0.03234151],
       [0.02859766, 1.        , 0.03562571, ..., 0.        , 0.        ,
        0.02372721],
       [0.        , 0.03562571, 1.        , ..., 0.        , 0.04195776,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.07961917,
        0.        ],
       [0.        , 0.        , 0.04195776, ..., 0.07961917, 1.        ,
        0.        ],
       [0.03234151, 0.02372721, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [179]:
mov_name= input("영화 이름을 입력하세요 : ")
index_no = 0
for i in range(len(prac)):
    if prac['title'][i] == mov_name:
        break
    else:
        index_no += 1 
listyy = []
ans[index_no][index_no] = 0
for i in range(0,50):
    while len(listyy) < 10:
        listyy.append(prac['title'].iloc[np.argmax(ans[index_no])])
        ans[index_no][np.argmax(ans[index_no])] = 0  
        if np.argmax(ans[index_no]) == 0:
            break
print("추천 영화는 다음과 같습니다.")
print(listyy)

영화 이름을 입력하세요 : Four Rooms


In [180]:
listyy = []
ans[index_no][index_no] = 0
for i in range(0,50):
    while len(listyy) < 10:
        listyy.append(prac['title'].iloc[np.argmax(ans[index_no])])
        ans[index_no][np.argmax(ans[index_no])] = 0  
        if np.argmax(ans[index_no]) == 0:
            break

print(listyy)

['Assassins', 'Toy Story', 'Restoration', 'The Usual Suspects', 'Babe', 'Clueless', 'Jumanji', 'Dangerous Minds', 'Dracula: Dead and Loving It', 'The American President']


영화 이름을 입력하세요 : Father of the Bride Part II
1
2
3
4
해당영화는 4번에 위치합니다.
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [183]:
total_list_Res=[]
for i in total_list:
    total_list_Res.append([])
    total_list_Res[-1] = name_similarity(i)
total_list_Res

[['light-emitting_diode',
  'woody',
  'plaything',
  'live',
  'happily',
  'room',
  'birthday',
  'bring',
  'buzz',
  'scene',
  'afraid',
  'lose',
  'place',
  'heart',
  'woody',
  'plot',
  'buzz',
  'merely',
  'circumstances',
  'separate',
  'buzz',
  'woody',
  'owner',
  'duo',
  'eventually',
  'learn',
  'put',
  'aside',
  'difference'],
 ['sibling',
  'Peter',
  'discover',
  'enchant',
  'board',
  'game',
  'open',
  'door',
  'magical',
  'world',
  'unwittingly',
  'invite',
  'adult',
  'trap',
  'inside',
  'game',
  'years',
  'living',
  'room',
  'hope',
  'freedom',
  'finish',
  'game',
  'professor',
  'risky',
  'three',
  'find',
  'running',
  'giant',
  'rhinoceros',
  'evil',
  'monkey',
  'terrify',
  'animal'],
 ['family',
  'wedding',
  'reignite',
  'ancient',
  'feud',
  'next',
  'door',
  'neighbor',
  'fishing',
  'buddy',
  'toilet',
  'soap',
  'interim',
  'sultry',
  'Italian',
  'open',
  'restaurant',
  'local',
  'bait',
  'shop',
  'dis