## 영화 추천 프로그램
- 제목을 틀려도 가장 유사한 제목을 알아서 찾아서 검색
- 대소문자 상관없이
- 한글은 제외

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [3]:
import pandas as pd
df = pd.read_csv("./data/data.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Desc,Unnamed: 0,author,genre,image_link,rating,title
0,0,We know that power is shifting: From West to E...,0.0,Moisés Naím,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.63,The End of Power: From Boardrooms to Battlefie...
1,1,Following the success of The Accidental Billio...,1.0,Blake J. Harris,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.94,"Console Wars: Sega, Nintendo, and the Battle t..."
2,2,How to tap the power of social software and ne...,2.0,Chris Brogan,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.78,Trust Agents: Using the Web to Build Influence...
3,3,William J. Bernstein is an American financial ...,3.0,William J. Bernstein,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.2,The Four Pillars of Investing
4,4,Amazing book. And I joined Steve Jobs and many...,4.0,Akio Morita,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.05,Made in Japan: Akio Morita and Sony


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2382 entries, 0 to 2381
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  2382 non-null   int64  
 1   Desc          2382 non-null   object 
 2   Unnamed: 0    1185 non-null   float64
 3   author        2382 non-null   object 
 4   genre         2382 non-null   object 
 5   image_link    2382 non-null   object 
 6   rating        2382 non-null   float64
 7   title         2382 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 149.0+ KB


In [5]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

# 아스키 문자 아니면 제거  
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128) # chr(128) : '\x80'

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

df['cleaned'] = df['Desc'].apply(_removeNonAscii)
df['cleaned'] = df.cleaned.apply(make_lower_case)
df['cleaned'] = df.cleaned.apply(remove_stop_words)
df['cleaned'] = df.cleaned.apply(remove_punctuation)
df['cleaned'] = df.cleaned.apply(remove_html)

In [6]:
# 제목도 특수문자 제거, 소문자화
df['title_cleaned'] = df['title'].apply(_removeNonAscii)
df['title_cleaned'] = df.title_cleaned.apply(make_lower_case)
df['title_cleaned'] = df.title_cleaned.apply(remove_stop_words)
df['title_cleaned'] = df.title_cleaned.apply(remove_punctuation)
df['title_cleaned'] = df.title_cleaned.apply(remove_html)
df.head(1)

Unnamed: 0.2,Unnamed: 0.1,Desc,Unnamed: 0,author,genre,image_link,rating,title,cleaned,title_cleaned
0,0,We know that power is shifting: From West to E...,0.0,Moisés Naím,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.63,The End of Power: From Boardrooms to Battlefie...,know power shifting west east north south pres...,end power boardrooms battlefields churches sta...


In [7]:
# (1) 가장 유사한 제목 검색
# 코사인 유사도, fuzzywuzzy 사용

In [8]:
# !pip install fuzzywuzzy python-Levenshtein

In [9]:
# 라이브러리 import
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

# 데이터 로드
df2 = df.copy()
titles = df2['title_cleaned'].tolist()
titles_origin = df2['title'].tolist()

In [10]:
# 코사인 유사도

# TF-IDF 벡터화
tfidf_vect = TfidfVectorizer()
tfidf_matrix = tfidf_vect.fit_transform(titles)

def find_similar_titles(query, top_n=3):
    
    # 입력 쿼리를 TF-IDF 벡터로 변환
    query_vector = tfidf_vect.transform([query])

    # 코사인 유사도 계산
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

    # 유사도 점수와 인덱스를 함께 저장
    similarity_with_index = list(enumerate(similarity_scores[0]))

    # 유사도 점수를 기준으로 내림차순 정렬
    sorted_similarity = sorted(similarity_with_index, key=lambda x: x[1], reverse=True)

    # 상위 N개 결과 추출
    top_results = sorted_similarity[:top_n]

    # 결과 반환 (제목과 유사도 점수)
    results = [(titles_origin[i], score) for i, score in top_results]
    print(f"'{query}'와(과) 가장 유사한 제목의 영화 검색:\n")
    print(f"코사인 기반 가장 유사한 제목:")
    for i, (title, score) in enumerate(results):
        print(f"{i+1}. {title:50} (유사도: {score:.4f})") # 제목 길이 50으로 맞추고, 유사도는 소수점 4자리까지 출력
    return results

In [11]:
# fuzzywuzzy 사용

def fuzzy_find_best_match(query, choices):
    best_match, score = process.extractOne(query, choices)
    rows = df2[df2['title_cleaned'] == best_match]
    best_match_title = rows['title'].iloc[0]
    return best_match_title, score

In [12]:
print("-----------------대소문자 오타 발생 시------------------\n")

find_similar_titles("the Da vinci codE")
best_match, score = fuzzy_find_best_match("the Da vinci codE", titles)
print(f'\nFuzzy Matching 기반 가장 유사한 제목:')
print(f'{best_match} (점수: {score})')

-----------------대소문자 오타 발생 시------------------

'the Da vinci codE'와(과) 가장 유사한 제목의 영화 검색:

코사인 기반 가장 유사한 제목:
1. The Da Vinci Code                                  (유사도: 0.8771)
2. Winnie-the-Pooh                                    (유사도: 0.2551)
3. Code Name Verity                                   (유사도: 0.2231)


[('The Da Vinci Code', 0.8771443900261693),
 ('Winnie-the-Pooh', 0.2550895649146426),
 ('Code Name Verity', 0.22310056769463926)]


Fuzzy Matching 기반 가장 유사한 제목:
The Da Vinci Code (점수: 95)


In [13]:
print("-----------------띄어쓰기 오류 발생 시------------------\n")

find_similar_titles("dav inci cod e")
best_match, score = fuzzy_find_best_match("dav inci cod e", titles)
print(f'\nFuzzy Matching 기반 가장 유사한 제목:')
print(f'{best_match} (점수: {score})')

-----------------띄어쓰기 오류 발생 시------------------

'dav inci cod e'와(과) 가장 유사한 제목의 영화 검색:

코사인 기반 가장 유사한 제목:
1. The End of Power: From Boardrooms to Battlefields and Churches to States, Why Being In Charge Isn't What It Used to Be (유사도: 0.0000)
2. Console Wars: Sega, Nintendo, and the Battle that Defined a Generation (유사도: 0.0000)
3. Trust Agents: Using the Web to Build Influence, Improve Reputation, and Earn Trust (유사도: 0.0000)


[("The End of Power: From Boardrooms to Battlefields and Churches to States, Why Being In Charge Isn't What It Used to Be",
  0.0),
 ('Console Wars: Sega, Nintendo, and the Battle that Defined a Generation',
  0.0),
 ('Trust Agents: Using the Web to Build Influence, Improve Reputation, and Earn Trust',
  0.0)]


Fuzzy Matching 기반 가장 유사한 제목:
The Da Vinci Code (점수: 89)


In [14]:
print("-----------------철자를 틀렸을 시 1------------------\n")

find_similar_titles("da vinciiii codhh")
best_match, score = fuzzy_find_best_match("da vinci codhh", titles)
print(f'\nFuzzy Matching 기반 가장 유사한 제목:')
print(f'{best_match} (점수: {score})')

-----------------철자를 틀렸을 시 1------------------

'da vinciiii codhh'와(과) 가장 유사한 제목의 영화 검색:

코사인 기반 가장 유사한 제목:
1. The Da Vinci Code                                  (유사도: 0.6175)
2. The End of Power: From Boardrooms to Battlefields and Churches to States, Why Being In Charge Isn't What It Used to Be (유사도: 0.0000)
3. Console Wars: Sega, Nintendo, and the Battle that Defined a Generation (유사도: 0.0000)


[('The Da Vinci Code', 0.6174876214538709),
 ("The End of Power: From Boardrooms to Battlefields and Churches to States, Why Being In Charge Isn't What It Used to Be",
  0.0),
 ('Console Wars: Sega, Nintendo, and the Battle that Defined a Generation',
  0.0)]


Fuzzy Matching 기반 가장 유사한 제목:
The Da Vinci Code (점수: 89)


In [15]:
print("-----------------철자를 틀렸을 시 2------------------\n")

find_similar_titles("dE voncI cOdee")
best_match, score = fuzzy_find_best_match("dE voncI cOdee", titles)
print(f'\nFuzzy Matching 기반 가장 유사한 제목:')
print(f'{best_match} (점수: {score})')

-----------------철자를 틀렸을 시 2------------------

'dE voncI cOdee'와(과) 가장 유사한 제목의 영화 검색:

코사인 기반 가장 유사한 제목:
1. The Thousand Autumns of Jacob de Zoet              (유사도: 0.4618)
2. The End of Power: From Boardrooms to Battlefields and Churches to States, Why Being In Charge Isn't What It Used to Be (유사도: 0.0000)
3. Console Wars: Sega, Nintendo, and the Battle that Defined a Generation (유사도: 0.0000)


[('The Thousand Autumns of Jacob de Zoet', 0.4618398305594133),
 ("The End of Power: From Boardrooms to Battlefields and Churches to States, Why Being In Charge Isn't What It Used to Be",
  0.0),
 ('Console Wars: Sega, Nintendo, and the Battle that Defined a Generation',
  0.0)]


Fuzzy Matching 기반 가장 유사한 제목:
The Thousand Autumns of Jacob de Zoet (점수: 86)


In [16]:
print("-----------------영화 제목의 일부만 검색했을 시------------------\n")

find_similar_titles("the end of the tower")
best_match, score = fuzzy_find_best_match("the end of the tower", titles)
print(f'\nFuzzy Matching 기반 가장 유사한 제목:')
print(f'{best_match} (점수: {score})')

-----------------영화 제목의 일부만 검색했을 시------------------

'the end of the tower'와(과) 가장 유사한 제목의 영화 검색:

코사인 기반 가장 유사한 제목:
1. Winnie-the-Pooh                                    (유사도: 0.4347)
2. The Dark Tower                                     (유사도: 0.3623)
3. My Life And Work (The Autobiography Of Henry Ford) (유사도: 0.3537)


[('Winnie-the-Pooh', 0.43466145166574033),
 ('The Dark Tower', 0.36225368281990367),
 ('My Life And Work (The Autobiography Of Henry Ford)', 0.3537315139373847)]


Fuzzy Matching 기반 가장 유사한 제목:
The End of Power: From Boardrooms to Battlefields and Churches to States, Why Being In Charge Isn't What It Used to Be (점수: 86)


In [17]:
##################################

In [18]:
# (3) 영화 추천 모델 구축

In [19]:
from gensim.models.doc2vec import TaggedDocument
from konlpy.tag import Okt
okt = Okt()

In [20]:
%%time
from tqdm import tqdm

tagged_corpus_list = []

for index, row in tqdm(df2.iterrows(), total=len(df2)):
  text = row['cleaned']
  tag = row['title']
  tagged_corpus_list.append(TaggedDocument(tags=[tag], words=okt.morphs(text)))

print('문서의 수 :', len(tagged_corpus_list))

100%|█████████████████████████████████████████████████████████████████████████████| 2382/2382 [00:14<00:00, 162.83it/s]

문서의 수 : 2382
CPU times: total: 17.5 s
Wall time: 14.7 s





In [21]:
from gensim.models import Doc2Vec
model = Doc2Vec(vector_size=300, workers=8, window=8)

In [22]:
# Vocabulary 빌드
model.build_vocab(tagged_corpus_list)

# Doc2Vec 학습
model.train(tagged_corpus_list, total_examples=model.corpus_count, epochs=20)

In [23]:
# 함수 작성
# fuzzywuzzy 함수 간략화
def fuzzy_find_best_match(query, choices):
    best_match, score = process.extractOne(query, choices)
    rows = df2[df2['title_cleaned'] == best_match]
    best_match_title = rows['title'].iloc[0]
    return best_match_title
    
# 도서 추천 함수 작성
def book_recommend(title):

    try:
        similar_doc = model.dv.most_similar(title)
        print(f'[{title}과(와) 유사한 추천 도서]\n')
        for i in similar_doc:
            print(i[0])
    except KeyError:
        print(f"입력하신 도서 '{title}'에 대한 정보를 찾을 수 없습니다.")

In [24]:
# 도서명(title) 입력 
title = input('도서명 입력: \n')
best_title = fuzzy_find_best_match(title, titles)
book_recommend(best_title)

도서명 입력: 
 da vinchhhhhhh codrrrr


[The Da Vinci Code과(와) 유사한 추천 도서]

State of Wonder
The Divine Comedy
Inferno
All the Light We Cannot See
Snow
Watership Down
Madame Bovary
The Girl with the Dragon Tattoo
Origin
The Drawing of the Three
