In [1]:
# https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
# https://drive.google.com/file/d/15Q7DZ7xrJsI2Hji-WbkU9j1mwnODBd5A/view?usp=sharing
# https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [2]:
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import re

from PIL import Image
from io import BytesIO

from nltk.tokenize import RegexpTokenizer
import nltk
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity # 코사인 유사도

In [3]:
!pip install gensim



In [4]:
!pip install Mecab



In [5]:
!pip install konlpy



In [6]:
df = pd.read_csv('./dataset/data.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Desc,Unnamed: 0.1,author,genre,image_link,rating,title
0,0,We know that power is shifting: From West to E...,0.0,Moisés Naím,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.63,The End of Power: From Boardrooms to Battlefie...
1,1,Following the success of The Accidental Billio...,1.0,Blake J. Harris,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.94,"Console Wars: Sega, Nintendo, and the Battle t..."
2,2,How to tap the power of social software and ne...,2.0,Chris Brogan,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.78,Trust Agents: Using the Web to Build Influence...
3,3,William J. Bernstein is an American financial ...,3.0,William J. Bernstein,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.2,The Four Pillars of Investing
4,4,Amazing book. And I joined Steve Jobs and many...,4.0,Akio Morita,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.05,Made in Japan: Akio Morita and Sony


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2382 entries, 0 to 2381
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2382 non-null   int64  
 1   Desc          2382 non-null   object 
 2   Unnamed: 0.1  1185 non-null   float64
 3   author        2382 non-null   object 
 4   genre         2382 non-null   object 
 5   image_link    2382 non-null   object 
 6   rating        2382 non-null   float64
 7   title         2382 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 149.0+ KB


In [8]:
df.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,rating
count,2382.0,1185.0,2382.0
mean,1226.073887,596.875949,3.995223
std,716.658518,346.103136,0.217733
min,0.0,0.0,3.08
25%,603.25,296.0,3.87
50%,1208.5,601.0,4.0
75%,1854.75,897.0,4.14
max,2450.0,1193.0,4.65


In [9]:
import utils_parser as parser

In [10]:
parser.make_lower_case('Abc')

'abc'

In [11]:
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

In [12]:
df['cleaned'] = df['Desc'].apply(_removeNonAscii)
df['cleaned'] = df['cleaned'].apply(make_lower_case)
df['cleaned'] = df['cleaned'].apply(remove_stop_words)
df['cleaned'] = df['cleaned'].apply(remove_punctuation)
df['cleaned'] = df['cleaned'].apply(remove_html)

In [None]:
df.head()

In [13]:
df['cleaned'].replace('', np.nan, inplace = True)

In [14]:
# notna() - 누락값 여부(누락값이 있으면 False)
df = df[df['cleaned'].notna()]

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2381 entries, 0 to 2381
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2381 non-null   int64  
 1   Desc          2381 non-null   object 
 2   Unnamed: 0.1  1185 non-null   float64
 3   author        2381 non-null   object 
 4   genre         2381 non-null   object 
 5   image_link    2381 non-null   object 
 6   rating        2381 non-null   float64
 7   title         2381 non-null   object 
 8   cleaned       2381 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 186.0+ KB


In [16]:
corpus = []
for words in df['cleaned']:
    corpus.append(words.split())

In [17]:
corpus

[['know',
  'power',
  'shifting',
  'west',
  'east',
  'north',
  'south',
  'presidential',
  'palaces',
  'public',
  'squares',
  'formidable',
  'corporate',
  'behemoths',
  'nimble',
  'startups',
  'and',
  'slowly',
  'surely',
  'men',
  'women',
  'power',
  'merely',
  'shifting',
  'dispersing',
  'also',
  'decaying',
  'power',
  'today',
  'constrained',
  'risk',
  'losing',
  'ever',
  'before',
  'end',
  'power',
  'award',
  'winning',
  'columnist',
  'former',
  'foreign',
  'policy',
  'editor',
  'moiss',
  'nam',
  'illuminates',
  'struggle',
  'once',
  'dominant',
  'megaplayers',
  'new',
  'micropowers',
  'challenging',
  'every',
  'field',
  'human',
  'endeavor',
  'drawing',
  'provocative',
  'original',
  'research',
  'nam',
  'shows',
  'antiestablishment',
  'drive',
  'micropowers',
  'topple',
  'tyrants',
  'dislodge',
  'monopolies',
  'open',
  'remarkable',
  'new',
  'opportunities',
  'also',
  'lead',
  'chaos',
  'paralysis',
  'nam',

In [18]:
!pip install Word2Vec

Collecting Word2Vec
  Using cached word2vec-0.11.1.tar.gz (42 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Building wheels for collected packages: Word2Vec
  Building wheel for Word2Vec (PEP 517): started
  Building wheel for Word2Vec (PEP 517): finished with status 'error'
Failed to build Word2Vec


  ERROR: Command errored out with exit status 1:
   command: 'C:\Users\user\Desktop\.ipynb_checkpoints\python.exe' 'C:\Users\user\Desktop\.ipynb_checkpoints\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py' build_wheel 'C:\Users\user\AppData\Local\Temp\tmpvj11p6lo'
       cwd: C:\Users\user\AppData\Local\Temp\pip-install-bjxvotgn\word2vec_ad65ebc187654e47a7475a9b7b4ff8d4
  Complete output (47 lines):
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib
  creating build\lib\word2vec
  copying word2vec\io.py -> build\lib\word2vec
  copying word2vec\scripts_interface.py -> build\lib\word2vec
  copying word2vec\utils.py -> build\lib\word2vec
  copying word2vec\wordclusters.py -> build\lib\word2vec
  copying word2vec\wordvectors.py -> build\lib\word2vec
  copying word2vec\_generated_version.py -> build\lib\word2vec
  copying word2vec\__init__.py -> build\lib\word2vec
  creating build\lib\word2vec\tests
  copying word2vec\tests\test_cor

In [20]:
# 벡터화시킬 단어 수 = 300
word2vec_model = Word2Vec(size = 300, window = 5, min_count = 2, workers = 1)
word2vec_model.build_vocab(corpus)

In [24]:
word2vec_model.wv.vectors_lockf=np.ones(len(word2vec_model.wv), dtype=np.float32)

TypeError: object of type 'Word2Vec' has no len()

In [None]:
# gensim 3.x -> Word2Vec.intersect_word2vec_format
# gensim 4.x -> Word2Vec.wv.intersect_word2vec_format
word2vec_model.wv.intersect_word2vec_format('./dataset/GoogleNews-vectors-negative300.bin.gz', lockf = 1.0, binary = True)

In [None]:
word2vec_model.train(corpus, total_examples = word2vec_model.corpus_count, epochs = 15)

In [None]:
# 단어 벡터 평균
def get_doc_vectors(doc_list):
    doc_embedding_list = []
    
    for line in doc_list:
        doc2vec = None
        count = 0
        
        for word in line.split():
            if word in word2vec_model.wv.index_to_key:
                count += 1
                # 문서에 있는 모든 단어들의 벡터를 연산
                if doc2vec in None:
                    # doc2vec = word2vec_model[word]
                    doc2vec = word2vec_model.wv[word]
                else:
                    doc2vec = doc2vec + word2vec_model.wv[word]
        
        # 단어 벡터를 모두 연산한 벡터의 값을 문서 길이로 나누기
        if doc2vec is not None:
            doc2vec = doc2vec / count
            doc_embedding_list.append(doc2vec)
            
    return doc_embedding_list

In [None]:
doc_embedding_list = get_doc_vectors(df['cleaned'])
print(f'문서 벡터 수: {len(doc_embedding_list)}')

In [None]:
cosine_similarities = cosine_similarity(doc_embedding_list, doc_embedding_list)
print(cosine_similarities.shape)

In [None]:
def recommendations(title):
    columns = df[['title', 'image_link']]
    
    indices = pd.Series(df.index, index = df['title']).drop_duplicates()
    idx = indices[title]
    
    # 입력한 데이터와 유사한 데이터 추천
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6] # 상위 5개
    
    _indices = [i[0] for i in sim_scores]
    
    recommend = columns.iloc[_indices].reset_index(drop = True)
    
    fig = plt.figure(figsize = (20, 30))
    
    for index, row in recommend.iterrows():
        response = requests.get(row['image_link'])
        img = Image.open(BytesIO(response.content))
        fig.add_subplot(1, 5, index + 1)
        plt.imshow(img)
        plt.title(row['title'])

In [None]:
recommendations('The Hunger Games')