https://wikidocs.net/30708

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the description CSVs for popular and non-popular titles; the first CSV column is used as the index.
popular=pd.read_csv("/content/drive/MyDrive/Github/netflix_EDA/data/popular_des.csv",index_col=0)
no_popular = pd.read_csv("/content/drive/MyDrive/Github/netflix_EDA/data/non_popular_des.csv",index_col=0)

In [3]:
# Show the first five rows of the popular-title descriptions
popular.head()

Unnamed: 0,description_x
1,"On a photo shoot in Ghana, an American model s..."
4,When the father of the man she loves insists t...
5,"Blackmailed by his company's CEO, a low-level ..."
7,A rookie cop with one day to prove himself to ...
8,When a powerful businesswoman’s political ambi...


In [4]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups corpus (default subset) with headers, footers and quotes removed,
# shuffled with a fixed seed.
# NOTE(review): `documents` is not referenced again in the cells visible here — the analysis
# below runs on the Netflix descriptions instead. Confirm whether this cell is still needed.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

11314

In [5]:
import numpy as np

# Turn each DataFrame into a plain Python list of row lists
popular_list = popular.to_numpy().tolist()
no_popular_list = no_popular.to_numpy().tolist()
popular_list[:5]

[['On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past.'],
 ['When the father of the man she loves insists that his twin sons marry twin sisters, a woman creates an alter ego that might be a bit too convincing.'],
 ["Blackmailed by his company's CEO, a low-level employee finds himself forced to spy on the boss's rival and former mentor."],
 ["A rookie cop with one day to prove himself to a veteran LAPD narcotics officer receives a crash course in his mentor's questionable brand of justice."],
 ['When a powerful businesswoman’s political ambitions are threatened by her underworld connections, the ensuing power struggle could cost her everything.']]

In [6]:
import itertools

# Build the flat lists of descriptions straight from the DataFrames (row-major order)
# rather than re-flattening popular_list / no_popular_list in place. The in-place
# version was not idempotent: running the cell a second time on the already-flat
# lists would split every description string into single characters.
popular_list = popular.to_numpy().ravel().tolist()
no_popular_list = no_popular.to_numpy().ravel().tolist()

In [7]:
# Text preprocessing
news_df = pd.DataFrame({'document': popular_list})
# Replace every non-letter character with a space.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave the text uncleaned.
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop short words: keep only words longer than 3 characters
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# Lowercase every remaining word
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  after removing the cwd from sys.path.


In [8]:
# Cleaned text of the document with index label 1
news_df.loc[1, 'clean_doc']

'when father loves insists that twin sons marry twin sisters woman creates alter that might convincing'

In [9]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')  # English stop-word list from NLTK
# Membership is tested once per token, so look tokens up in a set instead of
# scanning the list every time; `stop_words` itself is left as the original list.
stop_word_set = set(stop_words)
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  # tokenize on whitespace
# Remove the stop words from every token list
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Tokens of the document with index label 1 after stop-word removal
print(tokenized_doc.loc[1])

['father', 'loves', 'insists', 'twin', 'sons', 'marry', 'twin', 'sisters', 'woman', 'creates', 'alter', 'might', 'convincing']


In [11]:
# 3) Build the TF-IDF matrix
# Detokenize: join every token list back into one space-separated string
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

news_df['clean_doc'] = detokenized_doc

In [12]:
# Detokenized text of the document with index label 1
news_df.loc[1, 'clean_doc']

'father loves insists twin sons marry twin sisters woman creates alter might convincing'

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,  # keep the top 1,000 words
    max_df=0.5,
    smooth_idf=True,
)

X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape  # check the size of the TF-IDF matrix

(1577, 1000)

In [14]:
from sklearn.decomposition import TruncatedSVD

# Fit a 20-component truncated SVD on the TF-IDF matrix (fit returns the estimator itself)
svd_model = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(X)
len(svd_model.components_)

20

In [15]:
# Shape of the fitted SVD component matrix
svd_model.components_.shape


(20, 1000)

In [16]:
# Vocabulary learned by the vectorizer (the 1,000 retained words).
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: iterable of 1-D NumPy arrays, one row of term weights per
            topic (e.g. the ``components_`` matrix of a fitted decomposition).
        feature_names: sequence mapping a column position to its term.
        n: number of top terms to print per topic.

    Prints one line per topic in the form
    ``Topic <k>: [(term, weight), ...]`` with weights rounded to 5 decimals.
    Returns None.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending, so walk it backwards to take the n largest weights first
        top_ids = weights.argsort()[:-n - 1:-1]
        # Convert to a plain float so the printed list shows 0.34865 rather than
        # np.float64(0.34865) on NumPy >= 2.
        top_terms = [(feature_names[i], float(weights[i].round(5))) for i in top_ids]
        print("Topic %d:" % topic_no, top_terms)
# Print the top terms of every SVD topic
get_topics(components=svd_model.components_, feature_names=terms)

Topic 1: [('young', 0.34865), ('life', 0.26953), ('love', 0.21869), ('family', 0.20009), ('woman', 0.19048)]
Topic 2: [('love', 0.60574), ('life', 0.22898), ('romance', 0.17976), ('falls', 0.17668), ('fall', 0.14227)]
Topic 3: [('young', 0.61371), ('woman', 0.4004), ('help', 0.12159), ('loves', 0.10318), ('couple', 0.10212)]
Topic 4: [('life', 0.46218), ('father', 0.15833), ('takes', 0.14946), ('town', 0.13961), ('crime', 0.13195)]
Topic 5: [('life', 0.4894), ('family', 0.32061), ('past', 0.20567), ('lives', 0.09255), ('struggling', 0.09193)]
Topic 6: [('family', 0.41064), ('town', 0.30892), ('small', 0.23787), ('young', 0.18147), ('secrets', 0.13583)]
Topic 7: [('father', 0.61819), ('help', 0.14352), ('love', 0.12547), ('daughter', 0.10466), ('face', 0.09801)]
Topic 8: [('world', 0.32757), ('girl', 0.30872), ('young', 0.18875), ('dark', 0.16831), ('secrets', 0.16524)]
Topic 9: [('help', 0.39328), ('finds', 0.20436), ('couple', 0.17783), ('world', 0.16336), ('life', 0.15663)]
Topic 10:

In [17]:
# First five tokenized documents
tokenized_doc.head()

0    [photo, shoot, ghana, american, model, slips, ...
1    [father, loves, insists, twin, sons, marry, tw...
2    [blackmailed, company, level, employee, finds,...
3    [rookie, prove, veteran, lapd, narcotics, offi...
4    [powerful, businesswoman, political, ambitions...
Name: clean_doc, dtype: object

In [18]:
from gensim import corpora

# Assign an integer id to every token, then encode each document as (token_id, count) pairs
dictionary = corpora.Dictionary(tokenized_doc)
corpus = list(map(dictionary.doc2bow, tokenized_doc))
print(corpus[1])  # bag-of-words of the second document (the first document has index 0)


[(16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1)]


In [19]:
# Look up the token stored under id 66 in the gensim dictionary
print(dictionary[66])

detective


In [20]:
# Number of entries in the gensim dictionary
len(dictionary)

6141

In [21]:
import gensim
NUM_TOPICS = 20  # number of topics, k = 20
# LDA training is stochastic; without a seed the topics differ on every run.
# random_state pins the result (same seed value as the SVD cell uses).
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=122,
)
# Show the four highest-weighted words of each topic
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.008*"life" + 0.008*"girl" + 0.008*"world" + 0.007*"finds"')
(1, '0.009*"life" + 0.006*"woman" + 0.005*"father" + 0.005*"underworld"')
(2, '0.010*"couple" + 0.009*"home" + 0.008*"young" + 0.006*"friends"')
(3, '0.016*"life" + 0.010*"young" + 0.009*"three" + 0.004*"family"')
(4, '0.009*"young" + 0.007*"woman" + 0.007*"family" + 0.006*"gang"')
(5, '0.014*"young" + 0.007*"must" + 0.005*"family" + 0.005*"group"')
(6, '0.011*"home" + 0.010*"young" + 0.009*"love" + 0.007*"friends"')
(7, '0.007*"death" + 0.007*"family" + 0.006*"help" + 0.006*"home"')
(8, '0.017*"young" + 0.007*"woman" + 0.006*"girl" + 0.005*"desperate"')
(9, '0.011*"brother" + 0.007*"love" + 0.006*"girl" + 0.005*"father"')
(10, '0.010*"family" + 0.009*"young" + 0.007*"years" + 0.006*"love"')
(11, '0.008*"life" + 0.007*"father" + 0.005*"love" + 0.005*"small"')
(12, '0.009*"lives" + 0.006*"becomes" + 0.005*"find" + 0.005*"friends"')
(13, '0.006*"family" + 0.005*"unlikely" + 0.005*"father" + 0.005*"goes"')
(14, '0.011*"you

In [22]:
# Print the topics again, this time with print_topics() called with its default arguments
print(ldamodel.print_topics())

[(0, '0.008*"life" + 0.008*"girl" + 0.008*"world" + 0.007*"finds" + 0.005*"dark" + 0.005*"caught" + 0.004*"take" + 0.004*"back" + 0.004*"young" + 0.004*"human"'), (1, '0.009*"life" + 0.006*"woman" + 0.005*"father" + 0.005*"underworld" + 0.004*"daughter" + 0.004*"family" + 0.004*"wife" + 0.004*"brother" + 0.004*"past" + 0.004*"prepare"'), (2, '0.010*"couple" + 0.009*"home" + 0.008*"young" + 0.006*"friends" + 0.006*"mother" + 0.006*"moves" + 0.005*"lives" + 0.005*"family" + 0.004*"finds" + 0.004*"group"'), (3, '0.016*"life" + 0.010*"young" + 0.009*"three" + 0.004*"family" + 0.003*"mother" + 0.003*"escape" + 0.003*"town" + 0.003*"small" + 0.003*"behind" + 0.003*"forced"'), (4, '0.009*"young" + 0.007*"woman" + 0.007*"family" + 0.006*"gang" + 0.006*"life" + 0.006*"truth" + 0.005*"father" + 0.005*"crime" + 0.005*"mysterious" + 0.004*"town"'), (5, '0.014*"young" + 0.007*"must" + 0.005*"family" + 0.005*"group" + 0.005*"woman" + 0.005*"whose" + 0.004*"college" + 0.004*"based" + 0.004*"drama" + 

In [23]:
# Install pyLDAvis, which is imported further down for the LDA visualisation
!pip install pyLDAvis



# popular movie topic classification

In [25]:
# Make sure pyLDAvis is installed before importing it
!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()
# Prepare the visualisation from the trained LDA model, the bag-of-words corpus and the dictionary
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
vis



  from collections import Iterable
  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [26]:
# NOTE(review): this upgrade runs after every analysis cell visible above, and an upgraded
# pandas normally only takes effect once the runtime is restarted — confirm whether this
# cell is still needed or should move to the top of the notebook.
!pip install pandas --upgrade

