<a href="https://colab.research.google.com/github/nooblette/SentimentAnalysis/blob/main/MovieReview_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
%matplotlib inline
#!apt-get update
#!apt-get install g++ openjdk-8-jdk python-dev python3-dev
#!pip3 install JPype1-py3
#!pip3 install konlpy
#!JAVA_HOME="/usr/lib/jvm/java-1.11.0-openjdk-amd64"
#!pip install sklearn

import jpype
print(jpype.isJVMStarted()) # reports whether a JVM is already running in this process (truthy when started)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from urllib.request import urlopen, urlretrieve
from urllib.error import URLError, HTTPError
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score

import warnings
warnings.filterwarnings('ignore')
feeling_kw_movie = dict()  # Feeling keyword list for each movie_id

# Korean stop-word list, one word per line, downloaded from the project
# repository. The response is opened in a `with` block so the HTTP connection
# is closed as soon as the body has been read.
with urlopen("https://raw.githubusercontent.com/nooblette/SentimentAnalysis/main/K_Stopword.txt") as stop_words_response:
    stop_words = stop_words_response.read().decode('utf-8').split('\n')
stop_words += ['영화']  # extra stop word specific to movie reviews ("movie")

def rating_to_label(rating):
    """Convert a numeric rating into a binary label.

    Returns 1 when ``rating`` is strictly greater than 5 and 0 otherwise.
    """
    return 1 if rating > 5 else 0

# Matches every character that is NOT a space, a Hangul jamo (ㄱ-ㅣ), a Hangul
# syllable (가-힣) or an ASCII digit. Compiled once at definition time instead
# of on every call. The original class also contained a '|', which inside
# [...] is a literal pipe (not alternation) and therefore let '|' characters
# slip through the filter.
_NON_HANGUL_PATTERN = re.compile(r'[^ ㄱ-ㅣ가-힣0-9]')


def apply_regular_expression(text):
    """Strip everything except Hangul, ASCII digits and spaces from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character outside the allowed set removed.
        Allowed characters keep their original order and spacing.
    """
    return _NON_HANGUL_PATTERN.sub('', text)

def remove_stopword(text):
    """Return the items of ``text`` that do not appear in ``stop_words``.

    ``text`` is iterated item by item (the caller passes a list of words);
    the surviving items are returned as a new list in their original order.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Shared Okt tagger, created on first use. The original built a new Okt()
# inside every call, i.e. once per review being tokenized.
_okt_tagger = None


def text_cleaning(text):
    """Tokenize a review into the nouns used as features.

    Steps, in order:
      1. filter the raw text through ``apply_regular_expression``;
      2. extract nouns with the Okt tagger;
      3. keep only nouns that are at least two characters long;
      4. drop stop words via ``remove_stopword``.

    Args:
        text: Raw review text.

    Returns:
        List of the remaining nouns, in the order the tagger produced them.
    """
    global _okt_tagger
    if _okt_tagger is None:
        _okt_tagger = Okt()

    cleaned = apply_regular_expression(text)
    nouns = _okt_tagger.nouns(cleaned)  # nouns only
    nouns = [noun for noun in nouns if len(noun) > 1]  # two or more characters
    return remove_stopword(nouns)

# Positive / negative keyword lists per movie:
#   movie_id (str) -> {'positive': [[word, coefficient], ...],
#                      'negative': [[word, coefficient], ...]}
movie_id_feeling_keywords = dict()

# Only a small sample of ids is processed here. The full id range
# (10001 .. 142621) is not used because there is not enough data to run the
# analysis for every movie.
for i in range(10001, 10010):
    movie_id = str(i)

    # ------------------------------------------------------------------ #
    # Fetch the data
    # ------------------------------------------------------------------ #
    # Only the download is guarded: ids for which the server answers with an
    # HTTP error are skipped. Any other failure still surfaces.
    try:
        with urlopen("https://raw.githubusercontent.com/e9t/nsmc/master/raw/" + movie_id + ".json") as url:
            review_json = json.loads(url.read().decode('utf-8'))
    except HTTPError:
        continue

    # ------------------------------------------------------------------ #
    # Label the reviews (1 = positive, 0 = negative)
    # ------------------------------------------------------------------ #
    count = {1: 0, 0: 0}  # number of positive / negative reviews
    for review in review_json:
        review['label'] = rating_to_label(int(review['rating']))
        count[review['label']] += 1

    # Skip movies without enough training data in either class. This is
    # checked before vectorizing so skipped movies cost no tokenization work.
    if count[0] < 2 or count[1] < 2:
        continue

    # ------------------------------------------------------------------ #
    # Bag of words
    # ------------------------------------------------------------------ #
    vect = CountVectorizer(tokenizer=text_cleaning)
    bow_vect = vect.fit_transform(review['review'] for review in review_json)
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases.
    if hasattr(vect, 'get_feature_names_out'):
        word_list = vect.get_feature_names_out()
    else:
        word_list = vect.get_feature_names()
    count_list = bow_vect.toarray().sum(axis=0)
    # word -> total number of occurrences over all reviews of this movie
    word_count_dict = dict(zip(word_list, count_list))

    # ------------------------------------------------------------------ #
    # TF-IDF
    # ------------------------------------------------------------------ #
    tfidf_vectorizer = TfidfTransformer()
    # One row per review, one column per word.
    tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)

    x = tf_idf_vect
    y = [review['label'] for review in review_json]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

    # The per-class count check above does not guarantee that both classes
    # land in the training split; fitting on a single class raises, so such
    # movies are skipped like the other "not enough data" cases.
    if len(set(y_train)) < 2:
        continue

    # ------------------------------------------------------------------ #
    # Training
    # ------------------------------------------------------------------ #
    lr = LogisticRegression(random_state=0)
    lr.fit(x_train, y_train)

    # Predictions on the held-out split; uncomment the prints to evaluate.
    y_pred = lr.predict(x_test)
    # print('accuracy: %.2f' % accuracy_score(y_test, y_pred))
    # print('precision: %.2f' % precision_score(y_test, y_pred))

    # ------------------------------------------------------------------ #
    # Positive / negative keyword analysis
    # ------------------------------------------------------------------ #
    # (coefficient, column index) pairs: largest first for positive keywords,
    # smallest first for negative keywords.
    coef_pos_index = sorted(((value, index) for index, value in enumerate(lr.coef_[0])), reverse=True)
    coef_neg_index = sorted(((value, index) for index, value in enumerate(lr.coef_[0])), reverse=False)

    # column index -> word
    invert_index_vectorizer = {v: k for k, v in vect.vocabulary_.items()}

    # Store the top-20 keywords per movie, keeping only coefficients with the
    # matching sign.
    movie_id_feeling_keywords[movie_id] = {
        'positive': [[invert_index_vectorizer[coef[1]], coef[0]] for coef in coef_pos_index[:20] if coef[0] > 0.0],
        'negative': [[invert_index_vectorizer[coef[1]], coef[0]] for coef in coef_neg_index[:20] if coef[0] < 0.0],
    }

print("SAMPLE OUTPUT movie_id 10001 ~ 10009")
for sample_id in map(str, range(10001, 10010)):
    # Movies skipped by the analysis loop have no entry in
    # movie_id_feeling_keywords; indexing them directly raised KeyError.
    keywords = movie_id_feeling_keywords.get(sample_id)
    if keywords is None:
        continue
    print('--- movie_id is ', sample_id)
    print('positive keywords', keywords['positive'])
    print('negative keywords', keywords['negative'])
    print('\n')

# Write one line per movie: "<movie_id> : <positive list>, <negative list>".
# The `with` block closes the file even if a write raises.
with open("output.txt", "w", encoding='utf-8') as f:
    for movie_id, keywords in movie_id_feeling_keywords.items():
        positive = keywords['positive']
        negative = keywords['negative']
        f.write(f'{movie_id} : {positive}, {negative}\n')


# References
# https://hyemin-kim.github.io/2020/08/29/E-Python-TextMining-2/
# https://rfriend.tistory.com/475
# https://www.ranks.nl/stopwords/korean
# https://shakeratos.tistory.com/18

True
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
SAMPLE OUTPUT movie_id 10001 ~ 10009
--- movie_id is  10001
positive keywords [['최고', 0.17876777638710703], ['감동', 0.15382916612878633], ['명작', 0.15352480190898277], ['장면', 0.14869051138463435], ['정말', 0.14759777527434995], ['인생', 0.1244513753073513], ['프레', 0.10969079849873513], ['지금', 0.10234712629306816], ['기억', 0.10039688702096738], ['토토', 0.0895341698519974], ['다시', 0.08310456594672783], ['보고', 0.08254858175976286], ['사랑', 0.08078618988401763], ['현실', 0.08037752254429444], ['보기', 0.07966786892853783], ['나이', 0.07857144124838787], ['눈물', 0.07574250179528513], ['처음', 0.07086280792147524], ['여운', 0.06835186879435845], ['표현', 0.06647216744582057]]
negative keywords [['대체', -0.6981684884610792], ['편집', -0.5859268696705412], ['극찬', -0.5100817046212394], ['노잼', -0.5100817046212394], ['불륜', -0.40529838645896926], ['성매매', -0.40529838645896926], ['스토킹', -0.40529838645896926], 

# 새 섹션

In [None]:
""