<a href="https://colab.research.google.com/github/nooblette/SentimentAnalysis/blob/main/MovieReview_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
%matplotlib inline
#!apt-get update
#!apt-get install g++ openjdk-8-jdk python-dev python3-dev
#!pip3 install JPype1-py3
#!pip3 install konlpy
#!JAVA_HOME="/usr/lib/jvm/java-1.11.0-openjdk-amd64"
#!pip install sklearn

import jpype
print(jpype.isJVMStarted()) # True: a JVM is already running in this process, False: not started

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from urllib.request import urlopen, urlretrieve
from urllib.error import URLError, HTTPError
from socket import error as SocketError
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import random
import errno
import os


def extract_feeling_kw(movie_id_list):
  feeling_kw_movie = dict()  # Save feeling keword list for each movie_id
  saved_movie_id_list = list()  # 리뷰가 충분하여 최종적으로 학습을 진행한 movie_id만을 저장
  stop_words = urlopen("https://raw.githubusercontent.com/nooblette/SentimentAnalysis/main/K_Stopword.txt")
  stop_words = list((str(stop_words.read(), 'utf-8').split('\n')))  # 불용어 단어 리스트
  stop_words += ['영화']  # 영화 리뷰 특화 불용어 단어 리스트 추가
  cwd = os.getcwd()  # 파일 저장 경로

  def rating_to_label(rating):
    if rating > 5:
        return 1
    else:
        return 0

  def apply_regular_expression(text):
    hangul = re.compile('[^ ㄱ-ㅣ 가-힣|0-9]')  # 한글 추출 규칙: 띄어 쓰기(1 개)를 포함한 한글
    result = hangul.sub('', text)  # 위에 설정한 "hangul"규칙을 "text"에 적용(.sub)시킴
    return result

  def remove_stopword(text):
    return [word for word in text if not word in stop_words]

  def text_cleaning(text):
    corpus = apply_regular_expression(text)  # 특수문자 제거
    okt = Okt()  # 명사 형태소 추출 함수
    corpus = okt.nouns(corpus)  # 명사만 추출
    corpus = [x for x in corpus if len(x) > 1]  # 두 글자 이상 단어만 사용
    corpus = remove_stopword(corpus)  # 불용어 제거
    return corpus

  movie_id_feeling_keywords = dict()  # 영화별 긍정, 부정 키워드 리스트 저장

  for i in movie_id_list:  # movie_id 리스트에 대해 진행
    try:
      movie_id = str(i)
      # ***************************** #
      # ****** 데이터 저장하기 ****** #
      # ***************************** #
      with urlopen("https://raw.githubusercontent.com/nooblette/SentimentAnalysis/main/new_review_data/" + movie_id + ".json") as url:

        review_json_file = url.read()
        review_json = json.loads(review_json_file)

        with open(cwd +"/new_review_data/" + movie_id + ".json", 'w') as outfile:
          outfile.write(json.dumps(review_json, ensure_ascii=False))
    
    except HTTPError as e:
      continue
    except SocketError as e:
      if e.errno != errno.ECONNRESET:
        raise # Not error we are looking for
      continue # Handle error here.


  for i in movie_id_list:  # 모든 영화에 대한 리뷰 데이터(.json)를 저장
    movie_id = str(i)
    # ***************************** #
    # ****** 데이터 가져오기 ****** #
    # ***************************** #
    with open(cwd + "/new_review_data/" + movie_id + ".json") as inputfile:
      review_json_file2 = inputfile.read()
      review_json = json.loads(review_json_file2)
      # file format : review, rating, movie_id = review_json[idx]['review'], review_json[idx]['rating'], review_json[idx]['movie_id']

      vect = CountVectorizer(tokenizer = lambda x: text_cleaning(x))
      bow_vect = vect.fit_transform(review['review'] for review in review_json)
      word_list = vect.get_feature_names_out()
      count_list = bow_vect.toarray().sum(axis=0)

      # "단어" - "총 등장 횟수" Matching
      word_count_dict = dict(zip(word_list, count_list))

      # ************************** #
      # ********* TF-IDF ********* #
      # ************************** #
      tfidf_vectorizer = TfidfTransformer()
      tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)
      # row -> 한 리뷰를 의미, column -> 한 단어를 의미.

      invert_index_vectorizer = {v: k for k, v in vect.vocabulary_.items()}

      count = {1:0, 0:0}  # 긍정 리뷰와 부정 리뷰의 개수 저장
      for review in review_json:
        review['label'] = rating_to_label(int(review['rating']))
        count[review['label']] += 1

      # 긍정, 부정 리뷰 데이터가 각 7개 미만이면 충분하지 않다고 판단, 넘어가도록 진행
      # test_size를 전체 data의 30%를 이용하는데, 7*0.3 = 2.1
      if count[0] < 7 or count[1] < 7:
        continue

      saved_movie_id_list.append(int(movie_id))  # 리뷰 수가 충분한 영화의 id만을 저장

      x = tf_idf_vect
      y = [review['label'] for review in review_json]
      x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=1)

      '''# train, test data set size 확인
      print("movie_id :", movie_id, end = ' ')
      print("train data :", x_train.shape, len(y_train), end="  /  ")
      print("test data :", x_test.shape, len(y_test))'''



      # ***************************** #
      # ********* 학습 진행 ********* #
      # ***************************** #
      # fit in training set
      lr = LogisticRegression(random_state = 0)
      lr.fit(x_train, y_train)

      # predict in test set
      y_pred = lr.predict(x_test)



      # ***************************** #
      # ********* 모델 평가 ********* #
      # ***************************** #
      # classification result for test set

      #print('accuracy: %.2f' % accuracy_score(y_test, y_pred))
      #print('precision: %.2f' % precision_score(y_test, y_pred))

      '''# 예측 경향 출력
      confu = confusion_matrix(y_true = y_test, y_pred = y_pred)

      plt.figure(figsize=(4, 3))
      sns.heatmap(confu, annot=True, annot_kws={'size':15}, cmap='OrRd', fmt='.10g')
      plt.title('Confusion Matrix')
      plt.show()
      '''

      # ***************************************** #
      # ********* 긍정 부정 키워드 분석 ********* #
      # ***************************************** #
      coef_pos_index = sorted(((value, index) for index, value in enumerate(lr.coef_[0])), reverse = True)  # 긍정 키워드 리스트
      coef_neg_index = sorted(((value, index) for index, value in enumerate(lr.coef_[0])), reverse = False) # 부정 키워드 리스트

      invert_index_vectorizer = {v: k for k, v in vect.vocabulary_.items()}

      # 영화별로 키워드 리스트 저장
      movie_id_feeling_keywords[movie_id] = {
          'positive' : [[invert_index_vectorizer[coef[1]], coef[0]] for coef in coef_pos_index[:20] if coef[0] > 0.0],
          'negative' : [[invert_index_vectorizer[coef[1]], coef[0]] for coef in coef_neg_index[:20] if coef[0] < 0.0]
      }

  print("OUTPUT movie_id random")
  for i in saved_movie_id_list:
    i = str(i)
    print('--- movie_id is ', i)
    print('positive keywords', movie_id_feeling_keywords[i]['positive'])
    print('negative keywords', movie_id_feeling_keywords[i]['negative'])
    print('\n')

  '''
  # keyowrds list를 txt file로 저장
  f = open("keywords_list.txt", "w", encoding = 'utf-8')

  for movie_id, keywords in movie_id_feeling_keywords.items():
    positive = keywords['positive']
    negative = keywords['negative']
    f.write(f'{movie_id} : {positive}, {negative}\n')

  f.close()
  '''

# TEST: run the keyword extraction on a fixed sample of movie ids.
extract_feeling_kw([
    10001, 113635, 113648, 114226, 105224, 105239,
    105246, 105249, 105257, 105275, 105289, 105329,
    105389, 105501, 105520, 105521, 105530, 105548,
    105559, 105563, 105579, 105755, 105774, 105783,
    105854, 105880, 105973, 106001, 106052, 106065,
])

# 참고한 코드
# https://hyemin-kim.github.io/2020/08/29/E-Python-TextMining-2/
# https://rfriend.tistory.com/475
# https://www.ranks.nl/stopwords/korean
# https://shakeratos.tistory.com/18

True
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
OUTPUT movie_id random
--- movie_id is  113635
positive keywords [['액션', 0.4583201306495161], ['최고', 0.3507215215012141], ['액션영화', 0.3434433481667756], ['진짜', 0.29328348628224216], ['스토리', 0.24907506997031775], ['최고다', 0.22934043385427524], ['정말', 0.20916171580872941], ['이영화', 0.19178022344754705], ['진심', 0.13981746687034102], ['마지막', 0.12971448966484664], ['리얼', 0.12625946870955598], ['지금', 0.11845382723963356], ['수준', 0.11700984797957967], ['장면', 0.11268491725325425], ['이건', 0.11134853158352889], ['스턴트맨', 0.11110971476614387], ['아저씨', 0.11090170840940633], ['정도', 0.10845237099658692], ['인도네시아', 0.10616940312120751], ['무술', 0.10333816650562754]]
negative keywords [['성우', -0.8651652306692494], ['시네마', -0.8553149134362792], ['평점', -0.7520160258166976], ['좀비', -0.690272991649727], ['주방', -0.6477820090255247], ['멕시코', -0.61176420145305], ['살해', -0.61176420145305], ['감성', -0.

# 새 섹션

In [None]:
""