In [24]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

# Import the libraries needed for multinomial naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Seed NumPy's legacy global RNG so anything drawing from it is repeatable.
SEED = 5
np.random.seed(SEED)

# 문제 정의
* 영화 리뷰에 다항분포 나이브베이즈 분류를 활용해 영화 리뷰가 긍정적인지 부정적인지 분류하기

# 데이터 수집

In [25]:
# Reviews contain words that carry a distinctly positive or negative meaning.
# Training also uses how many times each word occurs (not only whether it
# occurs), which is intended to give a more accurate classifier.

positive_reviews = [
    'this is great great movie. I will watch again',
    'I like this movie',
    'amazing movie in this year',
    'cool my boyfriend also said the movie is cool',
    'awesome of the awesome movie ever',
]
negative_reviews = [
    'shame I wasted money and time',
    # NOTE(review): 'move' and 'what' look like typos for 'movie' and 'watch';
    # kept verbatim because they end up as vocabulary entries downstream.
    'regret on this move. I will never never what movie from this director',
    'I do not like this movie',
    'I do not like actors in this movie',
    'boring boring sleeping movie',
]

# One record per review, positives first, then negatives.
review_list = (
    [{'movie_review': text, 'type': 'positive'} for text in positive_reviews]
    + [{'movie_review': text, 'type': 'negative'} for text in negative_reviews]
)

df = pd.DataFrame(review_list)
df

Unnamed: 0,movie_review,type
0,this is great great movie. I will watch again,positive
1,I like this movie,positive
2,amazing movie in this year,positive
3,cool my boyfriend also said the movie is cool,positive
4,awesome of the awesome movie ever,positive
5,shame I wasted money and time,negative
6,regret on this move. I will never never what m...,negative
7,I do not like this movie,negative
8,I do not like actors in this movie,negative
9,boring boring sleeping movie,negative


In [26]:
# Encode the sentiment as an integer class label: positive -> 1, negative -> 0.
sentiment_to_label = {'positive': 1, 'negative': 0}
df['label'] = df['type'].map(sentiment_to_label)
df

Unnamed: 0,movie_review,type,label
0,this is great great movie. I will watch again,positive,1
1,I like this movie,positive,1
2,amazing movie in this year,positive,1
3,cool my boyfriend also said the movie is cool,positive,1
4,awesome of the awesome movie ever,positive,1
5,shame I wasted money and time,negative,0
6,regret on this move. I will never never what m...,negative,0
7,I do not like this movie,negative,0
8,I do not like actors in this movie,negative,0
9,boring boring sleeping movie,negative,0


In [27]:
# Separate the model input (review text) from the target (integer label).
df_x, df_y = df['movie_review'], df['label']

In [28]:
# binary=True would only flag each word: 1 if present, 0 if absent.
# With binary left off (the default, spelled out here) occurrences are
# accumulated, so repeated words get counts greater than 1.
cv = CountVectorizer(binary=False)

# Learn the vocabulary from the training reviews and vectorize them in one step.
x_traincv = cv.fit_transform(df_x)

In [29]:
# Show the vocabulary learned by the fitted vectorizer (one entry per column).
cv.get_feature_names_out()

array(['actors', 'again', 'also', 'amazing', 'and', 'awesome', 'boring',
       'boyfriend', 'cool', 'director', 'do', 'ever', 'from', 'great',
       'in', 'is', 'like', 'money', 'move', 'movie', 'my', 'never', 'not',
       'of', 'on', 'regret', 'said', 'shame', 'sleeping', 'the', 'this',
       'time', 'wasted', 'watch', 'what', 'will', 'year'], dtype=object)

In [30]:
# Convert the vectorizer output to a dense NumPy array so the per-review
# word counts can be inspected directly.
encoded_input = x_traincv.toarray()
encoded_input

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2,
        0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0

In [31]:
# Map the first review's count vector back to the vocabulary words it contains.
first_review_counts = encoded_input[:1]  # slice (not index) to keep it 2-D
print(cv.inverse_transform(first_review_counts))

[array(['again', 'great', 'is', 'movie', 'this', 'watch', 'will'],
      dtype='<U9')]


## 다항 분포 나이브베이즈 분류

In [32]:
# Train the classifier on the existing (training) reviews.
y_train = df_y.astype(int)  # make sure the targets are integer class labels

mnb = MultinomialNB()
mnb.fit(x_traincv, y_train)


In [33]:
# Prepare the test reviews.

labelled_test_reviews = [
    ('great great great movie ever', 'positive'),
    ('I like this amazing movie', 'positive'),
    ('my boyfriend said great movie ever', 'positive'),
    ('cool cool cool', 'positive'),
    ('awesome boyfriend said cool movie ever', 'positive'),
    ('shame shame shame', 'negative'),
    ('awesome director shame movie boring movie', 'negative'),
    ('do not like this movie', 'negative'),
    ('I do not like this boring movie', 'negative'),
    ('aweful terrible boring movie', 'negative'),
]

# Same record layout as the training data: review text plus sentiment name.
test_feedback_list = [
    {'movie_review': text, 'type': sentiment}
    for text, sentiment in labelled_test_reviews
]

# Add the integer label column: positive -> 1, negative -> 0.
test_df = pd.DataFrame(test_feedback_list).assign(
    label=lambda frame: frame['type'].map({'positive': 1, 'negative': 0})
)

test_x, test_y = test_df['movie_review'], test_df['label']

# Vectorize the test reviews with the vectorizer already fitted on the
# training reviews (transform only, so the learned vocabulary is not re-fit).
x_testcv = cv.transform(test_x)

# 테스트

In [34]:
# Predict a class label (1 = positive, 0 = negative) for each test review.
predicted = mnb.predict(x_testcv)

# 정확도(Accuracy)

In [35]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_true=test_y, y_pred=predicted)

1.0