<a href="https://www.kaggle.com/ominiv/sentiment-analysis-unsupervised-and-supervised?scriptVersionId=86282593" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Sentiment Analysis |  unsupervised and supervised**

## Unzip Files

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Competition archives to unpack, and the writable directory they go to.
zip_file_path = [
    '/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip',
    '/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip',
    '/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip',
]
output_path = '/kaggle/working/'

import zipfile

# Extract every archive in turn; the context manager closes each one.
for archive_path in zip_file_path:
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(output_path)


## EDA


In [None]:
# Load the labelled, tab-separated training reviews and preview the first rows.
labeled_tsv_path = './../working/labeledTrainData.tsv'
review_df = pd.read_csv(labeled_tsv_path, sep='\t')
review_df.head()

In [None]:
import re

# Replace the literal HTML line breaks with a space. regex=False is spelled out
# because the default of Series.str.replace differs between pandas versions.
review_df['review'] = review_df['review'].str.replace('<br />', ' ', regex=False)

# Replace every character that is not an ASCII letter with a space, using one
# vectorised call with a pre-compiled pattern instead of a per-row lambda.
NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z]')
review_df['review'] = review_df['review'].str.replace(NON_LETTER_PATTERN, ' ', regex=True)

print(review_df.head())
print('=' * 50)

In [None]:
import matplotlib.pyplot as plt

# Count the reviews per sentiment label and draw the class distribution.
target_cnt = review_df['sentiment'].value_counts()
sentiment_labels, review_counts = target_cnt.index, target_cnt.values
plt.bar(x=sentiment_labels, height=review_counts)

## Train / Test dataset 생성

In [None]:
from sklearn.model_selection import train_test_split
# Target: the sentiment label, kept as a one-column DataFrame.
class_df = review_df[['sentiment']]
# Features: every column except the identifier and the label.
feature_df = review_df.drop(columns=['id', 'sentiment'])

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
split_parts = train_test_split(feature_df, class_df, test_size=0.3, random_state=156)
X_train, X_test, y_train, y_test = split_parts
print(*(part.shape for part in split_parts))

## 지도학습 기반 감성분석 소개
- CountVectorizer : accuracy : 0.886133 / ROC-AUC :0.950269
- **TfidfVectorizer : accuracy : 0.893600 / ROC-AUC :0.959801**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings
# Suppress every warning from here on (kept from the original cell).
warnings.filterwarnings('ignore')


def fit_and_report(title, vect_step, vectorizer, train_text, train_labels, test_text, test_labels):
    """Fit a vectorizer + logistic-regression pipeline and print its test metrics.

    Args:
        title: Heading printed above the metrics line.
        vect_step: Name of the vectorizer step inside the pipeline.
        vectorizer: Text vectorizer instance used as the first step.
        train_text: Training documents.
        train_labels: Training labels (a one-column frame is flattened).
        test_text: Held-out documents.
        test_labels: Held-out labels.

    Returns:
        Tuple of (fitted pipeline, predicted labels, positive-class probabilities).
    """
    model = Pipeline([
        (vect_step, vectorizer),
        ('lr_clf', LogisticRegression(solver='liblinear', C=10)),
    ])
    # Flatten the labels so the classifier receives a 1-D target.
    model.fit(train_text, np.ravel(train_labels))
    predicted = model.predict(test_text)
    positive_probs = model.predict_proba(test_text)[:, 1]

    print(title)
    # The '4f' spec (width 4, default six decimals) is kept so the output
    # matches the figures quoted in the notebook text.
    # Metric functions take (y_true, y_score), so the true labels go first.
    print('accuracy : {0:4f} / ROC-AUC :{1:4f}'.format(
        accuracy_score(test_labels, predicted),
        roc_auc_score(test_labels, positive_probs)))
    return model, predicted, positive_probs


# CountVectorizer
pipeline, pred, pred_probs = fit_and_report(
    'CountVectorizer',
    'cnt_vect',
    CountVectorizer(stop_words='english', ngram_range=(1, 2)),
    X_train['review'], y_train, X_test['review'], y_test,
)
print('=' * 50)

# TfidfVectorizer
pipeline, pred, pred_probs = fit_and_report(
    'TfidfVectorizer',
    'tfidf_vect',
    TfidfVectorizer(stop_words='english', ngram_range=(1, 2)),
    X_train['review'], y_train, X_test['review'], y_test,
)

## 비지도학습 기반 감성분석 소개 `lexicon`
### SentiWordNet / VADER
- SentiWordNet accuracy :  0.6101
- VADER accuracy :  0.6920

### SentiWordNet을 이용한 영화 감상평 감성분석

In [None]:
import nltk
# Download the complete NLTK data collection. The cells below use the sentence
# and word tokenizers, the POS tagger, WordNet, SentiWordNet and VADER.
# NOTE(review): 'all' is presumably far more than this notebook needs — confirm
# the exact package ids for the installed NLTK version before narrowing it.
nltk.download('all')

In [None]:
from nltk.corpus import wordnet as wn
def penn_to_wn(tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with J, N, R or V map to adjective, noun, adverb and verb
    respectively; any other tag yields None.
    """
    prefix_to_wordnet_pos = (
        ('J', wn.ADJ),
        ('N', wn.NOUN),
        ('R', wn.ADV),
        ('V', wn.VERB),
    )
    for prefix, wordnet_pos in prefix_to_wordnet_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return None

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

# 문장 - 단어토큰 - 품사 - 감성지수 계산
def swn_polarity(text):
    """Label a text as positive (1) or negative (0) with SentiWordNet scores.

    The text is split into sentences, each sentence is tokenized and POS-tagged,
    and every adjective, noun, adverb or verb with a WordNet synset contributes
    the positive-minus-negative score of its first synset.

    Returns:
        1 when at least one token was scored and the summed score is >= 0,
        otherwise 0.
    """
    lemmatizer = WordNetLemmatizer()
    scorable_pos = (wn.ADJ, wn.NOUN, wn.ADV, wn.VERB)
    total_score = 0
    scored_tokens = 0

    for sentence in sent_tokenize(text):
        # POS-tag the word tokens of this sentence with NLTK.
        for word, tag in pos_tag(word_tokenize(sentence)):
            wn_tag = penn_to_wn(tag)
            if wn_tag in scorable_pos:
                # Lemmatize with the WordNet POS, then look up the synsets.
                lemma = lemmatizer.lemmatize(word, pos=wn_tag)
                candidates = wn.synsets(lemma, pos=wn_tag) if lemma else []
                if candidates:
                    # Score the first synset only: positive adds, negative subtracts.
                    senti = swn.senti_synset(candidates[0].name())
                    total_score += (senti.pos_score() - senti.neg_score())
                    scored_tokens += 1

    # Nothing scored, or a negative total, counts as negative.
    return 1 if scored_tokens and total_score >= 0 else 0
    

In [None]:
# Take an explicit copy: pd.DataFrame(X_train) does not guarantee an independent
# frame, so adding columns could alias the split from train_test_split.
train_df = X_train.copy()
# SentiWordNet prediction for every training review.
train_df['pred'] = train_df['review'].apply(swn_polarity)


### VADER를 이용한 감성분석
`polarity_scores` 함수 덕분에 SentiWordNet보다 쉽게 감성 분석을 할 수 있다. <br> `neg`: 부정, `neu`: 중립, `pos`: 긍정, `compound`: 부정/중립/긍정 점수를 조합해 -1 ~ 1 사이의 값으로 나타낸 종합 점수

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
senti_analyzer = SentimentIntensityAnalyzer()
# train_df comes from the train_test_split output and keeps the original row
# labels, so take the first row by position; label 0 may not be in this split.
senti_scores = senti_analyzer.polarity_scores(train_df['review'].iloc[0])
print(senti_scores)

In [None]:
def vader_polarity(review, thres = 0.1):
    """Label a review as positive (1) or negative (0) from its VADER compound score.

    Args:
        review: Review text to score.
        thres: Minimum compound score for a review to count as positive.

    Returns:
        1 when the compound score is >= thres, otherwise 0.
    """
    # Build the analyzer once and reuse it: constructing it reloads the VADER
    # lexicon, which is wasteful when this runs once per review.
    analyzer = getattr(vader_polarity, '_analyzer', None)
    if analyzer is None:
        analyzer = vader_polarity._analyzer = SentimentIntensityAnalyzer()

    agg_score = analyzer.polarity_scores(review)['compound']
    return 1 if agg_score >= thres else 0

# Label every training review with VADER, using 0.1 as the threshold argument.
vader_labels = train_df['review'].apply(vader_polarity, args=(0.1,))
train_df['vader_pred'] = vader_labels

In [None]:
# accuracy_score takes (y_true, y_pred), so the true labels are passed first.
pred = train_df['pred'].values
print('SentiWordNet accuracy : {0: .4f}'.format(accuracy_score(y_train, pred)))
pred = train_df['vader_pred'].values
print('VADER accuracy : {0: .4f}'.format(accuracy_score(y_train, pred)))


## 결론 
비지도 학습이 지도 학습보다 예측 성능이 떨어지는 것을 확인할 수 있다. 단, 위에서 지도 학습은 테스트 세트로, 비지도 학습은 학습 세트로 평가했으므로 두 수치가 동일한 데이터에 대한 비교는 아니다. 결정 클래스 값(레이블)이 없는 상황이라면 비지도 학습을 고려해 보자.