In [17]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer

## 데이터 불러오기

In [18]:
# Load the training reviews from data/aclImdb/train/. The returned object
# exposes the raw documents as `.data` and the labels as `.target`
# (`load_files("data/aclImdb/train/").keys()` lists every available field).
reviews_train = load_files("data/aclImdb/train/")
text_train, y_train = reviews_train.data, reviews_train.target
# The documents are bytes at this point, hence the bytes literals: replace the
# HTML line-break tags embedded in the reviews with a single space.
text_train = [doc.replace(b"<br />",b" ") for doc in text_train]

## 텍스트 임베딩

In [19]:
# Bag-of-words embedding with default settings. fit_transform learns the
# vocabulary and builds the document-term matrix in a single pass over the
# corpus, instead of tokenizing everything twice (fit, then transform).
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
feature_names = vect.get_feature_names_out()

In [20]:

feature_names[:10]
# Far too many of these features are useless, i.e. stop words need to be handled.

array(['00', '000', '0000', '0000000000000000000000000000000001',
       '0000000000001', '000000001', '000000003', '00000001', '000001745',
       '00001'], dtype=object)

## Baseline

In [21]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Baseline: mean cross-validated score of a logistic regression on the current
# X_train (cv is left at its default, n_jobs=-1 runs the folds in parallel).
# Warning: slow - takes roughly 7 minutes.
scores = cross_val_score(LogisticRegression(max_iter=1000), X_train, y_train, n_jobs=-1)
print(np.mean(scores))

KeyboardInterrupt: 

In [22]:
from sklearn.model_selection import GridSearchCV
# Grid-search the logistic-regression parameter C over five candidate values on
# the current X_train, then print the best mean cross-validated score.
param_grid = {"C":[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=5000), param_grid, n_jobs=-1)
grid.fit(X_train,y_train)
print(grid.best_score_)

KeyboardInterrupt: 

## 불용어처리

In [23]:
# Rebuild the bag-of-words features, now dropping English stop words and any
# token that occurs in fewer than 5 documents. fit_transform does the
# vocabulary fit and the matrix construction in one pass over the corpus
# instead of tokenizing everything twice (fit, then transform).
vect = CountVectorizer(min_df=5, stop_words="english")
X_train = vect.fit_transform(text_train)
feature_names = vect.get_feature_names_out()

In [24]:
from sklearn.model_selection import GridSearchCV
# Same C grid search as before, but X_train now holds the features built with
# min_df=5 and English stop words removed; prints the best mean CV score.
param_grid = {"C":[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=5000), param_grid, n_jobs=-1)
grid.fit(X_train,y_train)
print(grid.best_score_)

KeyboardInterrupt: 

In [None]:
# Author's notes: BoW is cheap to compute, but it is biased by the data.
# tf-idf pushes certain words down the ranking, which is not ideal.
# n-grams: once one word of a sentence is fixed, the words that follow are easy to guess, e.g. Naver's related-search suggestions.

## N-그램

In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [26]:
# TF-IDF features (min_df=5) feeding a logistic regression, chained as one estimator.
# NOTE(review): max_iter=100 here versus 1000/5000 in the earlier cells - confirm this lower cap is intentional.
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression(max_iter=100))

In [29]:
# Search space for the pipeline: the `<step>__<param>` keys address the
# logistic-regression C and the TF-IDF n-gram range (unigrams only, up to
# bigrams, up to trigrams).
param_grid = {'logisticregression__C':[10,100],
              'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)]}

In [30]:
# Grid-search the whole pipeline on the raw documents (the vectorizer is part
# of the estimator being searched); the cell displays the best mean CV score.
grid = GridSearchCV(pipe, param_grid, n_jobs=-1)
grid.fit(text_train, y_train)
grid.best_score_

KeyboardInterrupt: 

# 네이버 영화 리뷰를 활용한 감정 분석

> 1.KoNLPy, 2.tf-idf를 활용한 임베딩
- 가능하면 해당 데이터를 기반으로 간단한 웹페이지도 만들어보자
- NLP 주요 라이브러리 설치
    - `pip install konlpy`  
    - `pip install joblib` — serialization(직렬화)에 사용
> `https://konlpy.org/ko/latest/install/#id2`

In [18]:
import numpy as np
import pandas as pd
import re

from konlpy.tag import Okt

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

## 데이터불러오기

In [19]:
# Read the train / test rating files (pd.read_table parses tab-separated text by default).
train_df = pd.read_table("data/ratings_train.txt")
test_df = pd.read_table("data/ratings_test.txt")

In [20]:
# Preview the first 10 rows (columns: id, document, label).
train_df.head(10)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
5,5403919,막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.,0
6,7797314,원작의 긴장감을 제대로 살려내지못했다.,0
7,9443947,별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단...,0
8,7156791,액션이 없는데도 재미 있는 몇안되는 영화,1
9,5912145,왜케 평점이 낮은건데? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나?,1


In [21]:
# Count the missing values in each column of the training frame.
train_df.isnull().sum()

id          0
document    5
label       0
dtype: int64

## 전처리
- 불필요한거 삭제
- 불용어 처리

In [22]:
# Replace missing review texts with a single space so the later regex and
# vectorizing steps always receive a str. Only `document` is filled: a blanket
# fillna(" ") would also write " " into `id` / `label` if they ever contained
# a null, silently corrupting those columns instead of surfacing the problem.
train_df = train_df.fillna({"document": " "})
test_df = test_df.fillna({"document": " "})

In [23]:
# Demo: delete runs of digits with a regular expression.
# (This demo substitutes an empty string; the actual preprocessing cell substitutes a space.)
re.sub(r"\d+", "","123 456 하하하")

'  하하하'

In [24]:
# Replace every run of digits in the reviews with a single space.
# The vectorized .str accessor replaces the per-row Python lambda and passes
# missing values through instead of raising TypeError. A compiled pattern is
# passed so that `\d` keeps exactly the Python `re` matching semantics the
# original re.sub call had.
train_df["document"] = train_df["document"].str.replace(re.compile(r"\d+"), " ", regex=True)
test_df["document"] = test_df["document"].str.replace(re.compile(r"\d+"), " ", regex=True)

## 임베딩

In [25]:
# Create the Okt analyzer once; tw_tokenizer below reuses this single instance.
okt = Okt()

def tw_tokenizer(text):
    """Tokenize ``text`` with the module-level ``okt`` analyzer.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``okt.morphs(text)`` produces for ``text``.
    """
    return okt.morphs(text)

In [26]:
# TF-IDF over the tokens produced by tw_tokenizer, using unigrams and bigrams.
# min_df=3 drops terms seen in fewer than 3 documents; max_df=0.9 drops terms
# seen in more than 90% of the documents.
# token_pattern=None: the default regex is not used once a custom tokenizer is
# supplied, and leaving it set only triggers a scikit-learn warning.
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenizer, token_pattern=None,
                             ngram_range=(1,2), min_df=3, max_df=0.9)
# fit_transform tokenizes the corpus once instead of twice (fit, then
# transform) and yields the same fitted vectorizer and document-term matrix.
tfidf_matrix_train = tfidf_vect.fit_transform(train_df["document"])



## 학습

In [10]:
# Train a logistic-regression classifier on the training TF-IDF matrix and labels.
# (Author's note: it sorts out the samples that are laid out as vectors in the feature space.)
lr = LogisticRegression(C=3.5, random_state=42)
lr.fit(tfidf_matrix_train, train_df["label"])

In [13]:
# Vectorize the test reviews with the already-fitted vectorizer (transform only, no refit).
tfidf_matrix_test = tfidf_vect.transform(test_df["document"])

In [11]:
# Vectorize a single ad-hoc review with the fitted vectorizer.
# NOTE(review): p_text is not consumed by any cell visible here - presumably meant for lr.predict(p_text); confirm.
p_text = tfidf_vect.transform(["이 영화 재미있어요!"])

In [15]:
# Predict labels for the test matrix and display the accuracy against test_df["label"].
preds=lr.predict(tfidf_matrix_test)
accuracy_score(test_df["label"],preds)

0.86182