#### 감성분석 -> 머신러닝
    - 데이터셋 : 전처리
    - BoW 모델
        - 단어를 특성 벡터로 변환
        - tf-idf 단어 적합성 평가
        - 텍스트 데이터 정제
        - 문서를 토큰으로 나누기
    - LogisticRegession

In [1]:
# 데이터 셋
# http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [2]:
import pandas as pd
from glob import glob
file_lists = glob('C:\\python_src2\\data\\movie\\train\\neg\\*.txt')
pd_lists = []
for file_path in file_lists[:500]:
    with open(file_path,'r',encoding='utf-8') as f:
        data = {
            'review' : f.read(),
            'target' : 0
        }
        df = pd.DataFrame([data])
        pd_lists.append(df)
train_neg_df = pd.concat(pd_lists,ignore_index=True)
train_neg_df.head()

Unnamed: 0,review,target
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0


In [3]:
# postive  동일하게   train_pos_df
# train_df = pd.concat([train_nag_df, train_pos_df])
# movie_data.csv로 저장

file_lists = glob('C:\\python_src2\\data\\movie\\train\\pos\\*.txt')
pd_lists = []
for file_path in file_lists[:500]:
    with open(file_path,'r',encoding='utf-8') as f:
        data = {
            'review' : f.read(),
            'target' : 1
        }
        df = pd.DataFrame([data])
        pd_lists.append(df)
train_pos_df = pd.concat(pd_lists,ignore_index=True)
train_pos_df.head()

Unnamed: 0,review,target
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [4]:
train_df = pd.concat([ train_pos_df,  train_neg_df ],ignore_index=True)
train_df.head()

Unnamed: 0,review,target
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [5]:
train_df.to_csv("movie_data.csv", index=False, encoding='utf-8')

In [6]:
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,target
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [7]:
# BoW(Bag of Words) 모델
# 문자를 숫자벡터
# 단어의 등장횟수를 카운트

# 전체 훈련데이터에서 모든 고유한 단어(토큰)로 어휘 사전
# 각 문서(리뷰데이터)를 사전을 기준으로 벡터화  N번째단어가 문서에서 3번나오면 벡터의 N번째값이 3이 된다.
# 문서1 : "나는 영화가 좋다"
# 문서2 : "나는 영화가 싫다"
# 사전 : {'나는':0, '영화가':1,'좋다':2,'싫다':3}
# 벡터화는 사전의 크기만큼 모든 문장의 길이를 동일하게
# 문서1벡터 : [0,1,2] - > [1,1,1,0]
# 문서2벡터 : [0,1,3] - > [1,1,0,1]

In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    "The sun is shining",
    "The weather is sweet"
])
bag = count.fit_transform(docs)

In [9]:
count.vocabulary_

{'the': 4, 'sun': 2, 'is': 0, 'shining': 1, 'weather': 5, 'sweet': 3}

In [10]:
# 문서 d에 등장한 단어 t의 횟수를 tf(t,d)
# BoW를 보완하면서 좀더 정교환 텍스트 벡터화 방식  TF-IDF(Term Frequency -Inverse Document Frequency)
# TF : 특정문서에서 자주 등장하는 단어
# IDF : 전체문서에 드물게 등장하는 단어
# 특정문서에서 자주 등장하지만 전체 문장에서 드물게 등장하는 단어에 높은 가중치를 부여 - 그 문장을 잘 대표하는 핵심 단어를 찾는다
# TF(t,d)  단어 t가 문장 d에 나타난 횟수 / 문서d의 모든 단어수
# IDF(t,D) : log( 총 문서수|D| / 단어 t를 포함한 문서의수 df(t) )  -- log 단어의 희귀성을 너무 과하게 반영하지 않도록 스케일링
# 분모에 + 1(사이킷런의 경우) :  분모가 0되는 것을 방지
# log( 1+ |D| / 1+df(f)  )

#TF-IDF(t,d,D) = TF(t,d) x IDF(t,D)

# "나는"
# TF : 리뷰에 3번 나옴 (높음)
# IDF : 전체 10,000개 리뷰중에 9000개 나옴(매우 낮음)
# tf-idf  높음 x 매우낮음 = 낮음(중요도가 낮음)

# "명작" TF : 리뷰에 2번 나옴 (높음)
# IDF : 전체 10,000개 리뷰중에 50개 나옴(매우 높음)
# tf-idf  높음 x 매우높음 = 높음(핵심단어)


In [11]:
df.review[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [12]:
# 데이터 정재... html tag 와 같은 불필요한 string이 보임... 특수기호 기타등등.  < - ' 
import re
def preprocessor(s):
    # 1. 영문, 공백, ., , 만 남기기
    clean = re.sub(r'[^A-Za-z\s.,]+', '', s)
    # 2. 연속된 마침표(...)를 마침표 하나로
    clean = re.sub(r'\.{2,}', '.', clean)
    # 3. 연속된 공백 정리
    clean = re.sub(r'\s+', ' ', clean).strip()
    return clean

In [13]:
df['review'] = df.review.apply(preprocessor)

In [14]:
# 문서를 토큰으로 나누기
# %pip install nltk
# %conda install -c conda-forge nltk

In [None]:
from nltk.stem.porter import PorterStemmer

def tokenizer(text):
    return text.split()

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

# 어간추출 Stemming  단어의 접미사 -s -es -ing -ed 등등.. 를 강제로 제거해서 단어의 원형을 찾는과정
tokenizer(df.review[0][:100])

In [23]:
tokenizer('runners like running')

['runners', 'like', 'running']

In [34]:
# 불용어
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')    # 불용어 사전 다운로드
stops =  stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\playdata2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df.head()  # 전처리완료  정규식을 이용한..

Unnamed: 0,review,target
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness or Houselessness as George Carlin...,1
2,Brilliant overacting by Lesley Ann Warren. Bes...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [50]:
from sklearn.model_selection import train_test_split
X = df.review
y = df.target
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

In [45]:
# 어간추출(postStemmer) -> stopwor에 포함된 단어제거
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split() if word not in stops]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [46]:
tfidf = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    ngram_range=(1,1) # (1,1) 유니그램(unigramm, 단일단어)만 사용
)
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', LogisticRegression())
])

In [54]:
pipeline.fit(x_train,y_train)



0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function tok...002A0B2992DE0>
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [61]:
print(x_test.to_numpy()[0])
print(y_test.to_numpy()[0])

When I first saw A Cry in the Dark, I had no idea what the plot was. But when I saw it, I was shocked at what it portrayed. When I saw it a second time in an Australian Cinema class, I realized a second point communication issues. You see, when a dingo snatched Lindy Chamberlains Meryl Streep baby, she and her husband Michael Sam Neill were griefstricken but didnt show it. As Seventh Day Adventists, they believed that God willed this to happen, and so they couldnt mourn it. But when people all over Australia saw their lack of sadness, everyone started believing that Lindy did it herself.br br The point is, the wrong message got communicated to the public, and it turned people against Lindy. Even though this was a pure accident, it still happened. It may be one of the biggest disasters resulting from the existence of mass media, regardless of any media outlets political views.br br As for the performances, Streep does a very good job with an Australian accent no surprise there, and Sam 

In [73]:
pipeline.score(x_test,y_test)

0.935

In [None]:
# 원본데이터로드..
# 토크나이져 함수를 정의
    # 텍스트 전처리
    # 공백을 기준으로 단어단위로 분리
    # 영어는 전부 소문자로 변환
    # 어간 추출
    # 불용어 제거
# TFIDF를 정의
    # 토크나이져 매개변수 = 토크나이져 함수
    # ngram  (1,1)
# 파이프라인으로 tfidf, 머신러닝
# 파이프라이으로 학습
# 파이프라인으로 평가( classification_report)
# 과적합여부 확인

# train 폴더에 있는데이터로 학습 - 적당한 크기로
# test 폴더에 있는 문장으로 평가

In [93]:
from glob import glob
import numpy as np
from tqdm import tqdm

def get_data(pattern, neg=True,to = None):
    documents = []
    if to is not None:
        for path in tqdm(glob(pattern)[ : to]):
            with open(path, 'r', encoding='utf-8') as f:
                documents.append(  (np.array(f.read()),  0 if neg else 1)   )
    else:
        for path in tqdm(glob(pattern)):
            with open(path, 'r', encoding='utf-8') as f:
                documents.append(  (np.array(f.read()),  0 if neg else 1)   )
    return documents

# "../data/movie/train/neg/*.txt"        

In [None]:
train_neg_lists = get_data("../data/movie/train/neg/*.txt",True,5000)

100%|██████████| 5000/5000 [00:00<00:00, 10584.20it/s]


(array("Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",
       dtype='<U655'),
 0)

In [None]:
# train_neg_lists = get_data("../data/movie/train/neg/*.txt",True)
train_neg_lists = get_data("../data/movie/train/pos/*.txt",False,5000)
test_neg_lists = get_data("../data/movie/test/neg/*.txt",True,5000)
test_neg_lists = get_data("../data/movie/train/pos/*.txt",False,5000)

 18%|█▊        | 896/5000 [00:00<00:00, 8558.15it/s]

In [81]:
import re
porter = PorterStemmer()
def custom_tokenizer(text):
    # 전처리
    # 1. 영문, 공백, ., , 만 남기기
    clean = re.sub(r'[^A-Za-z\s.,]+', '', text)
    # 2. 연속된 마침표(...)를 마침표 하나로
    clean = re.sub(r'\.{2,}', '.', clean)
    # 3. 연속된 공백 정리
    clean = re.sub(r'\s+', ' ', clean).strip()
    # 단어분리-어간분리-불용어제거
    return [porter.stem(word) for word in clean.split() if word not in stops]
tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(1,1),
    token_pattern=None
)
pipeline = Pipeline([
    ('tfid',tfidf),
    ('clf',LogisticRegression())
])

In [85]:
pipeline.fit(X,y)

0,1,2
,steps,"[('tfid', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function cus...002A0B788C900>
,analyzer,'word'
,stop_words,
,token_pattern,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [84]:
y

0      1
1      1
2      1
3      1
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: target, Length: 1000, dtype: int64