In [75]:
import pandas as pd

review_df = pd.read_csv('./labeledTrainData.tsv', header=0, sep='\t', quoting=3)
# header = 0 : 파일의 첫 번째 줄이 열 이름
# quoting = 3 : 큰 따옴표 무시
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [76]:
print(review_df['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [77]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## str 클래스의 replace를 이용하여 \<br>태그를 공백(' ')으로 변환

In [78]:
review_df['review'] = review_df['review'].str.replace("<br />", '')
review_df['review']

0        "With all this stuff going down at the moment ...
1        "\"The Classic War of the Worlds\" by Timothy ...
2        "The film starts with a manager (Nicholas Bell...
3        "It must be assumed that those who praised thi...
4        "Superbly trashy and wondrously unpretentious ...
                               ...                        
24995    "It seems like more consideration has gone int...
24996    "I don't believe they made this film. Complete...
24997    "Guy is a loser. Can't get girls, needs to bui...
24998    "This 30 minute documentary Buñuel made in the...
24999    "I saw this movie as a child and it broke my h...
Name: review, Length: 25000, dtype: object

## 정규표현식을 이용하여 영어 문자열이 아닌 문자는 모두 공백으로 변환
* apply와 lambda 이용

In [79]:
import re
review_df['review'] = review_df['review'].apply(lambda x : re.sub('[^a-zA-Z]', '', x) )
review_df[['review']].head(10)

Unnamed: 0,review
0,WithallthisstuffgoingdownatthemomentwithMJives...
1,TheClassicWaroftheWorldsbyTimothyHinesisaverye...
2,ThefilmstartswithamanagerNicholasBellgivingwel...
3,Itmustbeassumedthatthosewhopraisedthisfilmtheg...
4,Superblytrashyandwondrouslyunpretentioussexplo...
5,IdontknowwhypeoplethinkthisissuchabadmovieItsg...
6,Thismoviecouldhavebeenverygoodbutcomesupwaysho...
7,IwatchedthisvideoatafriendshouseImgladIdidnotw...
8,Afriendofmineboughtthisfilmforandeventhenitwas...
9,ThismovieisfullofreferencesLikeMadMaxIIThewild...


## 학습/테스트 데이터 분리
* train_test_split 함수 이용
* 비율은 7:3
* random_state=156

In [80]:
from sklearn.model_selection import train_test_split

class_df = review_df['sentiment'] # target분리
feature_df = review_df.drop(['id', 'sentiment'], axis=1, inplace=False)
X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, \
                                                   test_size=0.3, random_state=156)

## 파이프라인에 LogisticRegression 객체를 넣어 학습하고 예측
* CountVectorize 이용 
    * stop_words='english', ngram_range=(1,2)
* 성능평가 출력은 정확도를 사용하세요

In [81]:
pipeline = Pipeline([
    ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1,2) )),
    ('lr_clf', LogisticRegression(C=10))
])

In [82]:
pipeline.fit(X_train['review'], y_train)

Pipeline(steps=[('cnt_vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('lr_clf', LogisticRegression(C=10))])

In [83]:
pred = pipeline.predict(X_test['review'])
print("예측 정확도는 {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도는 0.4919
