### TF-IDF Feature

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Input/output locations. Directory constants end with '/' so they are safe
# to concatenate directly with a file name.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Reproducibility and evaluation settings.
RANDOM_SEED = 42   # seed passed to the train/eval split
TEST_SPLIT = 0.2   # fraction of the training data held out for evaluation

In [3]:
# Read the cleaned training set from the input directory.
train_data = pd.read_csv(os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA))

In [4]:
# Pull the review texts and their sentiment labels out as plain Python lists.
reviews, sentiments = (
    list(train_data[column]) for column in ('review', 'sentiment')
)

In [6]:
# Vectorize the reviews with TF-IDF.
# min_df: tokens whose document frequency is below this value are dropped
# analyzer: unit of analysis ("word": word n-grams, "char": character n-grams)
# sublinear_tf: apply sublinear term-frequency scaling (tf -> 1 + log(tf))
# ngram_range: (min_n, max_n) range of n-gram sizes to extract
# max_features: upper bound on the vocabulary size, i.e. the vector length
vectorizer = TfidfVectorizer(
    min_df=0.0,
    analyzer="char",
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
)

# Fit on the training reviews and encode them in one pass; labels become an array.
X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

In [7]:
# Display the sparse TF-IDF matrix summary (shape, dtype, stored-element count).
X

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 17862871 stored elements in Compressed Sparse Row format>

In [11]:
# Share of explicitly stored entries in the TF-IDF matrix, in percent.
# Derived from X itself instead of hard-coded counts so the figure stays
# correct if the data or max_features change.
stored_ratio_pct = X.nnz / (X.shape[0] * X.shape[1]) * 100
print(f"{round(stored_ratio_pct, 2)}%의 비중으로 TF-IDF를 압축하여 저장")

14.29%의 비중으로 TF-IDF를 압축하여 저장


In [12]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases;
# .tolist() keeps `features` a plain list of Python strings as before.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()



In [13]:
# Print the n-gram feature names held in `features`.
# NOTE(review): this prints the entire list; consider a slice such as
# features[:50] to keep the notebook output short.
print(features)

[' ', ' a', ' aa', ' ab', ' ac', ' ad', ' ae', ' af', ' ag', ' ah', ' ai', ' ak', ' al', ' am', ' an', ' ap', ' ar', ' as', ' at', ' au', ' av', ' aw', ' ax', ' az', ' b', ' b ', ' ba', ' bb', ' be', ' bi', ' bl', ' bo', ' br', ' bu', ' by', ' c', ' c ', ' ca', ' ce', ' cg', ' ch', ' ci', ' cl', ' co', ' cr', ' cu', ' cy', ' d', ' da', ' de', ' di', ' do', ' dr', ' du', ' dv', ' dw', ' dy', ' e', ' e ', ' ea', ' eb', ' ec', ' ed', ' ee', ' ef', ' eg', ' ei', ' el', ' em', ' en', ' ep', ' eq', ' er', ' es', ' et', ' eu', ' ev', ' ex', ' ey', ' f', ' f ', ' fa', ' fb', ' fe', ' fi', ' fl', ' fo', ' fr', ' fu', ' fx', ' g', ' g ', ' ga', ' ge', ' gh', ' gi', ' gl', ' go', ' gr', ' gu', ' gw', ' gy', ' h', ' h ', ' ha', ' hb', ' he', ' hi', ' hm', ' ho', ' hu', ' hy', ' i', ' ia', ' ic', ' id', ' ig', ' ii', ' il', ' im', ' in', ' ir', ' is', ' it', ' iv', ' j', ' j ', ' ja', ' je', ' ji', ' jo', ' jr', ' ju', ' k', ' k ', ' ka', ' ke', ' kh', ' ki', ' kl', ' kn', ' ko', ' kr', ' ku', ' ky

In [15]:
# Hold out TEST_SPLIT of the samples for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [16]:
# class_weight="balanced": weight the classes so each label contributes
# evenly to training.
lgs = LogisticRegression(class_weight="balanced")
lgs.fit(X_train, y_train)

In [17]:
# Predict labels for the held-out evaluation split.
# NOTE(review): `predicted` is not read by any later cell visible here; the
# accuracy is computed with lgs.score instead.
predicted = lgs.predict(X_eval)

In [18]:
# Report mean accuracy on the evaluation split with six decimal places.
eval_accuracy = lgs.score(X_eval, y_eval)
print(f"Accuracy: {eval_accuracy:f}")

Accuracy: 0.859800


In [19]:
# File name of the cleaned test set, read from the same input directory.
TEST_CLEAN_DATA = 'test_clean.csv'

test_data = pd.read_csv(os.path.join(DATA_IN_PATH, TEST_CLEAN_DATA))

In [20]:
# Encode the test reviews with the already-fitted vectorizer (transform only,
# no refit) so they share the feature space the classifier was trained on.
testDataVecs = vectorizer.transform(test_data['review'])

In [21]:
# Predict a sentiment label for every test review and show the resulting array.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [22]:
# Create the output directory if it does not exist yet; exist_ok makes the
# call a no-op when it is already there.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test review: its id and the predicted sentiment label.
answer_dataset = pd.DataFrame({
    'id': test_data['id'],
    'sentiment': test_predicted,
})

# os.path.join inserts the separator when DATA_OUT_PATH lacks a trailing one;
# plain string concatenation would otherwise produce a file name such as
# './data_outlgs_tfidf_answer.csv' outside the output directory.
answer_dataset.to_csv(
    os.path.join(DATA_OUT_PATH, 'lgs_tfidf_answer.csv'),
    index=False,
    quoting=3,  # 3 == csv.QUOTE_NONE
)