In [51]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [53]:
import pandas as pd

# Load the preprocessed comments workbook and show its (rows, columns) shape.
COMMENTS_XLSX = '../preprocessing/data/comments_preprocessed.xlsx'
comments_df = pd.read_excel(COMMENTS_XLSX)
comments_df.shape

(2894, 5)

In [54]:
# Keep only the review text column and its label column, then preview the first rows.
SELECTED_COLUMNS = ['contents', 'label']
comments_df_filtered = comments_df.loc[:, SELECTED_COLUMNS]
comments_df_filtered.head()

Unnamed: 0,contents,label
0,다 알바임밥은 말랐고 회는 하 오랜만이네 점,0
1,친절하신데ㅜ 초밥 맛은 그냥,0
2,가장 먹을만한게 활어라 활어를 번 시켜먹었는데 그만 먹길 바라셨는지 제가 활어를 또...,0
3,사장님 친절하시구 걍 회전초밥 느낌임 기대 안하고 가면 ㄱㅊ 주변에 이정도 가성비 ...,0
4,적당히 무난히 맛있음 후토마키는 너무 맛있음,0


In [55]:
# Summarise dtypes and non-null counts of the selected columns.
comments_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2894 entries, 0 to 2893
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   contents  2894 non-null   object
 1   label     2894 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 45.3+ KB


In [56]:
# Count the rows whose label equals 1 (to check the class balance).
len(comments_df_filtered[comments_df_filtered['label'] == 1])

1447

In [57]:
from sklearn.model_selection import train_test_split

# Stratified split: 20% of the rows are held out for testing and the label
# proportions are preserved in both parts; the seed makes the split repeatable.
reviews = comments_df_filtered['contents']
labels = comments_df_filtered['label']

X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42, stratify=labels
)

In [58]:
# Number of samples in the training split.
y_train.shape[0]

2315

In [59]:
from tqdm.notebook import tqdm
from konlpy.tag import Okt

okt = Okt()

# Split every training review into morphemes, with normalisation and stemming
# enabled; tqdm shows progress over the training set.
X_train_okt = [
    okt.morphs(sentence, stem=True, norm=True)
    for sentence in tqdm(X_train)
]

  0%|          | 0/2315 [00:00<?, ?it/s]

In [60]:
# Number of tokenised training reviews (one token list per review).
len(X_train_okt)

2315

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the training reviews only. The vectorizer is fed strings, so each
# token list is first re-joined with single spaces.
# NOTE(review): with its default token_pattern, TfidfVectorizer keeps only tokens
# made of two or more word characters, so single-character tokens produced by Okt
# never reach the vocabulary - confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()

X_train_joined = list(map(' '.join, X_train_okt))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_joined)
X_train_tfidf

<2315x5326 sparse matrix of type '<class 'numpy.float64'>'
	with 37295 stored elements in Compressed Sparse Row format>

In [62]:
# Size of the vocabulary learned by the fitted TF-IDF vectorizer.
len(tfidf_vectorizer.vocabulary_)

5326

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Two sub-grids: liblinear is searched with both L1 and L2 penalties, while lbfgs
# (which does not support the L1 penalty) is searched with L2 only.
param_grid = [
    {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'max_iter': [100, 200, 500]},
    {'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs'], 'penalty': ['l2'], 'max_iter': [100, 200, 500]},
]

# Seed the estimator: the liblinear solver uses random_state to shuffle the data,
# so an unseeded search is not guaranteed to select the same model on every run.
lr = LogisticRegression(random_state=42)

# 5-fold cross-validated search scored by accuracy, using all available cores.
# NOTE(review): X_train_tfidf comes from a vectorizer fitted on the whole training
# set, so each validation fold has already influenced the IDF weights; wrapping the
# vectorizer and the model in a Pipeline would give a cleaner CV estimate.
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_tfidf, y_train)

# Model refitted on the full training set with the best parameter combination.
best_lr = grid_search.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [64]:
# Persist the selected hyper-parameters as a one-row CSV (no index column).
best_params_df = pd.DataFrame(grid_search.best_params_, index=[0])
best_params_df.to_csv('best_params.csv', index=False)

In [65]:
# Tokenise the held-out reviews with exactly the same Okt settings as the
# training data (normalisation and stemming enabled).
X_test_okt = [
    okt.morphs(sentence, stem=True, norm=True)
    for sentence in tqdm(X_test)
]

  0%|          | 0/579 [00:00<?, ?it/s]

In [66]:
# Number of tokenised test reviews (one token list per review).
len(X_test_okt)

579

In [67]:
# Number of samples in the test split.
y_test.shape[0]

579

In [68]:
# Re-join each token list into a space-separated string and project it onto the
# vocabulary learned from the training set (transform only, no re-fitting).
X_test_joined = list(map(' '.join, X_test_okt))
X_test_tfidf = tfidf_vectorizer.transform(X_test_joined)
X_test_tfidf

<579x5326 sparse matrix of type '<class 'numpy.float64'>'
	with 8667 stored elements in Compressed Sparse Row format>

In [69]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test set. accuracy_score expects (y_true, y_pred), so
# the ground-truth labels are passed first and the predictions second.
accuracy_score(y_test, best_lr.predict(X_test_tfidf))

0.8238341968911918

In [70]:
def sentiment_predict(sentence):
    """Classify one raw review string and print the predicted label.

    The text is tokenised with the same Okt settings used for the training
    data (normalisation and stemming enabled), re-joined with spaces,
    vectorised by the already-fitted ``tfidf_vectorizer`` and passed to
    ``best_lr``.

    Args:
        sentence: Raw review text.

    Returns:
        None. The input text and the predicted label array are printed.
    """
    morphemes = okt.morphs(sentence, stem=True, norm=True)
    features = tfidf_vectorizer.transform([' '.join(morphemes)])
    print(sentence, "====>", best_lr.predict(features))

In [74]:
# Read one review typed by the user and classify it.
# NOTE(review): input() blocks until text is entered, so this cell stalls an
# unattended "Restart & Run All".
txt = input()
sentiment_predict(txt)

 웨이팅은 길어서 힘들었지만 정말 맛있습니다 가치가 있음


웨이팅은 길어서 힘들었지만 정말 맛있습니다 가치가 있음 ====> [1]
