In [1]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

comments_df = pd.read_excel('data/comments_preprocessed.xlsx')
comments_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,contents,point,label
0,4,4,너무 웨이팅이 길어요노답,3,0
1,9,9,맑고 산뜻한걸 알겠으나 인상찌푸려질정도로 짜던데 웨이팅 하고나서 제일 실망한집 차...,1,0
2,11,11,호불호가 갈리는 라멘,3,0
3,13,13,토요일시반에 가서 웨이팅없었음 이런집은 애매한 시간을 노리시라 그치만 딱히 맛집이라...,3,0
4,19,19,차슈가 맛있고 국물은 깔끔한 맛으로 먹을만하긴 했는데 너무 기대해서 그런가 기대했던...,3,0


In [3]:
comments_df_filtered = comments_df[['contents', 'label']]
comments_df_filtered.head()

Unnamed: 0,contents,label
0,너무 웨이팅이 길어요노답,0
1,맑고 산뜻한걸 알겠으나 인상찌푸려질정도로 짜던데 웨이팅 하고나서 제일 실망한집 차...,0
2,호불호가 갈리는 라멘,0
3,토요일시반에 가서 웨이팅없었음 이런집은 애매한 시간을 노리시라 그치만 딱히 맛집이라...,0
4,차슈가 맛있고 국물은 깔끔한 맛으로 먹을만하긴 했는데 너무 기대해서 그런가 기대했던...,0


In [4]:
comments_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1644 entries, 0 to 1643
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   contents  1644 non-null   object
 1   label     1644 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 25.8+ KB


In [5]:
comments_df_filtered.loc[comments_df_filtered['label'] == 1].shape[0]

822

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    comments_df_filtered['contents'],
    comments_df_filtered['label'],
    test_size=0.2,
    random_state=42,
    stratify=comments_df_filtered['label']
)

In [7]:
len(y_train)

1315

In [8]:
from tqdm.notebook import tqdm
from konlpy.tag import Okt

okt = Okt()
X_train_okt = []

for sentence in tqdm(X_train):
  # 형태소 분리 및 정규화, 어간추출
  temp_X = okt.morphs(sentence, stem=True, norm=True)
  X_train_okt.append(temp_X)

  0%|          | 0/1315 [00:00<?, ?it/s]

In [10]:
len(X_train_okt)

1315

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

X_train_joined = [ ' '.join(words) for words in X_train_okt ]
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_joined)
X_train_tfidf

<1315x3568 sparse matrix of type '<class 'numpy.float64'>'
	with 19612 stored elements in Compressed Sparse Row format>

In [12]:
len(tfidf_vectorizer.vocabulary_)

3568

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'max_iter': [100, 200, 500]},
    {'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs'], 'penalty': ['l2'], 'max_iter': [100, 200, 500]},
]

lr = LogisticRegression()
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, y_train)

best_lr = grid_search.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [44]:
best_params_df = pd.DataFrame([grid_search.best_params_])
best_params_df.to_csv('best_params.csv', index=False)

In [46]:
X_test_okt = []

for sentence in tqdm(X_test):
  # 형태소 분리 및 정규화, 어간추출
  temp_X = okt.morphs(sentence, stem=True, norm=True)
  X_test_okt.append(temp_X)

  0%|          | 0/329 [00:00<?, ?it/s]

In [47]:
len(X_test_okt)

329

In [48]:
len(y_test)

329

In [49]:
X_test_joined = [ ' '.join(words) for words in X_test_okt ]
X_test_tfidf = tfidf_vectorizer.transform(X_test_joined)
X_test_tfidf

<329x3568 sparse matrix of type '<class 'numpy.float64'>'
	with 4579 stored elements in Compressed Sparse Row format>

In [50]:
from sklearn.metrics import accuracy_score

accuracy_score(best_lr.predict(X_test_tfidf), y_test)

0.7963525835866262

In [38]:
def sentiment_predict(sentence):
  sentence_norm_stem = okt.morphs(sentence, stem=True, norm=True)
  sentence_test = ' '.join(sentence_norm_stem)

  text_vector = tfidf_vectorizer.transform([sentence_test])

  pred = best_lr.predict(text_vector)
  print(sentence, "====>", pred)

In [41]:
txt = input()
sentiment_predict(txt)

 맛은 있으나 알바가 미숙함 첫 주문할때만 동일 메뉴 2인분 해야하는데 알바가 이부분 설명을 제대로 못해서 계속 2인분씩 주문했음


맛은 있으나 알바가 미숙함 첫 주문할때만 동일 메뉴 2인분 해야하는데 알바가 이부분 설명을 제대로 못해서 계속 2인분씩 주문했음 ====> [0]
