In [1]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.




In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
train=pd.read_csv('downloads/train.csv')
test=pd.read_csv('downloads/test.csv')
submission = pd.read_csv('downloads/sample_submission.csv')

In [4]:
train=train.drop('ID', axis=1)
test=test.drop('ID', axis=1)

In [5]:
train.head()

Unnamed: 0,문장,유형,극성,시제,확실성,label
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실,사실형-긍정-미래-확실
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실


In [6]:
train.columns = ['sentence', 'type', 'polarity', 'tense', 'certainty' , 'label']
test.columns=['sentence']

type_dict = {"사실형" : "Tr", "추론형" : "Guess", "대화형" : "Talk", "예측형" : "Predict"}
polarity_dict = {"긍정" : "Pos", "부정" : "Neg" , "미정" : "No"}
tense_dict = {"과거" : "Past", "현재" : "present", "미래" : "Future"}
certainty_dict = {"확실" : "Sure", "불확실" : "nosure"}

train['type'] = train['type'].apply(lambda x : type_dict[x])
train['polarity'] = train['polarity'].apply(lambda x : polarity_dict[x])
train['tense'] = train['tense'].apply(lambda x : tense_dict[x])
train['certainty'] = train['certainty'].apply(lambda x : certainty_dict[x])

def make_label(type,polarity,tense,certainty):
    return f"{type}-{polarity}-{tense}-{certainty}"

train['label_eng'] = train.apply(lambda x : make_label(x['type'],x['polarity'],x['tense'],x['certainty']), axis =1 , result_type = 'expand')

train

Unnamed: 0,sentence,type,polarity,tense,certainty,label,label_eng
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,Tr,Pos,present,Sure,사실형-긍정-현재-확실,Tr-Pos-present-Sure
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,Tr,Pos,Past,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,Tr,Pos,Future,Sure,사실형-긍정-미래-확실,Tr-Pos-Future-Sure
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",Tr,Pos,Past,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,Tr,Pos,present,Sure,사실형-긍정-현재-확실,Tr-Pos-present-Sure
...,...,...,...,...,...,...,...
16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",Tr,Pos,Past,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure
16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",Tr,Pos,Past,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure
16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,Tr,Pos,Past,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure
16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,Tr,Pos,Past,nosure,사실형-긍정-과거-불확실,Tr-Pos-Past-nosure


In [7]:
from konlpy.tag import Okt
okt=Okt()
def text_preprocessing(text):
  stopwords=['을', '를', '이', '가', '은', '는', ',', '"']
  txt=re.sub('[^가-힣a-z]', ' ', text) #영어 소문자와 한글을 제외한 모든 문자 제거
  tokenizer=Okt()
  token=tokenizer.morphs(txt) #Okt를 이용해 형태소 분석
  token=[t for t in token if t not in stopwords] #형태소 분석기 거쳐 나온 결과들 중 stopwords 리스트에 포함되지 않는 토큰만 반환
  return token

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

#twitter 객체의 morphs() 객체를 이용한 tokenizer 사용. ngram_range=(1,2)
tfidf_vect=TfidfVectorizer(tokenizer=text_preprocessing, ngram_range=(1,2), min_df=3, max_df=0.9)
tfidf_vect.fit(train['sentence'])
tfidf_matrix_train=tfidf_vect.transform(train['sentence'])
#학습 데이터를 적용한 TfidfVectorizer를 이용해 테스트 데이터를 TF-IDF 값으로 피처 변환.
tfidf_matrix_test=tfidf_vect.transform(test['sentence'])



In [9]:
# 문장 벡터화한 것을 train, test에 넣어 최종 train, test 만들자
tfidf_matrix_train_frame=pd.DataFrame(tfidf_matrix_train.toarray())
tfidf_matrix_test_frame=pd.DataFrame(tfidf_matrix_test.toarray())
train_f=pd.concat([train, tfidf_matrix_train_frame], axis=1)

In [10]:
#로지스틱 회귀를 이용해 분류 수행
lg_clf=LogisticRegression(random_state=0)

#파라미터 C의 최적화를 위해 GridSearchCV 이용
params={'C':[3, 5.5, 8]}
grid_cv=GridSearchCV(lg_clf, param_grid=params, cv=3, scoring='f1_weighted', verbose=1)
grid_cv.fit(tfidf_matrix_train, train['tense'])
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 3 candidates, totalling 9 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 8} 0.822


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 1. tense 예측

In [11]:
#classifier는 gridsearchcv에서 최적 파라미터로 학습된 calssifier를 그대로 이용
best_estimator=grid_cv.best_estimator_
tense_preds=best_estimator.predict(tfidf_matrix_test)

In [12]:
test['tense']=tense_preds

In [13]:
test_f=pd.concat([test,tfidf_matrix_test_frame], axis=1).drop('sentence', axis=1)
test_f.head()

Unnamed: 0,tense,0,1,2,3,4,5,6,7,8,...,29960,29961,29962,29963,29964,29965,29966,29967,29968,29969
0,present,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,present,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Past,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Past,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Past,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
sub=pd.DataFrame(test_f.copy().iloc[:,0])
sub

Unnamed: 0,tense
0,present
1,present
2,Past
3,Past
4,Past
...,...
7085,present
7086,present
7087,present
7088,Future


In [28]:
test_f=pd.get_dummies(test_f, columns = ['tense'])
test_f.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29963,29964,29965,29966,29967,29968,29969,tense_Future,tense_Past,tense_present
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0


In [29]:
train_f=pd.get_dummies(train_f, columns=['tense'])
train_f.head()

Unnamed: 0,sentence,type,polarity,certainty,label,label_eng,0,1,2,3,...,29963,29964,29965,29966,29967,29968,29969,tense_Future,tense_Past,tense_present
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,Tr,Pos,Sure,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,Tr,Pos,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,Tr,Pos,Sure,사실형-긍정-미래-확실,Tr-Pos-Future-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",Tr,Pos,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,Tr,Pos,Sure,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1


## 2. certainty 예측

In [30]:
grid_cv.fit(train_f.drop(['sentence', 'type', 'polarity', 'certainty', 'label', 'label_eng'], axis=1), train_f['certainty'])
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 3 candidates, totalling 9 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 8} 0.9112


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
#classifier는 gridsearchcv에서 최적 파라미터로 학습된 calssifier를 그대로 이용
best_estimator=grid_cv.best_estimator_
certainty_preds=best_estimator.predict(test_f)



In [32]:
test_f['certainty']=certainty_preds
test_f.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29964,29965,29966,29967,29968,29969,tense_Future,tense_Past,tense_present,certainty
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,Sure
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,Sure
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,Sure
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,Sure
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,Sure


In [33]:
train_f.head()

Unnamed: 0,sentence,type,polarity,certainty,label,label_eng,0,1,2,3,...,29963,29964,29965,29966,29967,29968,29969,tense_Future,tense_Past,tense_present
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,Tr,Pos,Sure,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,Tr,Pos,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,Tr,Pos,Sure,사실형-긍정-미래-확실,Tr-Pos-Future-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",Tr,Pos,Sure,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,Tr,Pos,Sure,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1


In [34]:
sub['certainty']=test_f['certainty']
sub

Unnamed: 0,tense,certainty
0,present,Sure
1,present,Sure
2,Past,Sure
3,Past,Sure
4,Past,Sure
...,...,...
7085,present,Sure
7086,present,Sure
7087,present,Sure
7088,Future,Sure


## 3. type 예측

In [35]:
#train, test 정의
train_f=pd.get_dummies(train_f, columns=['certainty'])
train_f.head()

Unnamed: 0,sentence,type,polarity,label,label_eng,0,1,2,3,4,...,29965,29966,29967,29968,29969,tense_Future,tense_Past,tense_present,certainty_Sure,certainty_nosure
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,Tr,Pos,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,Tr,Pos,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,Tr,Pos,사실형-긍정-미래-확실,Tr-Pos-Future-Sure,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",Tr,Pos,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,Tr,Pos,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0


In [36]:
test_f=pd.get_dummies(test_f, columns = ['certainty'])
test_f.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29965,29966,29967,29968,29969,tense_Future,tense_Past,tense_present,certainty_Sure,certainty_nosure
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0


In [37]:
grid_cv.fit(train_f.drop(['sentence', 'type', 'polarity','label', 'label_eng'], axis=1), train_f['type'])
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 3 candidates, totalling 9 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 8} 0.8309


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
#classifier는 gridsearchcv에서 최적 파라미터로 학습된 calssifier를 그대로 이용
best_estimator=grid_cv.best_estimator_
type_preds=best_estimator.predict(test_f)



In [39]:
test_f['type']=type_preds
test_f.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29966,29967,29968,29969,tense_Future,tense_Past,tense_present,certainty_Sure,certainty_nosure,type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,1,0,Tr
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,1,0,Tr
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1,0,1,0,Tr
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1,0,1,0,Tr
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1,0,1,0,Tr


In [40]:
sub['type']=test_f['type']
sub

Unnamed: 0,tense,certainty,type
0,present,Sure,Tr
1,present,Sure,Tr
2,Past,Sure,Tr
3,Past,Sure,Tr
4,Past,Sure,Tr
...,...,...,...
7085,present,Sure,Tr
7086,present,Sure,Guess
7087,present,Sure,Tr
7088,Future,Sure,Tr


## 4. polarity 예측

In [41]:
#train, test 정의
train_f=pd.get_dummies(train_f, columns=['type'])
train_f.head()

Unnamed: 0,sentence,polarity,label,label_eng,0,1,2,3,4,5,...,29969,tense_Future,tense_Past,tense_present,certainty_Sure,certainty_nosure,type_Guess,type_Predict,type_Talk,type_Tr
0,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,Pos,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,1,1,0,0,0,0,1
1,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,Pos,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,1,0,0,0,0,1
2,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,Pos,사실형-긍정-미래-확실,Tr-Pos-Future-Sure,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1,0,0,1,0,0,0,0,1
3,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",Pos,사실형-긍정-과거-확실,Tr-Pos-Past-Sure,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,1,0,0,0,0,1
4,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,Pos,사실형-긍정-현재-확실,Tr-Pos-present-Sure,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,1,1,0,0,0,0,1


In [42]:
test_f=pd.get_dummies(test_f, columns = ['type'])
test_f.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29969,tense_Future,tense_Past,tense_present,certainty_Sure,certainty_nosure,type_Guess,type_Predict,type_Talk,type_Tr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,1,1,0,0,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,1,1,0,0,0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,1,0,0,0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,1,0,0,0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,1,0,0,0,0,1


In [43]:
grid_cv.fit(train_f.drop(['sentence', 'polarity','label', 'label_eng'], axis=1), train_f['polarity'])
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 3 candidates, totalling 9 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 8} 0.95


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
#classifier는 gridsearchcv에서 최적 파라미터로 학습된 calssifier를 그대로 이용
best_estimator=grid_cv.best_estimator_
polarity_preds=best_estimator.predict(test_f)



In [45]:
sub['polarity']=polarity_preds
sub

Unnamed: 0,tense,certainty,type,polarity
0,present,Sure,Tr,Pos
1,present,Sure,Tr,Pos
2,Past,Sure,Tr,Pos
3,Past,Sure,Tr,Pos
4,Past,Sure,Tr,Pos
...,...,...,...,...
7085,present,Sure,Tr,Pos
7086,present,Sure,Guess,Pos
7087,present,Sure,Tr,Pos
7088,Future,Sure,Tr,Pos


In [46]:
type_dict = {"Tr" : "사실형", "Guess" : "추론형", "Talk" : "대화형", "Predict" : "예측형"}
polarity_dict = {"Pos" : "긍정", "Neg" : "부정" , "No" : "미정"}
tense_dict = {"Past" : "과거", "present" : "현재", "Future" : "미래"}
certainty_dict = {"Sure" : "확실", "nosure" : "불확실"}

sub['type'] = sub['type'].apply(lambda x : type_dict[x])
sub['polarity'] = sub['polarity'].apply(lambda x : polarity_dict[x])
sub['tense'] = sub['tense'].apply(lambda x : tense_dict[x])
sub['certainty'] = sub['certainty'].apply(lambda x : certainty_dict[x])

def make_label(type,polarity,tense,certainty):
    return f"{type}-{polarity}-{tense}-{certainty}"

sub['label'] = sub.apply(lambda x : make_label(x['type'],x['polarity'],x['tense'],x['certainty']), axis =1 , result_type = 'expand')

sub

Unnamed: 0,tense,certainty,type,polarity,label
0,현재,확실,사실형,긍정,사실형-긍정-현재-확실
1,현재,확실,사실형,긍정,사실형-긍정-현재-확실
2,과거,확실,사실형,긍정,사실형-긍정-과거-확실
3,과거,확실,사실형,긍정,사실형-긍정-과거-확실
4,과거,확실,사실형,긍정,사실형-긍정-과거-확실
...,...,...,...,...,...
7085,현재,확실,사실형,긍정,사실형-긍정-현재-확실
7086,현재,확실,추론형,긍정,추론형-긍정-현재-확실
7087,현재,확실,사실형,긍정,사실형-긍정-현재-확실
7088,미래,확실,사실형,긍정,사실형-긍정-미래-확실


In [48]:
submission['label']=sub['label']
submission

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-과거-확실
4,TEST_0004,사실형-긍정-과거-확실
...,...,...
7085,TEST_7085,사실형-긍정-현재-확실
7086,TEST_7086,추론형-긍정-현재-확실
7087,TEST_7087,사실형-긍정-현재-확실
7088,TEST_7088,사실형-긍정-미래-확실


In [50]:
submission.to_csv('submission_ys2.csv',index=False)