In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
import re
from konlpy.tag import Okt

In [None]:
# Exercise: sentiment analysis -- load the malicious-comment CSV into `df`.
df = pd.read_csv('./data/악플 데이터.csv')

In [None]:
# Preprocessing (Korean): cleaning, tokenization, stop-word removal

# Compiled once at definition time instead of on every call.
# Matches runs of anything that is NOT a space, an ASCII letter,
# a Hangul compatibility jamo (ㄱ-ㅣ) or a Hangul syllable (가-힣).
_CLEANING_PATTERN = re.compile('[^ a-zA-Zㄱ-ㅣ가-힣]+')


def cleaning(text):
    """Remove every character except spaces, ASCII letters and Hangul, then lowercase.

    Removed characters are deleted outright (not replaced by a space), so
    digits and punctuation vanish and their neighbours are joined.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN pandas
        produces for an empty CSV field, or None) is treated as empty text
        instead of raising ``TypeError``.

    Returns
    -------
    str
        The cleaned, lower-cased text; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    return _CLEANING_PATTERN.sub('', text).lower()
# Overwrite the text column with its cleaned version so later cells see normalised text.
df['content'] = df['content'].apply(cleaning)

# Single shared Okt tagger instance reused by every tokenizer call.
okt = Okt()


def okt_tokenizer(text):
    """Tokenise ``text`` with Okt (``stem=True``) and drop tokens tagged ``'Josa'``.

    Returns the remaining token strings as a list, in the order Okt produced them.
    """
    return [token for token, tag in okt.pos(text, stem=True) if tag != 'Josa']

# Load the stop-word list: one entry per line of the UTF-8 file.
with open('./data/stopword.txt', 'r', encoding='utf-8') as f:
    word = f.read()
# splitlines() handles both \n and \r\n endings; stripping and skipping blank
# lines keeps '' (from a trailing newline or an empty line) out of the list.
stopwords = [line.strip() for line in word.splitlines() if line.strip()]

In [None]:
# Split the dataset: features / target first, then train / test sets.

X_df = df['content']
y_df = df['target']

# Hold out 20% of the rows for testing; fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=0,
)
print(X_train.shape, X_test.shape, y_test.shape)

In [None]:
# Vectorization, approach 1: CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cnt_vct = CountVectorizer(
    stop_words=stopwords,
    tokenizer=okt_tokenizer,
)
# Fit on the training split only, then reuse the fitted vectorizer for the test split.
X_train_cnt = cnt_vct.fit_transform(X_train)
X_test_cnt = cnt_vct.transform(X_test)

In [None]:
# Train the model on the count features and score it on the test split.
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(solver='liblinear', random_state=0)
lr_clf.fit(X_train_cnt, y_train)

pred = lr_clf.predict(X_test_cnt)
print('정확도 accuracy:', accuracy_score(y_test, pred))
print('정확도 f1 :', f1_score(y_test, pred))

In [None]:
# Vectorization, approach 2: TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same tokenizer and stop words as the count-based run, with TF-IDF weighting.
tf_vct = TfidfVectorizer(
    stop_words=stopwords,
    tokenizer=okt_tokenizer,
)
# Fit on the training split only, then reuse the fitted vectorizer for the test split.
X_train_tf = tf_vct.fit_transform(X_train)
X_test_tf = tf_vct.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Re-binds `lr_clf` and `pred`: from here on they refer to the TF-IDF model.
lr_clf = LogisticRegression(solver='liblinear', random_state=0)
lr_clf.fit(X_train_tf, y_train)

pred = lr_clf.predict(X_test_tf)
print('정확도 accuracy:', accuracy_score(y_test, pred))
print('정확도 f1 :', f1_score(y_test, pred))

In [None]:
# Classify a hand-written sentence with the TF-IDF model.
test_text = '욕나온다. 쓰레기'
# The training text went through cleaning() before vectorisation, so the
# inference input must get the same preprocessing.
predict = tf_vct.transform([cleaning(test_text)])
lr_clf.predict(predict)

In [None]:
# Exercise 2    # the independent variables are in one-hot format (sparse matrix)

In [95]:
# NOTE: this re-binds `df`; the comment dataset loaded earlier is no longer reachable under this name.
df = pd.read_csv('./data/unsmile_data.csv')

In [96]:
# Every column after the first one is taken as a label column.
y = df.iloc[:, 1:]
# One-row-per-label frame of the label column names, used to fit the encoder.
y_label = pd.DataFrame(data={'target': y.columns})

In [97]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the label names so its categories_ / inverse_transform
# can be used to map one-hot rows back to a label name.
oh_enc = OneHotEncoder()
oh_enc.fit(y_label)

In [98]:
# Put the label columns in the encoder's category order, then remove two rows.
# A chained, non-inplace drop avoids mutating a frame derived by column
# selection (which can raise SettingWithCopyWarning, hidden here by
# warnings.filterwarnings('ignore')).
# NOTE(review): rows 5876 and 11942 are hard-coded; presumably they are not
# valid single-label rows -- confirm against the data.
y_oh = y[oh_enc.categories_[0]].drop(index=[5876, 11942])
oh_enc.inverse_transform(y_oh)

array([['clean'],
       ['종교'],
       ['clean'],
       ...,
       ['인종/국적'],
       ['여성/가족'],
       ['남성']], dtype=object)

In [99]:
# Build the (sentence, label-name) table.
# The same two rows removed from y_oh are dropped here so the lengths match.
# A non-inplace drop returns a fresh frame, so the column assignment below
# does not write into a selection of `df`.
data = df[['문장']].drop(index=[5876, 11942])
data[['정답']] = oh_enc.inverse_transform(y_oh)
data

Unnamed: 0,문장,정답
0,일안하는 시간은 쉬고싶어서 그런게 아닐까,clean
1,아동성범죄와 페도버는 기록바 끊어져 영원히 고통 받는다. 무슬림 50퍼 근친이다. ...,종교
2,루나 솔로앨범 나왔을 때부터 머모 기운 있었음 ㅇㅇ Keep o doin 진짜 띵...,clean
3,홍팍에도 어버이연합인가 보내요 뭐 이런뎃글 있는데 이거 어버이연합측에 신고하면 그쪽...,clean
4,아놔 왜 여기 댓들은 다 여자들이 김치녀라고 먼저 불렸다! 여자들은 더 심하게 그런...,여성/가족
...,...,...
18737,저게 시대적언어면 한남충도 시대적언어 아니노 ㅋㅋ,남성
18738,다른것보다 눈이 어떻게 저렇게 생기노.탄식.,악플/욕설
18739,막노동을 해도 한국에살고말지 미쳤다고 남미를가냐?차라리 자살을하겠다.,인종/국적
18740,‘사형을 구형하였으나 여성인 점을 감안해 25년 선고’ ???내가 뭐 잘못본건가?개...,여성/가족


In [None]:
# Lexicon-based sentiment scoring function
def setiment_analyzer(text):
    """Score ``text`` against the polarity lexicon and print a positive/negative verdict.

    The text is POS-tagged with Kkma (``join=True``). The tagged tokens plus
    their 2-grams and 3-grams (joined with ``';'``) are looked up in the
    ``ngram`` column of ``./data/polarity.csv``. Matched rows are split by
    their ``max.value`` into a 'POS' group and a 'NEG' group, and the score is::

        mean(POS / freq over POS rows) - mean(NEG / freq over NEG rows)

    A group with no matched rows contributes 0, so a text with no lexicon
    match at all scores 0.0 (previously this case produced NaN via 0/0).

    Parameters
    ----------
    text : str
        Sentence to analyse.

    Returns
    -------
    float
        The score. ``>= 0`` is printed as a positive sentence, anything else
        as a negative sentence.

    Notes
    -----
    The lexicon is re-read and a new Kkma instance is created on every call.
    """
    import pandas as pd
    from konlpy.tag import Kkma
    from nltk.util import ngrams

    senti_words = pd.read_csv('./data/polarity.csv')
    kkma = Kkma()

    # Candidate lexicon keys: tagged unigrams, plus ';'-joined bigrams and trigrams.
    ngram1 = kkma.pos(text, join=True)
    new_ngram2 = [';'.join(gram) for gram in ngrams(ngram1, n=2)]
    new_ngram3 = [';'.join(gram) for gram in ngrams(ngram1, n=3)]
    words = ngram1 + new_ngram2 + new_ngram3

    result_df = senti_words[senti_words['ngram'].isin(words)]

    neg_df = result_df[result_df['max.value'] == 'NEG']
    pos_df = result_df[result_df['max.value'] == 'POS']
    neg_value = (neg_df['NEG'] / neg_df['freq']).sum()
    # Fix: the positive total must come from the 'POS' column; the original
    # read 'NEG' here as well.
    # NOTE(review): assumes polarity.csv has a 'POS' column alongside 'NEG' -- confirm.
    pos_value = (pos_df['POS'] / pos_df['freq']).sum()
    neg_length = neg_df.shape[0]
    pos_length = pos_df.shape[0]

    # Average each group only when it has rows; an empty group contributes 0
    # instead of dividing by zero.
    pos_score = pos_value / pos_length if pos_length else 0.0
    neg_score = neg_value / neg_length if neg_length else 0.0
    final_value = pos_score - neg_score

    if final_value >= 0:
        print('긍정문장입니다.')
    else:
        print('부정문장입니다.')
    return final_value

In [None]:
# `df` was re-bound to the unsmile data at the read_csv('./data/unsmile_data.csv')
# cell, and the cells after it read the text from the '문장' column, so sample
# from that column (the earlier 'content' column belongs to the previous dataset).
text = df['문장'].sample(1).iloc[0]
print(text)
setiment_analyzer(text)