In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
import nltk
import re
from nltk import sent_tokenize, word_tokenize
from konlpy.tag import Okt, Hannanum, Kkma, Komoran
from nltk.stem import LancasterStemmer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# 유사도 분석(코사인): 논문표절 , 추천시스템

In [None]:
def cos_similarity(v1, v2):
    dot_product = np.dot(v1, v2)
    l2_norm = (np.sqrt(sum(np.square(v1))) * np.sqrt(sum(np.square(v2))))
    similarity = dot_product / l2_norm 
    return similarity

In [None]:
doc_list = ['if you take the blue pill, the story ends' ,
            'if you take the red pill, you stay in Wonderland',
            'if you take the red pill']
tfidf_vect_simple = TfidfVectorizer(ngram_range=(1,2))        # ngram_range=(1,2)일때 유사도 더 올라가는지 볼것
feature_vect_simple = tfidf_vect_simple.fit_transform(doc_list)  
print(feature_vect_simple.shape)

In [None]:
feature_vect_array = feature_vect_simple.toarray()
vect1 = np.array(feature_vect_array[0]).reshape(-1,)
vect2 = np.array(feature_vect_array[1]).reshape(-1,)
similarity_simple = cos_similarity(vect1, vect2 )
print('문장1,문장2 코사인유사도:{0:.3f}'.format(similarity_simple))

In [None]:
vect1 = np.array(feature_vect_array[0]).reshape(-1,)
vect3 = np.array(feature_vect_array[2]).reshape(-1,)
similarity_simple = cos_similarity(vect1, vect3 )
print('문장1,문장3 코사인유사도:{0:.3f}'.format(similarity_simple))
vect2 = np.array(feature_vect_array[1]).reshape(-1,)
vect3 = np.array(feature_vect_array[2]).reshape(-1,)
similarity_simple = cos_similarity(vect2, vect3 )
print('문장2,문장3 코사인유사도:{0:.3f}'.format(similarity_simple))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_simple_pair = cosine_similarity(feature_vect_simple)  # 한개 데이터만 넣어도 작동
print(similarity_simple_pair)
print('shape:',similarity_simple_pair.shape)

In [None]:
pd.DataFrame(feature_vect_simple.toarray(), columns=tfidf_vect_simple.get_feature_names_out())

In [None]:
## 유사도_실습

In [None]:
df=pd.read_csv('./data/petition.csv')

In [None]:
import re
def cleaning(text):
    p = re.compile('[^ ㄱ-ㅣ가-힣]+')
    result = p.sub('',text)
    return result

In [None]:
df_cat=df[df['category']=='육아/교육']

In [None]:
df_cat['content']= df_cat['content'].apply(cleaning)

In [None]:
from konlpy.tag import Okt
okt = Okt()
def okt_tokenizer(text):
    tokens_ko = okt.morphs(text, stem = True)
    return tokens_ko

with open('./data/stopword.txt','r',encoding='utf-8') as f:  # okt에 적합함
    word = f.read()
    stopwords = word.split('\n')

In [None]:
tfidf_vect = TfidfVectorizer(max_df = 0.85, min_df = 2, tokenizer=okt_tokenizer, stop_words=stopwords, max_features=1000)       
feature_vect = tfidf_vect.fit_transform(df_cat['content'])  
print(feature_vect.shape)

In [None]:
similarity_pair = cosine_similarity(feature_vect)  
print(similarity_pair)
print('shape:',similarity_pair.shape)

In [None]:
df=pd.DataFrame(feature_vect.toarray(), columns=tfidf_vect.get_feature_names_out())

In [None]:
similarity_pair=cosine_similarity(feature_vect)

In [None]:
## 텍스트분석 실습 1. 영화 리뷰

In [None]:
df=pd.read_csv('./data/네이버 영화 리뷰.csv')
df['content'].head(30)

In [None]:
# 문서분류 (몇 점- 분류, 회귀)
# 감성분류 (나누기: 긍정 10~7점 중립 6~4 부정 3~1 ), 감성어휘사전 활용?
# 문서군집화(kMean, meanshift, dbscan. 가우시안) + 토픽 모델링(lda)
# 유사도 분석(cosine)

In [None]:
# 문서분류 (몇 점- 분류, 회귀)
#1. 클렌징/토큰/불용어/어근추출

In [None]:
df['content']=df['content'].apply(cleaning)

In [None]:
okt.pos(df['content'].iloc[16], stem=True)

In [None]:
okt = Okt()
def okt_tokenizer(text):
    words=okt.pos(text, stem=True)
    filtered_words=[]
    for word, pos in words:
        if pos not in ['Josa', 'KoreanParticle']:
            filtered_words.append(word)
    return filtered_words

In [None]:
with open('./data/stopword.txt','r',encoding='utf-8') as f:
    word = f.read()
    stopwords = word.split('\n')

In [None]:
cnt_vect=CountVectorizer(max_df=0.9, min_df=2, tokenizer=okt_tokenizer, stop_words=stopwords, max_features=5000)
cnt_vect.fit(df['content'])
X=cnt_vect.transform(df['content'])
X

In [None]:
result_df=pd.DataFrame(X.toarray(), columns=cnt_vect.get_feature_names_out())
result_df

In [None]:
y= df['score']
y.value_counts()

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2, random_state=0 ) 

In [None]:
from sklearn.ensemble import RandomForestClassifier
lr_clf=LogisticRegression()
lr_clf.fit(X_train, y_train)

pred=lr_clf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
## 불균형 데이터이므로 10점으로 찍음 ㅋ 10점
test_text ='영화를 보다가 잠들었어요'
pred=cnt_vect.transform([test_text])
rf_clf.predict(pred)

In [None]:
#

In [None]:
tfidf_vct=TfidfVectorizer(tokenizer=okt_tokenizer, stop_words=stopwords, max_features=1000)
tfidf_vct.fit(X_train)
X_train_tf=tfidf_vct.transform(X_train)
X_test_tf=tfidf_vct.transform(X_test)

In [None]:
tfidf_vect=TfidfVectorizer(max_df=0.9, min_df=2, tokenizer=okt_tokenizer, stop_words=stopwords, max_features=5000)
tfidf_vect.fit(df['content'])
X_tf=cnt_vect.transform(df['content'])
X_train, X_test, y_train, y_test=train_test_split(X_tf,y, test_size=0.2, random_state=0 ) 

In [None]:
print(X_train.shape, X_test.shape,y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
lr_reg=LogisticRegression(solver='liblinear')
lr_reg.fit(X_train, y_train)
pred=lr_reg.predict(X_test)
print('예측 정확도 : ', np.sqrt(mean_squared_error(y_test, pred)))
print(r2_score(y_test, pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier  #**
rf_clf=RandomForestClassifier()
lr_clf.fit(X_train_tf, y_train)
pred=rf_clf.predict(X_test_tf)
accuracy_score(y_test, pred)

In [None]:
lr_clf=LogisticRegression()
lr_clf.fit(X_train, y_train)

pred=lr_clf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2, random_state=0 ) 

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
poly=PolynomialFeatures(degree=2)

X_train_poly=poly.fit_transform(X_train)
X_test_poly= poly.transform(X_test)

lr_reg=Ridge(alpha=1)
lr_reg.fit(X_train_poly, y_train)

pred=lr_reg.predict(X_test_poly)
print('예측 정확도 : ', np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
# 감성분석(긍정, 중립, 부정)
good_df=df[df['score'] == 10].sample(frac=0.35)  #10점 많아 샘플링
bad_df=df[df['score'] <= 5]

In [None]:
good_df['score'] =1
bad_df['score'] = 0
df=pd.concat([good_df,bad_df], axis=0)

In [None]:
X=df['content']
y=df['score']
cnt_vect=CountVectorizer(max_df=0.9,tokenizer=okt_tokenizer, stop_words=stopwords, max_features=1000)
X_cnt=cnt_vect.fit_transform(X)

X_train, X_test, y_train, y_test=train_test_split(X_cnt,y, test_size=0.2, random_state=0 ) 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
lr_reg=LogisticRegression(solver='liblinear')
lr_reg.fit(X_train, y_train)
pred=lr_reg.predict(X_test)
print('예측 정확도 : ', np.sqrt(mean_squared_error(y_test, pred)))
print(r2_score(y_test, pred))

In [None]:
##내가 다시 해보자

In [None]:
#0.
df_review=pd.read_csv('./data/네이버 영화 리뷰.csv')
df_review['content']

In [None]:
#1.
import re
df['content'] = df['content'].apply(lambda x : re.sub('[^ ㄱ-ㅣ가-힣]+', '', x))

from konlpy.tag import Okt
okt = Okt()
def okt_tokenizer(text):   # 자르기 + 조사 삭제
    words=okt.pos(text, stem=True)
    filtered_words=[]
    for word, pos in words:
        if pos not in ['Josa']:
            filtered_words.append(word)
    return filtered_words

with open('data/stopword.txt','r',encoding='utf-8') as f:
    word = f.read()
    stopwords = word.split('\n')

In [None]:
# 감성분석(긍정, 중립, 부정)
good_df=df[df['score'] == 10].sample(frac=0.35)  #10점 많아 샘플링
bad_df=df[df['score'] <= 5]
good_df['score'] =1
bad_df['score'] = 0
df=pd.concat([good_df,bad_df], axis=0)

In [None]:
#2. 벡터화 count
X_train, X_test, y_train, y_test=train_test_split(df['content'],df['score'], test_size=0.2, random_state=0 ) 
cnt_vect=CountVectorizer(tokenizer=okt_tokenizer, stop_words=stopwords, max_features=1000)
cnt_vect.fit(X_train)
X_train_cnt=cnt_vect.transform(X_train)
X_test_cnt=cnt_vect.transform(X_test)

In [None]:
#3.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

rf_clf=RandomForestClassifier()
rf_clf.fit(X_train_cnt, y_train)
pred=rf_clf.predict(X_test_cnt)
print('예측 정확도 : ', np.sqrt(mean_squared_error(y_test, pred)))
print(r2_score(y_test, pred))

In [None]:
X_train, X_test, y_train, y_test=train_test_split(df['content'],df['score'], test_size=0.2, random_state=0 ) 
tf_vect=TfidfVectorizer(tokenizer=okt_tokenizer, stop_words=stopwords, max_features=1000)
tf_vect.fit(X_train)
X_train_tf=tf_vect.transform(X_train)
X_test_tf=tf_vect.transform(X_test)

In [None]:
rf_clf=RandomForestClassifier()
rf_clf.fit(X_train_tf, y_train)
pred=rf_clf.predict(X_test_tf)
print('예측 정확도 : ', np.sqrt(mean_squared_error(y_test, pred)))
print(r2_score(y_test, pred))

In [None]:
# 감성어휘사전 사용하기
centi_words=pd.read_csv('./data/polarity.csv')  # 꼬꼬마 형태소분서기 사용해야 함
centi_words['max.value'].unique()

In [None]:
centi_words[centi_words['ngram'].isin(result)]

In [None]:
from nltk.util import ngrams

ngram2=list(ngrams(result, n=3))
new_ngram2=[]
for n in ngram2:
    new_ngram2.append(':'.join(n))
print(new_ngram2)

ngram3=list(ngrams(result, n=3))
new_ngram3=[]
for n in ngram3:
    new_ngram3.append(':'.join(n))
print(new_ngram3)    

In [None]:
result_df=centi_words[centi_words['ngram'].isin(result)]

In [None]:
neg=result_df[result_df['max.value'] == 'NEG']['NEG'].sum()
pos=result_df[result_df['max.value'] == 'POS']['POS'].sum()

In [None]:
neg_length=result_df[result_df['max.value'] == 'NEG'].shape[0]
pos_length=result_df[result_df['max.value'] == 'POS'].shape[0]
final_value=(pos/pos_length) - (neg/neg_length)
final_value

In [None]:
ngram2=list(ngrams(result, n=3))
new_ngram2=[]
for n in ngram2:
    new_ngram2.append(':'.join(n))
print(new_ngram2)

ngram3=list(ngrams(result, n=3))
new_ngram3=[]
for n in ngram3:
    new_ngram3.append(':'.join(n))

In [None]:
words=ngram1+ ngram2 + ngram3


In [None]:
# 감정사전 함수 만들기(Rexicon)

In [None]:
result_df=centi_words[centi_words['ngram'].isin(result)]

In [None]:
neg_df=result_df[result_df['max.value'] == 'NEG']
pos_df=result_df[result_df['max.value'] == 'POS']
neg_value=neg_df['NEG'] /neg_df['NEG'].shape[0]
neg_value=neg_df['POS'] /neg_df['POS'].shape[0]
neg_length=result_df[result_df['max.value'] == 'NEG'].shape[0]
pos_length=result_df[result_df['max.value'] == 'POS'].shape[0]
final_value=(pos/pos_length) - (neg/neg_length)


In [None]:
text=input('하고픈 말 뭐임?')

In [None]:
if final_value >= 0:
    print('긍정문')
else:
    print('부정문')

In [None]:
# 군집 + 토픽모델  ==> 노래 가사 데이터

df=pd.read_csv('./data/rawdata_년도별 인기노래 가사.tsv', sep='\t')

In [None]:
df.info()

In [None]:
df[df['Title'].str.contains('사랑')]

In [None]:
df['Lyric']

In [None]:
def cleaning(text):
    p = re.compile('[^ a-zA-Zㄱ-\가-힣]+')
    result = p.sub('',text).lower()
    return result

In [None]:
X=df['Lyric'].apply(lambda x : re.sub('[^ a-zA-Zㄱ-ㅣ가-힣]+', '', x))

In [None]:
from nltk.corpus import stopwords
eng_stop=stopwords.words('english')
with open('data/stopword.txt','r',encoding='utf-8') as f:
    word = f.read()
    kor_stop = word.split('\n')

In [None]:
stopwords=eng_stop + kor_stop

In [None]:
def okt_tokenizer(text):   # 자르기 + 조사 삭제
    words=okt.pos(text, stem=True)
    filtered_words=[]
    for word, pos in words:
        if pos not in ['Josa']:
            filtered_words.append(word)
    return filtered_words

In [463]:
count_vect = CountVectorizer(max_df=0.85, min_df=3, max_features=1000, stop_words=stopwords, token_pattern = okt_tokenizer,ngram_range=(1,2))
X_cnt=cnt_vect.fit_transform(X)

In [None]:
cnt_vect.get_feature_names_out()

In [None]:
#### from sklearn.cluster import KMeans
kmeans=KMeans(n_clusters=5, max_iter=2000, random_state=0)
kmeans.fit(X_cnt)

In [None]:
pd.Series(kmeans.labels_).value_counts()

In [None]:
df=X.to_frame('Lyric')
df['cluster']=kmeans.labels_
df[df['cluster']==0]

In [None]:
df[df['cluster']==2]

In [None]:
from sklearn.mixture import GaussianMixture
gmm= GaussianMixture(n_components=5, random_state=0)
gmm.fit(X_cnt.toarray())

In [None]:
df['gmm']=gmm.predict(X_cnt.toarray())

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda=LatentDirichletAllocation(n_components=5)
lda.fit(X_cnt[df[df['cluster']==4].index])  # **

display_topics(lda,cnt_vect.get_feature_names_out(), 15)

In [None]:
def display_topics(model, feature_names, no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('Topic #',topic_index)
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes=topic_word_indexes[:no_top_words]
        feature_concat = ' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
feature_names = count_vect.get_feature_names_out()
display_topics(lda, feature_names, 15)   

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda=LatentDirichletAllocation(n_components=7)
lda.fit(X_cnt[df[df['cluster']==0].index])  # **

display_topics(lda,cnt_vect.get_feature_names_out(), 15)

In [None]:
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    cluster_details = {}
    centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:,::-1]
    for cluster_num in range(clusters_num):
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster'] = cluster_num
        top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]
        top_features = [ feature_names[ind] for ind in top_feature_indexes ]
        top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()
        cluster_details[cluster_num]['top_features'] = top_features
        cluster_details[cluster_num]['top_features_value'] = top_feature_values
    return cluster_details

In [None]:
def print_cluster_details(cluster_details):
    for cluster_num, cluster_detail in cluster_details.items():
        print('# Cluster {0}'.format(cluster_num))
        print('Top features:', cluster_detail['top_features'])
clust_centers = kmeans.cluster_centers_

# 위 두 함수 불러오기
feature_names = tfidf_vect.get_feature_names_out()
cluster_details = get_cluster_details(cluster_model=kmeans, cluster_data=df,feature_names=feature_names, clusters_num=4, top_n_features=10 )
print_cluster_details(cluster_details)

In [None]:
LatentDirichletAllocation(n_components=5)
lda.fit(X_cnt)

In [None]:
# 유사도 분석 : 코사인 유사도

from sklearn.metrics.pairwise import cosine_similarity

similary = cosine_similarity(X_cnt)
index=np.argsort(similary, axis=1)[:,:: -1]

In [None]:
df[df['Title'].str.contains('너의 의미')]

In [None]:
similary = cosine_similarity(X_cnt[1994], X_cnt)

In [None]:
np.argsort(similary, axis=1)[:,:: -1]

In [None]:
np.sort(similary, axis=1)[:,:: -1]

In [None]:
df.loc[1994, 'Lyric']

In [None]:
df.loc[678, 'Lyric']

In [None]:
# 다른 노래 밖에서
new_lyric='oh 나 왜 이래 널 바라보면 나 왜 이래 표정관리'
new_lyric=cleaning(new_lyric)
test=cnt_vect.transform(new_lyric)
similary = cosine_similarity(new_lyric, X_cnt)
index= np.argsort(similary, axis=1)[:,:: -1][0,0]
value=np.sort(similary, axis=1)[:,:: -1][0,0]
print(value)
df.loc[index, ['Title','Singer','Lyric']]


In [469]:
df_bad=pd.read_csv('./data/악플 데이터.csv')
df_bad.head(30)

Unnamed: 0,content,target
0,이종석 한효주 나오는 드라마 이후로 드라마 안봤다. 2년전인가?? 좀 신선했었지. ...,0
1,씨바알..노무노무 술프노... 오늘 저녁은 꽂등심이다ㅠㅜ,0
2,짱깨 꺼라ㅡ패쓰,0
3,그들의 사생활 ~ 고인이된 설리를 위해서라도 모두 조용하길 지금 누굴 탓한다고 무슨...,1
4,아무리 법이 뭣같아도 무슨 자격으로 개인의 신상정보를 불특정 다수에게 공개하는지 도...,1
5,다음도 들어와라하고...다른 의견내는 유튜버 목 자르고....추아줌마 꺼 여기저기서...,0
6,여자들도 아무한테나 자기야라고하는사람있는데 그것도성희롱인것같은데요,1
7,나경아 젖깐사진이나 인스타에 좀올려라 물좀빼게,0
8,어린시절 가정교육 못 받은 애들은 절대 그 본성을 숨길수없지,0
9,지연이 얼굴은 더 배우같네...,1


In [473]:
df=pd.read_csv('./data/unsmile_data.csv')

In [478]:
# 원핫인코딩 되어있는 것 같은 데// 종속변수드링 원핫처럼 배열 되어 있음

y=df.iloc[:,1:]
y_label=pd.DataFrame( {'target':y.columns})                 # 데이터프레임으로 만들어서 넣어주면 좋음

In [479]:
from sklearn.preprocessing import OneHotEncoder
oh_enc=OneHotEncoder()
oh_enc.fit(y_label)

In [482]:
oh_enc.categories_[0]

array(['clean', '개인지칭', '기타 혐오', '남성', '성소수자', '악플/욕설', '여성/가족', '연령',
       '인종/국적', '종교', '지역'], dtype=object)

In [484]:
y_oh=y[oh_enc.categories_[0]]

In [None]:
oh_en.invers

In [490]:
y_oh.drop([18740], inplace=True)

In [491]:
data= df[['문장']]
data[['정답']]=oh_enc.inverse_transform(y_oh)
data

ValueError: Length of values (18739) does not match length of index (18742)