In [2]:
# Import the core libraries used throughout the notebook
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer   # computes TF-IDF weights automatically


# Show every column when displaying a DataFrame (no column elision)
pd.set_option('display.max_columns', None)

In [3]:
# Load the practice corpus and print a structural summary
# (column names, dtypes, non-null counts).
df = pd.read_csv("nlp_data/practice.csv")
df.info()

# Discard the 'index' column that was read from the CSV.
df = df.drop(columns='index')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54879 entries, 0 to 54878
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   54879 non-null  int64 
 1   text    54879 non-null  object
 2   score   54879 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


Unnamed: 0,text,score
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...
54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,"I told my plan to the captain, and between us ...",4
54876,"""Your sincere well-wisher, friend, and sister...",1
54877,“Then you wanted me to lend you money?”,3


In [4]:
# Remove the double-quote characters that wrap quoted speech:
# curly opening/closing quotes (“ ”) and the straight quote (").
# A single vectorised regex pass replaces the original row-by-row
# `df.loc[idx, 'text'] = val` loop, which was slow on ~55k rows and only
# addressed the right rows while the index was exactly 0..n-1.
df['text'] = df['text'].str.replace('[“”"]', '', regex=True)

df

Unnamed: 0,text,score
0,"He was almost choking. There was so much, so m...",3
1,"Your sister asked for it, I suppose?",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"Have mercy, gentlemen! odin flung up his hands...",3
...,...,...
54874,"Is that you, Mr. Smith? odin whispered. I hard...",2
54875,"I told my plan to the captain, and between us ...",4
54876,"Your sincere well-wisher, friend, and sister,...",1
54877,Then you wanted me to lend you money?,3


In [5]:
# Check which score labels occur in the data.
# (Not the last expression of the cell, so the notebook does not display it.)
df['score'].unique()


def _texts_with_score(frame, score):
    """Return a new one-column ('text') DataFrame with the rows of ``frame``
    whose 'score' equals ``score``, re-indexed from 0."""
    return frame.loc[frame['score'] == score, ['text']].reset_index(drop=True)


# One text-only frame per score label 0..4. The names s0..s4 are kept because
# later cells refer to them; the helper replaces five copy-pasted stanzas.
s0, s1, s2, s3, s4 = (_texts_with_score(df, score) for score in range(5))

In [6]:
# Display the texts whose score is 3
s3

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"Have mercy, gentlemen! odin flung up his hands..."
2,"Not to pay him was impossible, considering his..."
3,Indeed she didn’t. By God I swear she didn’t c...
4,And why are you so dressed up? What a curious ...
...,...
15058,"Is that you, odin Yegorytch? asked odin."
15059,odin was waiting for him at the end of the pas...
15060,odin looked at him in silence.
15061,"Certainly I will be so good, gentlemen."


In [7]:
### 텍스트 전처리

In [8]:
# Word tokenisation with Keras' text_to_word_sequence, chosen because it
# lower-cases the text and strips punctuation in the same step.
import nltk
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Apply the tokenizer directly to the 'text' column. The earlier row-wise form
# `s1.apply(lambda row: text_to_word_sequence(row['text']), axis=1)` yields the
# same token lists but builds a temporary Series for every row.
s1['text'] = s1['text'].apply(text_to_word_sequence)

> **Question1**   
여기서 `row`의 역할은? → `lambda`의 매개변수 이름일 뿐이며 `x`라고 써도 동일하다. `axis=1`이므로 함수가 행 단위로 적용되어, `row`에는 DataFrame의 각 행이 하나씩 전달된다.

In [9]:
# Check the result of tokenisation
s1.head()

Unnamed: 0,text
0,"[she, was, engaged, one, day, as, she, walked,..."
1,"[it, suited, odin, best, to, think, odin, the,..."
2,"[the, bustle, in, the, vestibule, as, she, pas..."
3,"[oh, dear, no, said, her, companion]"
4,"[from, this, day, odin, grew, more, comfortabl..."


In [10]:
# 클리닝 - 불용어 제거
from nltk.corpus import stopwords
from konlpy.tag import Okt

In [11]:
# Inspect NLTK's stop-word list
# stopwords.words("english") returns the English stop-word list defined by NLTK
stop_words_list = stopwords.words('english')
print('불용어 개수 :', len(stop_words_list))          # label reads "number of stop words"
print('불용어 10개 출력 :',stop_words_list[:10])     # label reads "first 10 stop words"

불용어 개수 : 179
불용어 10개 출력 : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [12]:
# Remove NLTK English stop words and every token of length 2 or less.
stop = stopwords.words('english')
# Membership tests against a list scan it linearly for every token;
# a frozenset gives constant-time lookups with identical results.
stop_lookup = frozenset(stop)
# Both filters are applied in one pass over each token list.
s1['text'] = s1['text'].apply(
    lambda tokens: [word for word in tokens if word not in stop_lookup and len(word) > 2]
)

In [13]:
# Check the result of stop-word / short-token removal
s1.head()

Unnamed: 0,text
0,"[engaged, one, day, walked, perusing, jane’s, ..."
1,"[suited, odin, best, think, odin, one, preferr..."
2,"[bustle, vestibule, passed, along, inner, lobb..."
3,"[dear, said, companion]"
4,"[day, odin, grew, comfortable, felt, friend, k..."


In [14]:
# Normalisation - lower-casing: already done during tokenisation above.
# Normalisation - lemmatisation: reduce each token to its lemma, treating it as a verb (pos='v').
from nltk.stem import WordNetLemmatizer

# Build the lemmatizer once and reuse it; constructing a new
# WordNetLemmatizer() for every single word is loop-invariant work.
lemmatizer = WordNetLemmatizer()
s1['text'] = s1['text'].apply(
    lambda tokens: [lemmatizer.lemmatize(word, pos='v') for word in tokens]
)

In [15]:
# Check the result of lemmatisation
s1.head()

Unnamed: 0,text
0,"[engage, one, day, walk, peruse, jane’s, last,..."
1,"[suit, odin, best, think, odin, one, prefer, a..."
2,"[bustle, vestibule, pass, along, inner, lobby,..."
3,"[dear, say, companion]"
4,"[day, odin, grow, comfortable, felt, friend, k..."


In [16]:
# Keep the tokenised documents in their own Series.
# .copy() makes this an independent snapshot, so the token lists remain
# available after s1['text'] is overwritten with joined strings further down
# (the Word2Vec / FastText cells need lists of tokens, not strings).
tokenized_doc = s1['text'].copy()
tokenized_doc

0       [engage, one, day, walk, peruse, jane’s, last,...
1       [suit, odin, best, think, odin, one, prefer, a...
2       [bustle, vestibule, pass, along, inner, lobby,...
3                                  [dear, say, companion]
4       [day, odin, grow, comfortable, felt, friend, k...
                              ...                        
7217    [mind, make, several, point, resolution, form,...
7218    [well, house, never, sit, anywhere, else, swee...
7219    [appear, presently, look, little, irritable, d...
7220    [dear, odin, talk, quite, idly, pray, would, b...
7221    [sincere, well, wisher, friend, sister, lucy, ...
Name: text, Length: 7222, dtype: object

# 빈도 기반(CountVectorizer, TfidfVectorizer) 토픽 모델링(LDA) 
참고 : https://dianakang.tistory.com/51

In [17]:
### TF-IDF 행렬 만들기
# TfidfVectorizer는 기본적으로 토큰화가 되어있지 않은 텍스트 데이터를 입력으로 사용한다.
# 따라서 토큰화 작업을 역으로 취소하는 역토큰화 작업을 먼저 수행한다.

In [18]:
# De-tokenisation: join every token list back into one space-separated string.
# Iterating over the Series directly avoids the label lookups
# `tokenized_doc[i]` for i in range(len(s1)), which only work while the
# index is exactly 0..n-1.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

s1['text'] = detokenized_doc    # store the joined strings back into s1['text']

In [19]:
# Check the result of de-tokenisation
s1.head()

Unnamed: 0,text
0,engage one day walk peruse jane’s last letter ...
1,suit odin best think odin one prefer account o...
2,bustle vestibule pass along inner lobby assure...
3,dear say companion
4,day odin grow comfortable felt friend kindness...


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model whose vocabulary is capped at 1,000 terms
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)

# Learn the vocabulary from s1['text'] and build the TF-IDF matrix in one step
X = vectorizer.fit_transform(s1['text'])
X

<7222x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 102786 stored elements in Compressed Sparse Row format>

In [21]:
# Inspect the TF-IDF matrix

print(X.shape)        # size of the matrix: (documents, terms)
print(X.toarray())    # dense view of the matrix (NumPy abbreviates the printout)
# The learned vocabulary is an attribute of the fitted vectorizer, not of the
# sparse matrix X, which is why `X.vocabulary_` raised an error.
# Show a handful of term -> column-index pairs instead of all 1,000.
print(list(vectorizer.vocabulary_.items())[:10])

(7222, 1000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [22]:
### 토픽 모델링

In [23]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA configured for 10 topics with learning_method='online' and a single
# iteration (max_iter=1); random_state fixes the seed.
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=777,
    max_iter=1,
)

In [24]:
# Fit the LDA model on the TF-IDF matrix X and keep the transformed output in lda_top
lda_top=lda_model.fit_transform(X)

In [25]:
print(lda_model.components_.shape)   # shape of components_: one row per topic, one column per term
print(lda_model.components_)         # values stored in components_

(10, 1000)
[[ 0.10845964  0.65241891  1.4522278  ...  0.13417724  0.10654141
   3.39756441]
 [ 6.88278946  4.77516893  5.2312935  ... 11.48733346  0.13762528
  17.45833624]
 [ 0.10458893 21.06876471  6.03302326 ... 26.52979377 16.41294751
  52.00577304]
 ...
 [ 0.11050087  0.10458754  0.10552298 ...  0.54538866  0.32738656
   0.23060054]
 [ 0.10451132  0.11719532  0.12215923 ... 12.03193766  0.10543599
   0.32571634]
 [ 0.10439871  0.10541486  0.1059895  ...  0.13734941  0.10794438
   0.19667739]]


In [26]:
# Vocabulary learned by the vectorizer: the 1,000 terms, one per TF-IDF column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out().tolist()   # plain list of str, as before
else:
    terms = vectorizer.get_feature_names()
terms[:10]



['abbey',
 'able',
 'absence',
 'absent',
 'absolutely',
 'accept',
 'account',
 'acknowledge',
 'acquaint',
 'acquaintance']

In [27]:
# For each topic, print its n most strongly associated terms together with their weights
def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: iterable of per-topic weight arrays (e.g.
            ``lda_model.components_``); each array must provide ``argsort()``
            and its elements must provide ``round()``.
        feature_names: sequence mapping a column index to its term.
        n: number of terms to list per topic (default 5).

    Returns:
        None. One line per topic is written to stdout, topics numbered from 1,
        each as a list of ``(term, weight rounded to 2 decimals)`` tuples.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort sorts ascending, so reverse it and keep the first n positions
        strongest = weights.argsort()[::-1][:n]
        ranked = [(feature_names[pos], weights[pos].round(2)) for pos in strongest]
        print("Topic %d:" % topic_no, ranked)
# Print the top terms (default n=5) for each fitted LDA topic
get_topics(lda_model.components_, terms)

Topic 1: [('odin', 112.16), ('answer', 44.26), ('eye', 39.71), ('say', 37.03), ('little', 33.63)]
Topic 2: [('odin', 162.57), ('say', 63.49), ('ask', 56.14), ('mrs', 50.1), ('miss', 48.59)]
Topic 3: [('odin', 266.16), ('say', 98.22), ('think', 88.32), ('know', 87.4), ('good', 77.71)]
Topic 4: [('odin', 86.54), ('say', 80.24), ('room', 33.58), ('sir', 33.12), ('walk', 27.05)]
Topic 5: [('yes', 54.4), ('come', 25.8), ('reply', 19.55), ('lizzy', 13.19), ('perfectly', 12.99)]
Topic 6: [('odin', 72.94), ('tell', 35.0), ('say', 30.22), ('letter', 29.36), ('understand', 23.56)]
Topic 7: [('true', 22.66), ('colour', 21.8), ('bow', 15.98), ('assure', 13.62), ('remember', 13.2)]
Topic 8: [('smile', 36.37), ('mrs', 21.42), ('pray', 19.84), ('say', 19.71), ('odin', 17.13)]
Topic 9: [('shall', 43.53), ('read', 36.04), ('think', 35.92), ('sure', 27.94), ('odin', 27.43)]
Topic 10: [('afraid', 21.76), ('away', 17.77), ('add', 16.34), ('don', 14.7), ('agree', 11.43)]


------
------

# 워드임베딩

In [45]:
# Train Word2Vec on the tokenised documents (sg=0)
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# The embedding-dimension keyword is `size` in gensim 3.x but was renamed to
# `vector_size` in gensim 4.0; pick the spelling the installed release accepts
# so the cell runs on either.
_dim_kw = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

model = Word2Vec(sentences=tokenized_doc, window=5, min_count=5, workers=4, sg=0, **{_dim_kw: 100})

In [48]:
# model.wv.most_similar: list the words most similar to the given word
model_result = model.wv.most_similar("one")
print(model_result)

[('life', 0.999595046043396), ('years', 0.9993919134140015), ('five', 0.9993911981582642), ('society', 0.9993870854377747), ('family', 0.9993654489517212), ('year', 0.99935382604599), ('among', 0.9993314743041992), ('character', 0.9993167519569397), ('another', 0.9993091821670532), ('happiness', 0.9992923736572266)]


In [49]:
# Save the Word2Vec vectors in word2vec format and load them back
model.wv.save_word2vec_format('eng_w2v') # save the vectors to the file 'eng_w2v'
loaded_model = KeyedVectors.load_word2vec_format("eng_w2v") # load the vectors from that file

In [50]:
# Query similar words again, this time through the reloaded vectors
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.990466833114624), ('people', 0.9884623289108276), ('ladies', 0.9876526594161987), ('men', 0.9854093194007874), ('young', 0.9842920899391174), ('good', 0.9831441044807434), ('agreeable', 0.98298579454422), ('sensible', 0.9825071096420288), ('love', 0.9822222590446472), ('sort', 0.9816296696662903)]


In [None]:
## FastText
import gensim
from gensim.models import FastText

# NOTE(review): the output saved with this cell shows exceptions raised inside
# gensim worker threads (`_worker_loop`) and the cell has no execution count.
# The traceback is cut off, so the root cause is unknown — TODO investigate.

# `size` (gensim 3.x) was renamed to `vector_size` in gensim 4.0; pick the
# keyword the installed release accepts.
_dim_kw = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

model_FT = FastText(tokenized_doc, window=5, min_count=5, workers=4, sg=1, **{_dim_kw: 100})
model_FT.wv.most_similar("once")   # compute and display the words most similar to "once"

Exception in thread Exception in thread Exception in thread Thread-85:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\data\lib\threading.py", line 980, in _bootstrap_inner
Exception in thread Thread-87:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\data\lib\threading.py", line 980, in _bootstrap_inner
Thread-88:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\data\lib\threading.py", line 980, in _bootstrap_inner
    self.run()
  File "C:\ProgramData\Anaconda3\envs\data\lib\threading.py", line 917, in run
Thread-86    self.run()
  File "C:\ProgramData\Anaconda3\envs\data\lib\threading.py", line 917, in run
:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\data\lib\threading.py", line 980, in _bootstrap_inner
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\envs\data\lib\site-packages\gensim\models\base_any2vec.py", line 211, in _worker_loop
    self.run()
