# 문장에서 일치하는 단어 찾기

20.02.17

문장으로 이루어진 original_text / voice_text가 있고, 단어로 이루어진 Meta / Keytalk가 있다. 이때 문장 속에서 일치하는 단어 목록을 찾아내면 되는 작업이다.   
  
Ex)  
  
문장 1: 나는 맛있는 사과를 먹는다.    
문장 2 : 그는 달콤한 바나나를 먹었다.   

단어 목록 : 사과, 바나나, 멜론, 맛있는, 달콤한, 새콤한    

문장 1에 대한 결과 : [맛있는, 사과]  
문장 2에 대한 결과 : [달콤한, 바나나]    



In [2]:
import pandas as pd
import re
import warnings
warnings.filterwarnings(action='ignore')
from tqdm import tqdm_notebook

## Preprocessing

#### 발화문

In [67]:
# 데이터 불러오기
sentence_1 = pd.read_excel('유사동의어작업.xlsx', sheet_name = 'aos 조합발화문_워싱대상제외', nrows = 8160)[['vt_pk', 'original_text', 'voice_text']]
sentence_2 = pd.read_excel('유사동의어작업.xlsx', sheet_name = 'AOS워싱키토크_조합발화문')[['vt_pk', 'original_text', 'voice_text']]

sentence = pd.concat([sentence_1, sentence_2], axis = 0).reset_index(drop = True)

# 인덱스 붙이기
sentence['No'] = sentence.index.values

# 소문자로
sentence['original'] = sentence['original_text'].str.lower()
sentence['voice'] = sentence['voice_text'].str.lower()

# . 제거
sentence['original'] = sentence['original_text'].str.replace('.', '')
sentence['voice'] = sentence['voice_text'].str.replace('.', '')

# 데이터 예시
sentence[['original', 'voice']].sample()

Unnamed: 0,original,voice
6538,Suggestions for some movies with a surreal ima...,suggestions for some movies with a surreal ima...


#### 메타

In [68]:
# 데이터 불러오기
meta = pd.read_csv('Meta.csv')

# 소문자로
meta['meta'] = meta['kl_name'].str.lower()
meta['meta'] = meta['meta'].str.replace('.', '')

# 데이터 예시
meta[['meta']].sample(3)

Unnamed: 0,meta
2259,vicellous reon shannon
2722,belén rueda
7511,howard mccain


#### 키토크

In [69]:
# 데이터 불러오기
key = pd.read_csv('Keytalk.csv')

# 소문자로
key['key'] = key['kl_name'].str.lower()
key['key'] = key['key'].str.replace('.', '')

# 키토크 고유값 뽑아내기
key_original = list(key['key'].unique())

# 데이터 예시
key[['key']].sample(3)

Unnamed: 0,key
3541,tenacious
10373,rather thin
3590,limited role


In [12]:
print('전체 키토크 수 : {}\n고유 키토크 수 : {}'.format(len(key['key']), len(key_original)))

전체 키토크 수 : 13458
고유 키토크 수 : 7999


---    

'단어 in 문장'을 이용   
  
But)   
문장1 : butterfly fly away  
단어 : butterfly, butter, fly, way  
  
내가 원하는 결과 : butterfly, fly  
'단어 in 문장'으로 했을 때 결과 : butterfly, butter, fly, way  
  
→ 애매하게 겹치는 단어도 매칭이 된다.      


&nbsp;   
★ 해결방안      
'공백 단어 공백'일 때 확실하게 들어있는 것은 1  
'단어 공백'&'공백 단어'일 때 확인해야하는 것은 2  
  
Ex)      
문장 1 : ' butterfly fly away '  
단어 : butterfly, butter, fly, way  
  
결과 : [[butterfly, 1], [butter, 2], [fly, 1], [way, 2]]  
  
→ 1로 나온거는 다 선택하고, 2로 나온건 확인한다.    
  

In [70]:
def wis(sent, word):   # word in sentence
    sent = list(sent.unique())
    word = list(word.unique())

    result = []
    
    for i in tqdm_notebook(word):
        for text in sent:
            if re.search(f' {i} ', text):
                result.append([i, text])
                
    result = pd.DataFrame(result)
    return result


def binder(df, oov, mok):
    binded = pd.DataFrame()

    if oov == 'original':
        sentence_unique = list(df['original'].unique())
    else:
        sentence_unique = list(df['voice'].unique())
        
    for i in tqdm_notebook(range(len(sentence_unique))):
        temp = pd.DataFrame({oov:sentence_unique[i], 
                            f'{mok}':[list(df[df[oov]==sentence_unique[i]]['kl_name'])]})
        binded = pd.concat([binded, temp], axis = 0)

    return binded

앞 뒤 공백 생성

In [71]:
sentence['original'] = ' ' + sentence['original']  + ' '
sentence['voice'] = ' ' + sentence['voice'] + ' '

### 메타 오리지널

문장 속에 있는 메타 찾기

In [72]:
meta_original = wis(sentence['original'], meta['meta'])

meta_original_2 = pd.DataFrame({'meta':meta_original[0], 'original':meta_original[1]})
meta_original_2 = pd.merge(meta_original_2, meta, on = 'meta', how = 'left')

HBox(children=(IntProgress(value=0, max=8049), HTML(value='')))




Sentence가 같은 값에 따라 meta값 묶어주기

In [73]:
original_meta = binder(meta_original_2, 'original', 'meta_o')

original_meta.sample(3)

HBox(children=(IntProgress(value=0, max=8334), HTML(value='')))




Unnamed: 0,original,meta_o
0,I want to see a go straight to dvd movie deal...,[kung fu]
0,a movie about a well-rounded character played...,[Alia Shawkat]
0,A comedy movie by will speck,"[Comedy, Will Speck]"


sentence와 합치기

In [74]:
sentence_result = pd.merge(sentence, original_meta, on = 'original', how = 'left')

---
### 메타 보이스

문장 속에 있는 메타 찾기

In [75]:
meta_voice = wis(sentence['voice'], meta['meta'])

meta_voice_2 = pd.DataFrame({'meta':meta_voice[0], 'voice':meta_voice[1]})
meta_voice_2 = pd.merge(meta_voice_2, meta, on = 'meta', how = 'left')

HBox(children=(IntProgress(value=0, max=8049), HTML(value='')))




Sentence가 같은 값에 따라 meta값 묶어주기

In [76]:
voice_meta = binder(meta_voice_2, 'voice', 'meta_v')

voice_meta.sample(3)

HBox(children=(IntProgress(value=0, max=2166), HTML(value='')))




Unnamed: 0,voice,meta_v
0,and about marriage movie released in 2001,"[2001, about marriage]"
0,Can you recommend me a type history movie,[History]
0,find a zombie movie with sense of dread,[zombie]


sentence와 합치기

In [77]:
sentence_result = pd.merge(sentence_result, voice_meta, on = 'voice', how = 'left')

In [78]:
sentence_result.to_excel('메타 결과_원본.xlsx', index = False)

--- 
### 키토크 오리지널  

문장 속에 있는 키토크 찾기

In [16]:
key_original = wis(sentence['original'], key['key'])

key_original_2 = pd.DataFrame({'key':key_original[0], 'original':key_original[1]})
key_original_2 = pd.merge(key_original_2, key, on = 'key', how = 'left')

HBox(children=(IntProgress(value=0, max=7999), HTML(value='')))




In [17]:
key_original_2.head()

Unnamed: 0,key,original,kl_pk,kl_name,kl_category
0,masterpiece,can you recommend me an artistic masterpiece ...,1.0,masterpiece,Opinion
1,masterpiece,can you recommend me an artistic masterpiece ...,,masterpiece,
2,masterpiece,something timeless masterpiece with a rating ...,1.0,masterpiece,Opinion
3,masterpiece,something timeless masterpiece with a rating ...,,masterpiece,
4,masterpiece,something epic masterpiece with a rating rott...,1.0,masterpiece,Opinion


Sentence가 같은 값에 따라 meta값 묶어주기

In [18]:
original_key = binder(key_original_2, 'original', 'key_o')

HBox(children=(IntProgress(value=0, max=6037), HTML(value='')))




In [19]:
original_key.sample(3)

Unnamed: 0,original,key_o
0,get me a list of movie that i can find a pred...,"[predictable ending, predictable ending, predi..."
0,which movie has a plodding pace badly paced,"[plodding, plodding, plodding pace, plodding p..."
0,something unique with a rating rotten tomato:...,"[unique, unique]"


sentence와 합치기

In [20]:
sentence_result = pd.merge(sentence_result, original_key, on = 'original', how = 'left')

--- 
### 키토크 보이스

문장 속에 있는 키토크 찾기

In [21]:
key_voice = wis(sentence['voice'], key['key'])

key_voice_2 = pd.DataFrame({'key':key_voice[0], 'voice':key_voice[1]})
key_voice_2 = pd.merge(key_voice_2, key, on = 'key', how = 'left')

HBox(children=(IntProgress(value=0, max=7999), HTML(value='')))




In [22]:
key_voice_2.head()

Unnamed: 0,key,voice,kl_pk,kl_name,kl_category
0,masterpiece,can you recommend me in artistic masterpiece ...,1.0,masterpiece,Opinion
1,masterpiece,can you recommend me in artistic masterpiece ...,,masterpiece,
2,masterpiece,something timeless masterpiece with a rating ...,1.0,masterpiece,Opinion
3,masterpiece,something timeless masterpiece with a rating ...,,masterpiece,
4,masterpiece,something epic masterpiece with a rating rott...,1.0,masterpiece,Opinion


Sentence가 같은 값에 따라 meta값 묶어주기

In [23]:
voice_key = binder(key_voice_2, 'voice', 'key_v')

HBox(children=(IntProgress(value=0, max=4928), HTML(value='')))




In [24]:
voice_key.sample(3)

Unnamed: 0,voice,key_v
0,can you recommend me in action orientated his...,"[action orientated, action orientated]"
0,a movie where pat short place forlorn character,"[forlorn, forlorn]"
0,which movie has a repellent in splot,"[repellent, repellent]"


sentence와 합치기

In [25]:
sentence_result = pd.merge(sentence_result, voice_key, on = 'voice', how = 'left')

### 내보내기

In [26]:
sentence_result[['original_text', 'meta_o', 'key_o', 'voice_text', 'meta_v', 'key_v']].sample(3)

Unnamed: 0,original_text,meta_o,key_o,voice_text,meta_v,key_v
7216,Find an anguilla movie with handsome photograph,[Anguilla],"[handsome, handsome, handsome photograph]",find an angle a movie with handsome photograph,,"[handsome, handsome, handsome photograph]"
2747,Can you recommend me an impelling and reinforc...,[Comedy],"[impelling and reinforcing, impelling and rein...",Can you recommend me in impelling and reinforc...,[Comedy],"[impelling and reinforcing, impelling and rein..."
5389,Something less compelling with a rating rotten...,[Rotten Tomato: 100%],"[compelling, compelling, less compelling, less...",Something less compelling with a rating Rotten...,,"[compelling, compelling, less compelling, less..."


In [27]:
sentence_result.to_excel('result.xlsx', index = False, encoding = 'utf-8')