In [86]:
import re
import time
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import os
import tqdm
import pandas as pd
import numpy as np
import pickle

Read File

In [7]:
file_name = 'WHO_FAQ.csv'
df = pd.read_csv(file_name)
df

Unnamed: 0,Context,Answer
0,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are fever...
1,What are the symptoms of COVID-19?,Some people become infected but don鈥檛 develop ...
2,What are the symptoms of COVID-19?,Around 1 out of every 6 people who gets COVID-...
3,What are the symptoms of COVID-19?,"Older people, and those with underlying medica..."
4,How does COVID-19 spread?,People can catch COVID-19 from others who have...
5,How does COVID-19 spread?,People can also catch COVID-19 if they breathe...
6,Can the virus that causes COVID-19 be transmit...,Studies to date suggest that the virus that ca...
7,Can CoVID-19 be caught from a person who has n...,The main way the disease spreads is through re...
8,Can CoVID-19 be caught from a person who has n...,Many people with COVID-19 experience only mild...
9,Can I catch COVID-19 from the feces of someone...,The risk of catching COVID-19 from the feces o...


取问题作为单独的list(用来训练similarity模型)

In [19]:
questions_raw = list(set(df['Context']))
questions_raw[:5]

['Are there any medicines or therapies that can prevent or cure COVID-19?',
 'Are smokers and tobacco users at higher risk of COVID-19 infection?',
 'Is COVID-19 the same as SARS?',
 'Who is at risk of developing severe illness?',
 'What medical interventions are available for COVID-19 and influenza viruses?']

创建问题-答案序号对，以及答案序号-答案对。  
这样处理的话，当用户键入问题时，会定位到问题库(也就是csv中的内容)相应的问题，然后通过问题随机选一个相应的答案序号，再通过答案序号定位所需答案。  
保存两个字典。

In [97]:
qa_dic = {}
num_a_dic = {}
for idx, row in df.iterrows():
    num_a_dic[idx] = row['Answer']
    context = row['Context']
    if context not in qa_dic:
        qa_dic[context] = []
    qa_dic[context].append(idx)
    
# save
with open('qa_dic.pickle', 'wb') as f:
    pickle.dump(qa_dic, f)
with open('num_a_dic.pickle', 'wb') as f:
    pickle.dump(num_a_dic, f)
qa_dic, num_a_dic

({'What are the symptoms of COVID-19?': [0, 1, 2, 3],
  'How does COVID-19 spread?': [4, 5],
  'Can the virus that causes COVID-19 be transmitted through the air?': [6],
  'Can CoVID-19 be caught from a person who has no symptoms?': [7, 8],
  'Can I catch COVID-19 from the feces of someone with the disease?': [9],
  'What can I do to protect myself and prevent the spread of disease?': [10,
   11,
   12,
   13,
   14,
   15,
   16],
  'What can I do to protect myself andProtection measures for persons who are in or have recently visited (past 14 days) areas where COVID-19 is spreading prevent the spread of disease?': [17],
  'Protection measures for persons who are in or have recently visited (past 14 days) areas where COVID-19 is spreading': [18],
  'How likely am I to catch COVID-19?': [19, 20, 21],
  'Should I worry about COVID-19': [22, 23],
  'Who is at risk of developing severe illness?': [24],
  'Are antibiotics effective in preventing or treating the COVID-19?': [25,
   26],
  '

创建完整的文本list(用来预训练词向量)

In [30]:
alltext_raw = list(df['Context']) + list(df['Answer'])
alltext_raw = list(set(alltext_raw))
len(alltext_raw), alltext_raw[:3]

(125,
 ['Avoid touching eyes, nose and mouth.',
  'At present there is no evidence that they are at higher risk of severe illness than the general population.',
  'Are smokers and tobacco users at higher risk of COVID-19 infection?'])

### Preprocessing sentences
分为三步。
1. 分词+小写化+stemming。英文用空格分隔单词，之间利用空格分词；所有字母均小写化，以统一词汇；stemming词干提取，抽取词的词干形式，使不同形式但表意一致的单词表现为同一形式。
2. 使用nltk库删除停用词。去除停用词是为了让模型忽略那些通用的词汇，而这些通用词汇通常对句子表意没有关联。
3. 填充句子，使所有句子长度相同。创建单词词典word_to_token(单词对应序号),token_to_word(序号对应单词)。

In [20]:
# Get stopwords list & punctuations list
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# punctuations
english_punctuations = [',', '--', '<', '>', '.', ':', ';', '?', '(', ')', '[', ']', '&', '\'\'', '``', '!', '*', '@', '#', '$', '%']
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [94]:
# (split words + lowercase + stem) + remove stopwords
stemmer = LancasterStemmer()

def preprocess_sen(sen):
    # split words + stem
    wordlist = [stemmer.stem(word.lower()) for word in word_tokenize(q)]
    # wordlist = data['review_content'].lower().split()

    # remove stopwords
    outlist = []  # 每个句子输出结果 - list
    for word in wordlist:
        if word not in stopwords and word not in english_punctuations:
            if word != '\t' and word != '\n' and word != ' ':
                outlist.append(word)
    return outlist

questions = []    # all questions set
for q in tqdm.tqdm(questions_raw):
    outlist = preprocess_sen(q)
    questions.append(outlist)
    
#save
with open('questions.pickle', 'wb') as f:
    pickle.dump(questions, f)
with open('questionsraw.pickle', 'wb') as f:
    pickle.dump(questions_raw, f)


alltext = []
for q in tqdm.tqdm(alltext_raw):
    outlist = preprocess_sen(q)
    alltext.append(outlist)
questions_raw[:5], questions[:5]

100%|█████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 915.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 1440.67it/s]


(['Are there any medicines or therapies that can prevent or cure COVID-19?',
  'Are smokers and tobacco users at higher risk of COVID-19 infection?',
  'Is COVID-19 the same as SARS?',
  'Who is at risk of developing severe illness?',
  'What medical interventions are available for COVID-19 and influenza viruses?'],
 [['ar', 'ther', 'medicin', 'therapy', 'prev', 'cur', 'covid-19'],
  ['ar', 'smok', 'tobacco', 'us', 'high', 'risk', 'covid-19', 'infect'],
  ['covid-19', 'sam', 'sar'],
  ['risk', 'develop', 'sev', 'il'],
  ['med', 'interv', 'ar', 'avail', 'covid-19', 'influenz', 'virus']])

In [45]:
alltext_words = sum(alltext, [])
longest_length = max([len(_) for _ in questions])

# 取无重复的所有词的list
wordlist = list(set(alltext_words))
wordlist.append("<pad>")    # to fill in the blank
wordlist.append("<unk>")    # to represent the unknown word

# save wordlist
with open('WordList.txt', 'w', encoding='utf8') as f:
    for word in wordlist:
        f.write(word)
        f.write('\n')

In [28]:
# Fill in the sentences to max length
padded_questions = []
for q in questions:
    if len(q) < longest_length:
        q.extend(["<pad>"] * (longest_length - len(q)))
    padded_questions.append(q)
padded_questions[:2]

[['ar',
  'ther',
  'medicin',
  'therapy',
  'prev',
  'cur',
  'covid-19',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>'],
 ['ar',
  'smok',
  'tobacco',
  'us',
  'high',
  'risk',
  'covid-19',
  'infect',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>']]

In [33]:
#creare word<->token dic
# word -> token
word_to_token = {word: token for token, word in enumerate(wordlist)}
# token -> word
token_to_word = {token: word for word, token in word_to_token.items()}
word_to_token

{'adult': 0,
 '14': 1,
 'prev': 2,
 '2019': 3,
 'therapy': 4,
 'interv': 5,
 'ev': 6,
 'jump': 7,
 'caus': 8,
 'c': 9,
 'season': 10,
 'throat': 11,
 'howev': 12,
 'mask': 13,
 'dog': 14,
 'anywh': 15,
 'bend': 16,
 'wash': 17,
 'milk': 18,
 'day': 19,
 'right': 20,
 'tissu': 21,
 'kong': 22,
 'urg': 23,
 'fev': 24,
 '2003': 25,
 '6': 26,
 'recov': 27,
 'chin': 28,
 'think': 29,
 'wid': 30,
 'inst': 31,
 '鈥': 32,
 'wild': 33,
 'high': 34,
 'first': 35,
 'dat': 36,
 'suggest': 37,
 'pet': 38,
 'hold': 39,
 'respons': 40,
 'period鈥': 41,
 'rout': 42,
 'food': 43,
 'anyth': 44,
 'discov': 45,
 'reliev': 46,
 'look': 47,
 'poss': 48,
 'aw': 49,
 'direct': 50,
 'unknown': 51,
 'contamin': 52,
 'breastmilk': 53,
 'cas': 54,
 'dist': 55,
 'slight': 56,
 'occ': 57,
 'grad': 58,
 'cough': 59,
 'situ': 60,
 'fiv': 61,
 'piec': 62,
 'tradit': 63,
 'rapid': 64,
 'much': 65,
 'put': 66,
 'unwel': 67,
 'disinfect': 68,
 'hav': 69,
 'ourselv': 70,
 'sneez': 71,
 'city': 72,
 'common': 73,
 'therapeut

### 利用所有文本预训练词向量模型
这里使用FastText模型来预训练。

In [51]:
alltext[:3], len(alltext)

([['avoid', 'touch', 'ey', 'nos', 'mou'],
  ['pres', 'ther', 'evid', 'ar', 'high', 'risk', 'sev', 'il', 'gen', 'pop'],
  ['ar', 'smok', 'tobacco', 'us', 'high', 'risk', 'covid-19', 'infect']],
 125)

解释一下这里使用的FastText模型的参数  
* window：即词向量上下文最大距离，window越大，则和某一词较远的词也会产生上下文关系。默认值为5，在实际使用中，可以根据实际的需求来动态调整这个window的大小。如果是小语料则这个值可以设的更小。我们的实验语料相对较小，用3比较合适。  
* sg：对word2vec两个模型的选择了。如果是0， 则是CBOW模型；是1则是Skip-Gram模型；默认是0即CBOW模型。这里用skip-gram模型来训练。
* min_count: 词语出现频率小于该数量的会被舍弃。这里取1，没有单词会被舍弃。

In [96]:
from gensim.models import FastText

model = FastText(window=3, sg=1, min_count=1)
model.build_vocab(alltext)
model.train(alltext, total_examples=model.corpus_count, epochs=model.iter)
model.save('fasttext.model')  # save model

  """


In [79]:
# 词向量实例
model["adult"]

  


array([-6.2693696e-04, -9.8457863e-04,  9.8339701e-04,  1.6925446e-04,
        1.4094013e-03,  1.8271153e-03, -2.9640394e-04, -1.0704385e-03,
        1.1594658e-03,  5.7776051e-04,  7.5473665e-04, -9.7423053e-04,
       -7.4995006e-04,  1.9680714e-04,  1.3524632e-03, -2.6108832e-03,
        4.6914406e-04, -1.9118749e-03, -2.2244643e-04, -1.1623199e-03,
       -6.1066344e-04, -5.4314069e-04, -7.4876385e-04,  1.9070458e-03,
       -8.4244210e-04, -1.1510347e-04, -5.4259732e-04, -5.8793928e-04,
       -2.6635404e-03, -3.3723423e-03,  6.4604980e-04,  1.5556007e-03,
       -1.3286719e-03, -1.9271809e-04, -1.4150098e-03,  3.4157922e-03,
       -8.3717494e-04, -1.0564235e-03,  2.7153147e-03, -1.4211363e-04,
       -1.1447412e-03, -1.4057418e-03,  7.9186269e-05,  1.2729666e-03,
        2.0843300e-03, -6.0061464e-04, -7.3077006e-04, -1.3162442e-03,
       -2.0611386e-03, -2.1279163e-03,  1.1862952e-03,  7.0707341e-05,
        2.6898419e-03,  2.5865627e-03,  1.1982427e-03, -2.5961166e-03,
      

In [95]:
if 'hshs' not in model.wv.index2word:
    print('no')

no


### 实现QA机器人
一般的QA机器人需要预训练一个文本相似度模型，比如ESIM。然而，我们收集到的数据仅仅有问题-答案对，相似/不相似句子对(比如句子1，句子2相似或者不相似)，属于无监督数据，我们难以用深度学习模型训练一个文本相似度模型。  
对于无监督文本，目前较为常见的方法就是预训练一个词向量模型，将文本通过词向量模型转化为句向量。本次实验采用的方法是将句子通过预训练好的FastText模型得到每个词的词向量，然后以TF-IDF作为权重加和，从而得到句向量。机器人获得了用户的query之后，将query转换为句向量，与问答库中的问题的句向量计算余弦距离，取最高者的答案回复。若余弦距离均不超过  ，则返回“没有该问题”。

由于jupyter很难实现交互，因此机器人的主程序在chatbot.py中