# 1. Creating the Data
## Simulation of Problem and Advice Pairs
If match equals 1, the advice/answer belongs to the problem. If match equals 0, the answer does not respond to the problem.

This is a small dummy data set.

In [9]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pegah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [116]:
# Dummy data set: each record is (problem, advice, match).
# match == 1 -> the advice answers the problem, match == 0 -> it does not.
# The frame is built in a single call instead of being grown row by row
# with `.loc[n] = [...]`, so the `match` column keeps an integer dtype.
pairs = [
    ("I drink too much beer", "Drink a non-alcoholic beverage between two drinks.", 1),
    ("I don't exercise enough", "Find friends with whom you can do the sport together!", 1),
    ("I am overweighted", "Follow a strict diet", 1),
    ("I am always cold", "Wear warmer cloths", 1),
    ("I cant focus", "Try mindfulness", 1),
    ("I feel I have a depression", "A dog said yes once", 0),
    ("I drink too much beer", "fish are nice", 0),
    ("I am overweighted", "sometimes it rains, sometimes the sun is out", 0),
    ("I don't exercise enough", "try to relax more often", 0),
    ("I cant focus", "best tv show ever", 0),
    ("I am overweighted", "china is a country", 0),
    ("I am overweighted", "japan is a country", 0),
    ("I am not in a good shape. I don't feel fit", "Follow a strict diet", 1),
    ("I am freezing always", "Wear warmer cloths", 1),
    ("I have anxiety", "Try mindfulness", 1),
    ("i dont do sports", "find a fitness trainer", 1),
    ("i have stress", "try to relax more often", 1),
    ("I always feel nervousness. ", "try to relax more often", 1),
    ("I don't exercise enough", "some singers are blond", 0),
    ("I cant focus", "yesterday or tomorrow the stock market is unusual", 0),
    ("I am overweighted", "flowers have many colors", 0),
]
test = pd.DataFrame(pairs, columns=["problem", "advice", "match"])
test

Unnamed: 0,problem,advice,match
0,I drink too much beer,Drink a non-alcoholic beverage between two dri...,1
1,I don't exercise enough,Find friends with whom you can do the sport to...,1
2,I am overweighted,Follow a strict diet,1
3,I am always cold,Wear warmer cloths,1
4,I cant focus,Try mindfulness,1
5,I feel I have a depression,A dog said yes once,0
6,I drink too much beer,fish are nice,0
7,I am overweighted,"sometimes it rains, sometimes the sun is out",0
8,I don't exercise enough,try to relax more often,0
9,I cant focus,best tv show ever,0


# 2. Natural Language Pre-Processing

Define functions for splitting, deleting punctuation, word selection and stemming.

Stemming is not used for the features below, but it can be useful if word embeddings are trained on the data itself.

In [117]:
def pre_processing(df, columns=["problem", "advice"]):
    '''Add cleaned-text columns to ``df`` in place (returns None).

    For every name in ``columns`` three columns are created:
      * ``<name>_pure``  - lower-cased text with every non-letter replaced by
        a space and English stop words removed, joined by single spaces,
      * ``<name>_split`` - the same words as a list,
      * ``<name>_split_nouns_and_verbs`` - the nouns and verbs of
        ``<name>_pure`` as returned by ``get_nouns_and_verbs``.

    Finally ``both_split`` and ``both_split_nouns_and_verbs`` concatenate the
    problem and advice lists, so ``columns`` must contain both "problem" and
    "advice".'''
    # Build the stop-word set once; the original rebuilt it for every word.
    stop_words = set(stopwords.words('english'))

    def clean(text):
        # non-letters -> spaces, lower-case, split on whitespace, drop stop words
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        return [word for word in words if word not in stop_words]

    for column in columns:
        tokens = df[column].map(clean)
        # Column creation order is kept (pure, split, nouns/verbs) because
        # later cells address columns by position.
        df[column+"_pure"] = tokens.map(' '.join)
        df[column+"_split"] = tokens
        df[column+"_split_nouns_and_verbs"] = df[column+"_pure"].apply(get_nouns_and_verbs)
    df["both_split"]=df.apply(lambda x: x.problem_split + x.advice_split, axis=1)
    df["both_split_nouns_and_verbs"]=df.apply(lambda x: x.problem_split_nouns_and_verbs + x.advice_split_nouns_and_verbs, axis=1)

def stemming(df, columns=["problem", "advice"]):
    '''Add stemmed-text columns to ``df`` in place (returns None).

    For every name in ``columns`` two columns are created:
      * ``<name>_stemming``   - lower-cased text with every non-letter replaced
        by a space, English stop words removed and the remaining words
        Porter-stemmed, joined by single spaces,
      * ``<name>_split_stem`` - ``<name>_stemming`` split back into a list.'''
    # Create the stemmer and the stop-word set once instead of per column / per word.
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    def clean_and_stem(text):
        # non-letters -> spaces, lower-case, split on whitespace
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        # stop words are filtered on the unstemmed word, then the rest is stemmed
        return ' '.join(ps.stem(word) for word in words if word not in stop_words)

    for column in columns:
        df[column+"_stemming"] = df[column].map(clean_and_stem)
        df[column+"_split_stem"] = df[column+"_stemming"].apply(lambda x: x.split())

### Nouns and Verbs only
This function filters the text for nouns and verbs only. These word types seem the most useful for the task of understanding whether an answer responds to a concern.

In [118]:
def get_nouns_and_verbs(lines):
    '''Return the nouns and verbs of ``lines`` as a list of words.

    ``lines`` is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; a word is kept when its tag starts with "NN" (nouns)
    or "VB" (verbs, any tense).'''
    # NOTE(review): word_tokenize / pos_tag presumably need NLTK resources that
    # the notebook does not download explicitly - confirm on a fresh install.
    tagged_words = nltk.pos_tag(nltk.word_tokenize(lines))
    return [token for token, tag in tagged_words if tag.startswith(("NN", "VB"))]

In [119]:
# Run both text-cleaning passes on the dummy data.
# `df` is an alias of `test` (no copy), so the new columns are added to `test` too.
columns = ["problem", "advice"]
df = test
pre_processing(df, columns=columns)
stemming(df, columns=columns)
df.head(3)

Unnamed: 0,problem,advice,match,problem_pure,problem_split,problem_split_nouns_and_verbs,advice_pure,advice_split,advice_split_nouns_and_verbs,both_split,both_split_nouns_and_verbs,problem_stemming,problem_split_stem,advice_stemming,advice_split_stem
0,I drink too much beer,Drink a non-alcoholic beverage between two dri...,1,drink much beer,"[drink, much, beer]","[drink, beer]",drink non alcoholic beverage two drinks,"[drink, non, alcoholic, beverage, two, drinks]","[drink, beverage, drinks]","[drink, much, beer, drink, non, alcoholic, bev...","[drink, beer, drink, beverage, drinks]",drink much beer,"[drink, much, beer]",drink non alcohol beverag two drink,"[drink, non, alcohol, beverag, two, drink]"
1,I don't exercise enough,Find friends with whom you can do the sport to...,1,exercise enough,"[exercise, enough]","[exercise, enough]",find friends sport together,"[find, friends, sport, together]","[find, friends, sport]","[exercise, enough, find, friends, sport, toget...","[exercise, enough, find, friends, sport]",exercis enough,"[exercis, enough]",find friend sport togeth,"[find, friend, sport, togeth]"
2,I am overweighted,Follow a strict diet,1,overweighted,[overweighted],[overweighted],follow strict diet,"[follow, strict, diet]","[follow, diet]","[overweighted, follow, strict, diet]","[overweighted, follow, diet]",overweight,[overweight],follow strict diet,"[follow, strict, diet]"


### Feature Creation
1. Calculating the word embeddings using pre-trained FastText word embeddings (trained on Wikipedia, with a vocabulary size of one million, embedded into 300 dimensions).
2. Subtracting the word-embedding vector for the advice from the word-embedding vector for the problem element-wise
3. Create features of the difference regarding every dimension

In [30]:
import gensim



### Important
You have to download the ['wiki-news-300d-1M.vec'](https://fasttext.cc/docs/en/english-vectors.html) for this.

In [31]:
# Load the pre-trained word vectors from the word2vec-format file
# 'wiki-news-300d-1M.vec'. The path is relative, so the downloaded file has to
# be in the notebook's working directory.
# This takes some time
from gensim.models import KeyedVectors
model2 = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
# Impressive example of word embedding
# These are the words most similar to the word "sport"
# (printed as (word, similarity score) pairs)
print(model2.most_similar('sport'))

[('sports', 0.8248535990715027), ('football', 0.6988055109977722), ('Sport', 0.6917921304702759), ('sport-', 0.6783909797668457), ('athletics', 0.675710916519165), ('sportsman', 0.6616963148117065), ('soccer', 0.6516904830932617), ('sporting', 0.6513649821281433), ('boxing', 0.6464983820915222), ('Sports', 0.6409319043159485)]


### Calculate the "distance" between sentences

In [120]:
def list_sub(list1,list2):
    '''Subtract ``list2`` from ``list1`` element-wise and return the
    differences as a list.

    The inputs are paired with ``zip``, so if the lengths differ the result is
    truncated to the shorter input. Errors (e.g. a non-iterable argument or
    elements that cannot be subtracted) propagate to the caller; the previous
    bare ``except: pass`` turned them into an unrelated ``UnboundLocalError``
    at the ``return``.'''
    return [a - b for a, b in zip(list1, list2)]

### Examples of Distances between words

In [121]:
# Element-wise differences between the embedding of "diet" and two other words.
# Descriptive names are used so that earlier notebook variables (e.g. `d`)
# are not overwritten.
diet_minus_nutrition = list_sub(model2["diet"], model2["nutrition"])
diet_minus_stone = list_sub(model2["diet"], model2["stone"])
# The "distance" is the sum of the absolute element-wise differences.
print('The distance between the word "diet" and the word "nutrition" is', sum(map(abs, diet_minus_nutrition)))
print('The distance between the word "diet" and the word "stone" is', sum(map(abs, diet_minus_stone)))

The distance between the word "diet" and the word "nutrition" is 25.140400115400553
The distance between the word "diet" and the word "stone" is 34.320799998007715


### Create sentence embeddings

In [122]:
# taken from https://ai.intelligentonlinetools.com/ml/text-vectors-word-embeddings-word2vec/
def sent_vectorizer(sent, model):
    '''Average the word embeddings of a sentence into one sentence embedding.

    Parameters
    ----------
    sent : iterable of str
        The words of the sentence.
    model : mapping-like
        Anything supporting ``model[word]`` that raises ``KeyError`` for
        unknown words (e.g. the loaded keyed vectors).

    Returns
    -------
    numpy.ndarray
        The mean of the vectors of all words found in ``model``. Words missing
        from ``model`` are skipped. If no word is found, an empty array of
        shape (0,) is returned.'''
    sent_vec = []
    numw = 0
    for w in sent:
        try:
            word_vec = model[w]
        except KeyError:
            # Out-of-vocabulary word: skip it. Only KeyError is caught so that
            # real failures (and KeyboardInterrupt) are no longer swallowed.
            continue
        sent_vec = word_vec if numw == 0 else np.add(sent_vec, word_vec)
        numw += 1

    if numw == 0:
        # No known word: return the empty array explicitly instead of relying
        # on the result of dividing an empty array by zero.
        return np.asarray(sent_vec, dtype=float)
    return np.asarray(sent_vec) / numw

In [123]:
def avg_sent(df,model):
    '''Add sentence embeddings built from all (non-stop) words.

    Writes "avg_sentence_c_pt" (from "problem_split") and "avg_sentence_a_pt"
    (from "advice_split") into ``df`` in place; returns None.'''
    def embed(words):
        return sent_vectorizer(words, model)

    for target, source in (("avg_sentence_c_pt", "problem_split"),
                           ("avg_sentence_a_pt", "advice_split")):
        df[target] = df[source].apply(embed)
    
def avg_sent_nv(df,model):
    '''Add sentence embeddings built from nouns and verbs only.

    Writes "avg_sentence_c_pt_nv" (from "problem_split_nouns_and_verbs") and
    "avg_sentence_a_pt_nv" (from "advice_split_nouns_and_verbs") into ``df``
    in place; returns None.'''
    def embed(words):
        return sent_vectorizer(words, model)

    for target, source in (("avg_sentence_c_pt_nv", "problem_split_nouns_and_verbs"),
                           ("avg_sentence_a_pt_nv", "advice_split_nouns_and_verbs")):
        df[target] = df[source].apply(embed)
    
# Add both kinds of sentence embeddings (all words, nouns/verbs only) to `df` in place.
avg_sent(df, model=model2)
avg_sent_nv(df, model=model2)

In [124]:
# Inspect the first rows, which now include the four avg_sentence_* embedding columns.
df.head(2)

Unnamed: 0,problem,advice,match,problem_pure,problem_split,problem_split_nouns_and_verbs,advice_pure,advice_split,advice_split_nouns_and_verbs,both_split,both_split_nouns_and_verbs,problem_stemming,problem_split_stem,advice_stemming,advice_split_stem,avg_sentence_c_pt,avg_sentence_a_pt,avg_sentence_c_pt_nv,avg_sentence_a_pt_nv
0,I drink too much beer,Drink a non-alcoholic beverage between two dri...,1,drink much beer,"[drink, much, beer]","[drink, beer]",drink non alcoholic beverage two drinks,"[drink, non, alcoholic, beverage, two, drinks]","[drink, beverage, drinks]","[drink, much, beer, drink, non, alcoholic, bev...","[drink, beer, drink, beverage, drinks]",drink much beer,"[drink, much, beer]",drink non alcohol beverag two drink,"[drink, non, alcohol, beverag, two, drink]","[-0.0791, -0.06826667, 0.027566666, 0.02183333...","[-0.016916666, -0.065866664, 0.021750003, 0.06...","[-0.095649995, -0.086399995, 0.008300001, 0.05...","[-0.036166668, -0.031600002, 0.05066667, 0.049..."
1,I don't exercise enough,Find friends with whom you can do the sport to...,1,exercise enough,"[exercise, enough]","[exercise, enough]",find friends sport together,"[find, friends, sport, together]","[find, friends, sport]","[exercise, enough, find, friends, sport, toget...","[exercise, enough, find, friends, sport]",exercis enough,"[exercis, enough]",find friend sport togeth,"[find, friend, sport, togeth]","[-0.084750004, -0.052950002, -0.09095, 0.0191,...","[-0.061075002, -0.0038499986, 0.043325003, 0.0...","[-0.084750004, -0.052950002, -0.09095, 0.0191,...","[-0.091066666, -0.0054666647, 0.05796667, 0.02..."


In [125]:
def feature_creator(df,size=300,only_noun_and_verbs=True):
    '''Turn the element-wise difference of the sentence embeddings into features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "avg_sentence_c_pt_nv"/"avg_sentence_a_pt_nv" (when
        ``only_noun_and_verbs`` is true) or "avg_sentence_c_pt"/
        "avg_sentence_a_pt" (otherwise). ``df`` itself is not modified.
    size : int
        Number of embedding dimensions to expose as feature columns.
    only_noun_and_verbs : bool
        Selects which pair of embedding columns is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra column holding the difference vector
        ("distance_pt_nv" or "distance_pt"), followed by ``size`` columns
        named "0" ... str(size-1), one per dimension of that difference.

    Raises
    ------
    IndexError
        If a difference vector has fewer than ``size`` entries (for example
        because a sentence contained no word known to the embedding model).'''
    dfx = df.copy(deep=False)
    # "pt" marks the pre-trained embeddings; "_nv" the noun/verb-only variant.
    tag = "pt_nv" if only_noun_and_verbs else "pt"
    concern_col = "avg_sentence_c_" + tag
    advice_col = "avg_sentence_a_" + tag
    distance_col = "distance_" + tag

    dfx[distance_col] = dfx.apply(lambda row: list_sub(row[concern_col], row[advice_col]), axis=1)

    distances = dfx[distance_col]
    for row_label, vec in distances.items():
        if len(vec) < size:
            raise IndexError(
                "difference vector of row %r has %d entries, but size=%d were requested"
                % (row_label, len(vec), size))

    # Build all feature columns at once instead of inserting them one by one,
    # which is slow and fragments the frame.
    feature_names = [str(i) for i in range(size)]
    values = np.array([list(vec[:size]) for vec in distances], dtype=float)
    features = pd.DataFrame(values.reshape(len(distances), size),
                            index=dfx.index, columns=feature_names)

    # If the input already carries feature columns, replace them rather than
    # creating duplicate column names.
    stale = [name for name in feature_names if name in dfx.columns]
    if stale:
        dfx = dfx.drop(columns=stale)
    return pd.concat([dfx, features], axis=1)

In [126]:
# Feature frame based on the noun/verb embeddings (only_noun_and_verbs defaults to True).
df2 = feature_creator(df,size=300)
df2.head(3)

Unnamed: 0,problem,advice,match,problem_pure,problem_split,problem_split_nouns_and_verbs,advice_pure,advice_split,advice_split_nouns_and_verbs,both_split,...,290,291,292,293,294,295,296,297,298,299
0,I drink too much beer,Drink a non-alcoholic beverage between two dri...,1,drink much beer,"[drink, much, beer]","[drink, beer]",drink non alcoholic beverage two drinks,"[drink, non, alcoholic, beverage, two, drinks]","[drink, beverage, drinks]","[drink, much, beer, drink, non, alcoholic, bev...",...,0.00585,-0.023767,-0.018083,0.095133,0.0897,-0.009617,-0.0542,0.042933,0.06325,-0.052583
1,I don't exercise enough,Find friends with whom you can do the sport to...,1,exercise enough,"[exercise, enough]","[exercise, enough]",find friends sport together,"[find, friends, sport, together]","[find, friends, sport]","[exercise, enough, find, friends, sport, toget...",...,-0.1053,0.013617,-0.011767,-0.023933,-0.053433,0.030083,0.0821,0.090517,-0.003083,-0.087217
2,I am overweighted,Follow a strict diet,1,overweighted,[overweighted],[overweighted],follow strict diet,"[follow, strict, diet]","[follow, diet]","[overweighted, follow, strict, diet]",...,0.14345,-0.06735,-0.11065,0.06765,-0.05335,-0.19145,0.10595,-0.03505,-0.08355,-0.02355


# 3. Prediction

In [128]:
# Features start from column 20
# (position 19 is the distance column; "0", "1", ... are the per-dimension features)
print(df2.columns[:25])
print(df2.columns[20:35])

Index(['problem', 'advice', 'match', 'problem_pure', 'problem_split',
       'problem_split_nouns_and_verbs', 'advice_pure', 'advice_split',
       'advice_split_nouns_and_verbs', 'both_split',
       'both_split_nouns_and_verbs', 'problem_stemming', 'problem_split_stem',
       'advice_stemming', 'advice_split_stem', 'avg_sentence_c_pt',
       'avg_sentence_a_pt', 'avg_sentence_c_pt_nv', 'avg_sentence_a_pt_nv',
       'distance_pt_nv', '0', '1', '2', '3', '4'],
      dtype='object')
Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14'],
      dtype='object')


In [112]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

def match_pred(dfx, seed):
    '''Fit a logistic regression on one random 50/50 split and return its
    accuracy on the held-out half.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Output of ``feature_creator``: a "match" label column plus the
        per-dimension feature columns named "0", "1", ...
    seed : int
        ``random_state`` of the train/test split.

    Returns
    -------
    float
        Mean accuracy on the test half.'''
    y = dfx["match"]
    # Select the embedding features by name (all-digit column labels) instead
    # of the fragile positional slice iloc[:, 20:], which silently breaks as
    # soon as a column is added or removed upstream.
    feature_cols = [col for col in dfx.columns if str(col).isdigit()]
    X = dfx[feature_cols]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = seed)
    log = LogisticRegression(solver='liblinear')#regularization is applied by default
    log.fit(X_train, y_train)
    return log.score(X_test, y_test)

### Average test accuracy for 1000 random train-test-splits

In [135]:
# null rate: mean of the 0/1 `match` column, i.e. the share of matching pairs
df.match.mean()

0.5238095238095238

#### A) Only using nouns and verbs

In [136]:
# Test accuracy for 1000 differently seeded train/test splits, then their mean.
acc = [match_pred(df2, seed) for seed in range(1000)]
mean_acc = sum(acc) / len(acc)
mean_acc

0.597

#### B) Using all word types

In [137]:
# Feature frame based on the embeddings of all word types (not only nouns and verbs).
df3 = feature_creator(df,size=300,only_noun_and_verbs=False)

In [138]:
# Test accuracy for 1000 differently seeded train/test splits, then their mean.
acc = [match_pred(df3, seed) for seed in range(1000)]
mean_acc = sum(acc) / len(acc)
mean_acc

0.6040909090909102

### Evaluation
Given the extremely small data set (n=21) and the short sentences, this is a promising first result. The null rate is 52 % and we can see an improvement of around 8 percentage points.

In this very small case, using all word types is slightly superior.