In [None]:
import numpy as np
import pandas as pd
import spacy
import cython
import math
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import euclidean, cosine, canberra, correlation

In [None]:
# Column names of the semantic-feature table, in the same order that
# similar() returns its values.
sem_list = [
    'euclidean',
    'cosine',
    'cosine_angle',
    'canberra',
    'correlation',
]
# Empty frame with one column per feature; the feature loop fills it in.
df_sem = pd.DataFrame(columns=sem_list)

In [None]:
# Half-open row window [START_INDEX, END_INDEX) that is sliced out of the
# loaded CSV before any features are computed.
START_INDEX = 0
END_INDEX = 500000
# One feature per column of the (still empty) feature table.
NUMBER_OF_FEATURES = df_sem.shape[1]

In [None]:
# Progress marker for the data-loading step.
# NOTE(review): the message says TRAINING, but the loading cell reads
# ../Data/test_cleaned.csv -- confirm which split is intended.
print('LOADING TRAINING DATA')

In [None]:
# NOTE(review): the variable is called `train_data`, yet the file read here is
# the *test* split. Point the path at ../Data/train_cleaned.csv to process the
# training split instead -- confirm which one is intended.
train_data = pd.read_csv("../Data/test_cleaned.csv")

# Keep only the configured row window. The slice preserves the original index
# labels. fillna() returns a new frame instead of mutating a slice of
# `train_data` in place, which avoids pandas' SettingWithCopyWarning.
# Missing questions are replaced by a placeholder string so that every cell
# holds text.
df = train_data[START_INDEX:END_INDEX].fillna('NO QUESTION')
print('LOADING DONE ')

In [None]:
# Progress marker printed before the language model is loaded.
# NOTE(review): the message says WORDNET, but the following cell loads the
# spaCy 'en_core_web_lg' model via spacy.load().
print('LOADING WORDNET')

In [None]:
# Load the spaCy 'en_core_web_lg' pipeline. sent2vec() calls `nlp` on each
# question and reads the per-token `.vector` from the resulting doc.
nlp = spacy.load('en_core_web_lg')
# NOTE(review): the message says WORDNET although a spaCy model was loaded.
print('WORDNET LOADED')

In [None]:
# Progress marker for the TF-IDF fitting step.
print("PERFORMING TFIDF ANALYSIS")

In [None]:
# Fit a single TF-IDF vocabulary over both question columns so that a word
# gets the same IDF weight regardless of which side of a pair it appears on.
questions = list(df['question1']) + list(df['question2'])
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted vocabulary and IDF weights are used below, so fit() is
# enough; the document-term matrix that fit_transform() builds was discarded.
tfidf.fit(questions)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Map each vocabulary term to its IDF weight (looked up per token in sent2vec).
word2tfidf = dict(zip(feature_names, tfidf.idf_))
print('TFIDF ANALYSIS DONE')

In [None]:
def normalize(vec):
    """Scale ``vec`` to unit Euclidean length.

    A vector whose norm is exactly zero cannot be scaled and is returned
    unchanged (the very same object, not a copy).
    """
    magnitude = np.linalg.norm(vec)
    return vec / magnitude if magnitude != 0 else vec

In [None]:
def _weighted_question_vector(text):
    """Return the unit-length, IDF-weighted sum of the token vectors of ``text``."""
    # 300 matches the width of the accumulator this code has always used, so
    # the token vectors are expected to be 300-dimensional.
    total = np.zeros(300)
    for token in nlp(text):
        # Tokens missing from the TF-IDF vocabulary get weight 0 and therefore
        # do not contribute to the sentence vector.
        total += token.vector * word2tfidf.get(str(token), 0)
    return normalize(total)


def sent2vec(q1, q2):
    """Embed two questions as IDF-weighted sentence vectors.

    Each question is tokenized with ``nlp``; every token vector is scaled by
    the token's IDF weight from ``word2tfidf`` and the scaled vectors are
    summed. The sum is then scaled to unit length with ``normalize``.

    The previous implementation accumulated into an ``(n_tokens, 300)`` matrix
    and averaged its rows. Every row held the same running sum, so the result
    was that sum at n times the cost. For a question with no tokens it took
    the mean of zero rows, yielding a NaN vector and a RuntimeWarning.

    Parameters
    ----------
    q1, q2 : str
        The two question texts.

    Returns
    -------
    tuple of numpy.ndarray
        Two vectors of shape ``(300,)``. A question with no tokens, or whose
        tokens are all unknown to the TF-IDF vocabulary, yields the zero
        vector.
    """
    return _weighted_question_vector(q1), _weighted_question_vector(q2)

In [None]:
def similar(q1, q2):
    """Compute five distance features between the sentence vectors of two questions.

    Parameters
    ----------
    q1, q2 : str
        The two question texts; they are embedded with ``sent2vec``.

    Returns
    -------
    tuple
        ``(euclidean, cosine, cosine_angle, canberra, correlation)`` where

        * ``cosine`` is scipy's cosine *distance* (``1 - cosine similarity``,
          so it lies in ``[0, 2]``), and
        * ``cosine_angle`` is the angle between the two vectors in degrees
          (``0`` for identical directions, ``180`` for opposite ones).

    Notes
    -----
    The angle used to be computed as ``acos(cosine distance)``. That is not
    the angle between the vectors, and it raised ``ValueError`` (math domain
    error) as soon as the distance exceeded 1, i.e. for any pair with negative
    cosine similarity. The distance is now converted back to a similarity
    first and clipped to ``[-1, 1]`` to absorb floating-point overshoot.
    ``np.clip`` is used because it lets NaN pass through, so undefined inputs
    still yield NaN rather than a made-up angle.
    """
    v1, v2 = sent2vec(q1, q2)
    cos_dist = cosine(v1, v2)
    cos_sim = float(np.clip(1.0 - cos_dist, -1.0, 1.0))
    angle_deg = math.degrees(math.acos(cos_sim))
    return euclidean(v1, v2), cos_dist, angle_deg, canberra(v1, v2), correlation(v1, v2)

In [None]:
# Compute the semantic features for every question pair in `df`.
# The rows are collected in a plain list and the DataFrame is built once at
# the end. Appending with `df_sem.loc[len(df_sem)] = ...` re-allocates the
# frame on every row, which is very slow for hundreds of thousands of pairs.
# Iterating the two columns in lockstep also avoids the chained label lookup
# `df[col][i]`. Re-running this cell rebuilds `df_sem` instead of appending
# duplicate rows to it.
feature_rows = [
    similar(q1, q2)
    for q1, q2 in tqdm(
        zip(df['question1'], df['question2']),
        total=len(df),
        desc='CREATING SEMANTIC FEATURES',
    )
]
# Same column names as the existing (empty) frame; rows are numbered 0..n-1.
df_sem = pd.DataFrame(feature_rows, columns=df_sem.columns)