In [26]:
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

In [2]:
%run scripts/helper.py

In [3]:
crowd_train = load_file('./data/train.csv/train.csv', None)

In [4]:
# General text related features
# 1. Text length of the title

In [5]:
crowd_train.columns

Index([u'id', u'query', u'product_title', u'product_description', u'median_relevance', u'relevance_variance'], dtype='object')

In [6]:
crowd_train['title_length'] = crowd_train.apply(lambda x: len(x['product_title']), axis=1)

In [8]:
# lets see how correlated this feature is with response variable
crowd_train[['title_length', 'median_relevance']].corr()

Unnamed: 0,title_length,median_relevance
title_length,1.0,0.03098
median_relevance,0.03098,1.0


In [9]:
# Not a high correlation

In [10]:
# 2. Number of words in the title
crowd_train['num_words_title'] = crowd_train.apply(lambda x: len(x['product_title'].split(' ')), axis=1)

In [11]:
crowd_train[['num_words_title', 'median_relevance']].corr()

Unnamed: 0,num_words_title,median_relevance
num_words_title,1.0,0.019942
median_relevance,0.019942,1.0


In [12]:
# 3. Number of words in the prodcut description
crowd_train['num_words_desc'] = crowd_train.apply(lambda x: len(x['product_description']), axis=1)

In [13]:
crowd_train[['num_words_desc', 'median_relevance']].corr()

Unnamed: 0,num_words_desc,median_relevance
num_words_desc,1.0,-0.009985
median_relevance,-0.009985,1.0


In [31]:
# 4. Ratio of words in query other than stopwords that match in the title and description
def f(x):
    query = x['query'].lower()
    title = x['product_title'].lower()
    desc = x['product_description'].lower()
    stop = stopwords.words('english')
    total_words = len(title.split(' ')) + len(desc.split(' '))
    count = 0
    
    unique_query_terms = list(set(query.split(' ')))
    for q in unique_query_terms:
        if q not in stop:
            if q in title or q in desc:
                count += 1
    
    return (count * 1.) / total_words

crowd_train['ratio_query_terms_in_res'] = crowd_train.apply(f, axis=1)

In [32]:
crowd_train[['ratio_query_terms_in_res', 'median_relevance']].corr()

Unnamed: 0,ratio_query_terms_in_res,median_relevance
ratio_query_terms_in_res,1.0,0.158718
median_relevance,0.158718,1.0


In [5]:
# Jaccard distance between query and (title + description)
def jaccard(x):
    query = x['query'].lower()
    title = x['product_title'].lower()
    description = x['product_description'].lower()
    response = title + ' ' + description
    
    query_set = set(query.split(' '))
    response_set = set(response.split(' '))
    
    query_response_intersection_len = len(query_set & response_set)
    query_response_union_len = len(query_set | response_set)
    
    return (query_response_intersection_len * 1.) / (query_response_union_len)

crowd_train['jaccard_dist'] = crowd_train.apply(jaccard, axis=1)

In [6]:
# lets how much this variable is correlated with distance
crowd_train[['jaccard_dist', 'median_relevance']].corr()

Unnamed: 0,jaccard_dist,median_relevance
jaccard_dist,1.0,0.181246
median_relevance,0.181246,1.0


In [7]:
crowd_train.jaccard_dist.head()

0    0.000000
1    0.025641
2    0.166667
3    0.020833
4    0.051282
Name: jaccard_dist, dtype: float64

In [19]:
# Check if query term in response
def is_query_in_response(train):
    query_terms = train['query'].split(' ')
    response = train['product_title'] + ' ' + train['product_description']
    
    stemmer = PorterStemmer()
    query_terms_stemmed = [stemmer.stem(q) for q in query_terms]
    response_stemmed = ''.join([stemmer.stem(r) for r in response])
    stop = stopwords.words('english')
       
    keyword = False
    
    for q in query_terms_stemmed:
        if q not in stop:
            keyword = True
            if response_stemmed.lower().find(q) == -1:
                return 0
    
    if keyword == False:
        return 0
    else:
        return 1

crowd_train['query_in_response'] = crowd_train.apply(is_query_in_response, axis=1)

In [20]:
crowd_train[['query_in_response', 'median_relevance']].corr()

Unnamed: 0,query_in_response,median_relevance
query_in_response,1.0,0.373949
median_relevance,0.373949,1.0


In [23]:
# lets find out how many query terms found in response
def count_query_terms_in_response(train):
    query_terms = train['query'].split(' ')
    unique_terms = list(set(query_terms))
    response = train['product_title'].lower() + ' ' + train['product_description'].lower()
    stemmer = PorterStemmer()
    query_terms_stemmed = [stemmer.stem(q) for q in unique_terms]
    response_stemmed = ''.join([stemmer.stem(r) for r in response])
    stop = stopwords.words('english')
    
    
    count = 0
    for q in query_terms_stemmed:
        if q not in stop:
            if response_stemmed.find(q) != -1:
                count += 1
    return count

crowd_train['num_terms_in_resp'] = crowd_train.apply(count_query_terms_in_response, axis=1)

In [24]:
crowd_train[['num_terms_in_resp', 'median_relevance']].corr()

Unnamed: 0,num_terms_in_resp,median_relevance
num_terms_in_resp,1.0,0.284113
median_relevance,0.284113,1.0


In [61]:
def lch_similarity(x):
    query = x['query'].lower()
    response = x['product_title'].lower() + ' ' + x['product_description'].lower()
    stop = stopwords.words('english')
    total = 0
    
    for q in query.split(' '):
        if q not in stop:
            query_noun = wn.synsets(q, pos=wn.NOUN)
            if len(query_noun) > 0:
                for r in response.split(' '):
                    if r not in stop:
                        synonyms = wn.synsets(r, pos=wn.NOUN)
                        if len(synonyms) > 0:
                            total += query_noun[0].lch_similarity(synonyms[0])
    
    return total

crowd_train['lch_similarity'] = crowd_train.apply(lch_similarity, axis=1)

In [63]:
crowd_train[['lch_similarity', 'median_relevance']].corr()

Unnamed: 0,lch_similarity,median_relevance
lch_similarity,1.0,0.004542
median_relevance,0.004542,1.0
