In [1]:
from itertools import permutations, repeat
import nltk
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import wordnet as wn
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
from gensim.models import word2vec

1. Modelos bag of words
2. Pre-processamento com lowercasing, stemming e remoção de caracteres não alfa-numericos

In [2]:
def correct_text(s):
    
    if isinstance(s, str):
        s = s.lower()
        s = s.replace(".", ". ")
        for s1 in range(0, 10):
            s = s.replace(". " + str(s1), "." + str(s1))
        
    
    s = s.lower()
    s = s.replace(" in.","in.")
    s = s.replace(" inch","in.")
    s = s.replace("inch","in.")
    s = s.replace(" in ","in. ")
    s = s.replace(" ft ","ft. ")
    s = s.replace(" ft.","ft.")
    s = s.replace(" foot","ft.")
    s = s.replace(" feet","ft.")
    s = s.replace("foot","ft.")
    s = s.replace("feet","ft.")
    s = s.replace(" ft ","ft. ")
    s = s.replace(" gallon ","gal. ")
    s = s.replace("gallon","gal.")
    s = s.replace(" oz.","oz.")
    s = s.replace(" ounce","oz.")
    s = s.replace("ounce","oz.")
    s = s.replace(" oz ","oz. ")
    s = s.replace(" cm.","cm.")
    s = s.replace(" cm ","cm. ")
    s = s.replace('H x', 'height')
    s = s.replace('sq.', 'square')
    s = s.replace('cu.', 'cubic')
    s = s.replace('lbs.', 'pounds')
    s = s.replace('W x', 'width')
    s = s.replace('H x', 'height')
    s = s.replace('Ah ', 'amphere')
    
    return s

def clean_text(text):
    text = correct_text(text)
    # Remove '&nbsp;' from the text content before HTML tags strip off.
    text.replace('&nbsp;', ' ')
    # Remove HTML tags
    text = BeautifulSoup(text, "lxml").get_text(separator=" ")
    # Replace all punctuation and special characters by space
    text.replace("[ &<>)(_,.;:!?/-]+", " ")
    # Remove the apostrophe's
    text.replace("'s\\b", "")
    # Remove the apostrophe
    text.replace("[']+", "")
    # Remove the double quotes
    text.replace("[\"]+", "")
    # Convert to lower case, split into individual words
    words = text.lower().split()
    return( " ".join( words ))
 

In [3]:
train_data = pd.read_csv('../data/train.csv', encoding="ISO-8859-1")
test_data = pd.read_csv('../data/test.csv', encoding="ISO-8859-1")
attribute_data = pd.read_csv('../data/attributes.csv')
descriptions = pd.read_csv('../data/product_descriptions.csv')

train_data = pd.merge(train_data, descriptions, on="product_uid", how="left")
test_data = pd.merge(test_data, descriptions, on="product_uid", how="left")

product_count = pd.DataFrame(pd.Series(train_data.groupby(["product_uid"]).size(), name="product_count"))
product_count = pd.DataFrame(pd.Series(test_data.groupby(["product_uid"]).size(), name="product_count"))

In [4]:
def clean(sentence):
    
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    tokens = tokenizer.tokenize(sentence.lower())
    tokens = [token for token in tokens if token not in english_sw]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [5]:
english_sw = stopwords.words('english')


In [6]:
train_data['product_description'] = train_data.apply(lambda x: clean_text(x['product_description']),axis=1)
train_data['product_title'] = train_data.apply(lambda x: clean_text(x['product_title']),axis=1)
train_data['search_term'] = train_data.apply(lambda x: clean_text(x['search_term']),axis=1)

In [7]:
test_data['product_description'] = test_data.apply(lambda x: clean_text(x['product_description']),axis=1)
test_data['product_title'] = test_data.apply(lambda x: clean_text(x['product_title']),axis=1)
test_data['search_term'] = test_data.apply(lambda x: clean_text(x['search_term']),axis=1)

In [8]:
train_data['description_tokens'] = train_data.apply(lambda x: word_tokenize(x['product_description']), axis=1)
train_data['description_tokens_clean'] = train_data.apply(lambda x: clean(x['product_description']), axis=1)

In [9]:
test_data['description_tokens'] = test_data.apply(lambda x: word_tokenize(x['product_description']), axis=1)
test_data['description_tokens_clean'] = test_data.apply(lambda x: clean(x['product_description']), axis=1)

In [10]:
train_data['title_tokens'] = train_data.apply(lambda x: word_tokenize(x['product_title']), axis=1)
train_data['title_tokens_clean'] = train_data.apply(lambda x: clean(x['product_title']), axis=1)

In [11]:
test_data['title_tokens'] = test_data.apply(lambda x: word_tokenize(x['product_title']), axis=1)
test_data['title_tokens_clean'] = test_data.apply(lambda x: clean(x['product_title']), axis=1)

In [12]:
train_data['search_tokens'] = train_data.apply(lambda x: word_tokenize(x['search_term']), axis=1)
train_data['search_tokens_clean'] = train_data.apply(lambda x: clean(x['search_term']), axis=1)

In [13]:
test_data['search_tokens'] = test_data.apply(lambda x: word_tokenize(x['search_term']), axis=1)
test_data['search_tokens_clean'] = test_data.apply(lambda x: clean(x['search_term']), axis=1)

In [14]:
train_data['n_tokens_desc'] = train_data.apply(lambda x: len(x['description_tokens']), axis=1)
train_data['n_tokens_title'] = train_data.apply(lambda x: len(x['title_tokens']), axis=1)
train_data['n_tokens_search'] = train_data.apply(lambda x: len(x['search_tokens']), axis=1)

In [15]:
test_data['n_tokens_desc'] = test_data.apply(lambda x: len(x['description_tokens']), axis=1)
test_data['n_tokens_title'] = test_data.apply(lambda x: len(x['title_tokens']), axis=1)
test_data['n_tokens_search'] = test_data.apply(lambda x: len(x['search_tokens']), axis=1)

In [16]:
train_data.head(3)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,search_tokens,search_tokens_clean,n_tokens_desc,n_tokens_title,n_tokens_search
0,2,100001,simpson strong-tie 12-gauge angle,angle bracket,3.0,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]","[angle, bracket]","[angl, bracket]",147,4,2
1,3,100001,simpson strong-tie 12-gauge angle,l bracket,2.5,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]","[l, bracket]","[l, bracket]",147,4,2
2,9,100002,behr premium textured deckover 1-gal. #sc-141 ...,deck over,3.0,behr premium textured deckover is an innovativ...,"[behr, premium, textured, deckover, is, an, in...","[behr, premium, textur, deckov, innov, solid, ...","[behr, premium, textured, deckover, 1-gal, ., ...","[behr, premium, textur, deckov, 1, gal, sc, 14...","[deck, over]",[deck],196,13,2


### Bag-of-Word Features

1. feature_1 = is search term a substring in title?
2. feature_2 = is search term a substring in description?
3. feature_3 = proportion of terms of search in title - no stemm, no stopword removal
4. feature_4 = proportion of terms of search in title - stemm and stopword removal
5. feature_5 = proportion of terms of search in description - no stemm, no stopword removal
6. feature_6 = proportion of terms of search in description - stemm and stopword removal
7. feature_7 = length of search
8. feature_8 = length of description
9. feature_9 = length of title
7. features_10 to 43 = if word i in search is in description
7. features_44 to 77 = if word i in search is in title

#### Ideias

1. 1 feature para cada palavra de busca indicando se ela está no titulo ou descrição
2. Usando word2vec, distance entre palavras da busca e do titulo/descrição
3. Usando topic modeling: Probabilidade de busca conter o mesmo topico que titulo/descrição

In [17]:
train_data['feature_1'] = train_data.apply(lambda x: int(x['search_term'] in x['product_title']), axis=1)
train_data['feature_2'] = train_data.apply(lambda x: int(x['search_term'] in x['product_description']), axis=1)
train_data['feature_3'] = train_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['title_tokens']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_4'] = train_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['title_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_5'] = train_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['description_tokens']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_6'] = train_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['description_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_7'] = train_data.apply(lambda x: float(len(x['search_tokens'])), axis=1)
train_data['feature_8'] = train_data.apply(lambda x: float(len(x['description_tokens'])), axis=1)
train_data['feature_9'] = train_data.apply(lambda x: float(len(x['title_tokens'])), axis=1)

In [18]:
test_data['feature_1'] = test_data.apply(lambda x: int(x['search_term'] in x['product_title']), axis=1)
test_data['feature_2'] = test_data.apply(lambda x: int(x['search_term'] in x['product_description']), axis=1)
test_data['feature_3'] = test_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['title_tokens']))))/len( set(x['search_tokens'])) , axis=1)
test_data['feature_4'] = test_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['title_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
test_data['feature_5'] = test_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['description_tokens']))))/len( set(x['search_tokens'])) , axis=1)
test_data['feature_6'] = test_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['description_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
test_data['feature_7'] = test_data.apply(lambda x: float(len(x['search_tokens'])), axis=1)
test_data['feature_8'] = test_data.apply(lambda x: float(len(x['description_tokens'])), axis=1)
test_data['feature_9'] = test_data.apply(lambda x: float(len(x['title_tokens'])), axis=1)

In [19]:
train_data.head(3)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,...,n_tokens_search,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,2,100001,simpson strong-tie 12-gauge angle,angle bracket,3.0,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,2,0,0,0.5,0.5,0.0,0.5,2,147,4
1,3,100001,simpson strong-tie 12-gauge angle,l bracket,2.5,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,2,0,0,0.0,0.0,0.0,0.0,2,147,4
2,9,100002,behr premium textured deckover 1-gal. #sc-141 ...,deck over,3.0,behr premium textured deckover is an innovativ...,"[behr, premium, textured, deckover, is, an, in...","[behr, premium, textur, deckov, innov, solid, ...","[behr, premium, textured, deckover, 1-gal, ., ...","[behr, premium, textur, deckov, 1, gal, sc, 14...",...,2,0,0,0.0,0.0,0.5,0.5,2,196,13


In [21]:
max([len(x) for x in test_data['search_tokens']])

14

In [22]:
def word_search(word_list, index, tokens):
    if len(word_list) < index + 1:
        return 0
    text = ''.join(tokens)
    return int(word_list[index] in text)

In [None]:
for index in range(0,14):
    train_data['feature_{}'.format(index+10)] = train_data.apply(lambda x: word_search(x['search_tokens'], index, x['title_tokens']), axis=1)
    test_data['feature_{}'.format(index+10)] = test_data.apply(lambda x: word_search(x['search_tokens'], index, x['title_tokens']), axis=1)

In [25]:
for index in range(0,14):
    train_data['feature_{}'.format(index+24)] = train_data.apply(lambda x: word_search(x['search_tokens'], index, x['description_tokens']), axis=1)
    test_data['feature_{}'.format(index+24)] = test_data.apply(lambda x: word_search(x['search_tokens'], index, x['description_tokens']), axis=1)

In [26]:
train_data.columns

Index([                      u'id',              u'product_uid',
                  u'product_title',              u'search_term',
                      u'relevance',      u'product_description',
             u'description_tokens', u'description_tokens_clean',
                   u'title_tokens',       u'title_tokens_clean',
                  u'search_tokens',      u'search_tokens_clean',
                  u'n_tokens_desc',           u'n_tokens_title',
                u'n_tokens_search',                u'feature_1',
                      u'feature_2',                u'feature_3',
                      u'feature_4',                u'feature_5',
                      u'feature_6',                u'feature_7',
                      u'feature_8',                u'feature_9',
                     u'feature_10',               u'feature_11',
                     u'feature_12',               u'feature_13',
                     u'feature_14',               u'feature_15',
                     u'fe

In [27]:
bow_features_train = train_data[['id', 'relevance'] + ['feature_{0}'.format(i) for i in range(1,38)]]
bow_features_test = test_data[['id'] + ['feature_{0}'.format(i) for i in range(1,38)]]

In [28]:
bow_features_train.to_csv('../data/bow_features.csv', index=False)
bow_features_test.to_csv('../data/bow_features_test.csv', index=False)

### Word2Vec Features

1. feature_1 = is search term a substring in title?
2. feature_2 = is search term a substring in description?
3. feature_3 = proportion of terms of search in title - no stemm, no stopword removal
4. feature_4 = proportion of terms of search in title - stemm and stopword removal
5. feature_5 = proportion of terms of search in description - no stemm, no stopword removal
6. feature_6 = proportion of terms of search in description - stemm and stopword removal
7. feature_7 = length of search
8. feature_8 = length of description
9. feature_9 = length of title
7. features_10 to 23 = lowest similarity of word i in search and the words in the description
7. features_24 to 37 = lowest similarity of word i in search and the words in the title

#### Ideias

1. 1 feature para cada palavra de busca indicando se ela está no titulo ou descrição
2. Usando word2vec, distance entre palavras da busca e do titulo/descrição
3. Usando topic modeling: Probabilidade de busca conter o mesmo topico que titulo/descrição

In [29]:
descriptions = list(train_data.product_description)
title = list(train_data.product_title)


In [30]:
sentences = descriptions + title

In [31]:
sentences = [[word for word in sentence.lower().split()] for sentence in sentences]

In [32]:
from collections import defaultdict

frequency = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        frequency[token] += 1
        
sentences = [[token for token in sentence if frequency[token] > 1] for sentence in sentences]


In [33]:
model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

In [None]:
#query_terms = set(itertools.chain(*train_data.search_tokens_clean))
#title_terms = set(itertools.chain(*train_data.title_tokens_clean))
#description_terms = set(itertools.chain(*train_data.description_tokens_clean))

In [34]:
def lowest_similarity(search_token_list, attribute_token_list, index, model):
    
    if len(search_token_list) < index + 1:
        return 0.
    else:
        search_token = search_token_list[index]
    
    similarities = []
    tuples = [[search_token, attribute_token]  for attribute_token in attribute_token_list]
    
    for x,y in tuples:
        try:
            similarities.append(model.similarity(x,y))
        except:
            continue
            
    if not similarities:
        return 0.
    
    return min(similarities)


In [35]:
for index in range(0,17):
    train_data['feature_{}'.format(index+10)] = train_data.apply(lambda x: lowest_similarity(x['search_tokens'], x['title_tokens'], index, model), axis=1)
    test_data['feature_{}'.format(index+10)] = test_data.apply(lambda x: lowest_similarity(x['search_tokens'], x['title_tokens'], index, model), axis=1)

In [36]:
for index in range(0,17):
    train_data['feature_{}'.format(index+27)] = train_data.apply(lambda x: lowest_similarity(x['search_tokens'], x['description_tokens'], index, model), axis=1)
    test_data['feature_{}'.format(index+27)] = test_data.apply(lambda x: lowest_similarity(x['search_tokens'], x['description_tokens'], index, model), axis=1)

In [37]:
w2v_features_train = train_data[['id', 'relevance'] + ['feature_{0}'.format(i) for i in range(1,38)]]
w2v_features_test = test_data[['id'] + ['feature_{0}'.format(i) for i in range(1,38)]]

In [38]:
w2v_features_train.to_csv('../data/w2v_features.csv', index=False)
w2v_features_test.to_csv('../data/w2v_features_test.csv', index=False)

In [39]:
!ls ../data

attributes.csv		  features.csv		       test.csv
bag_of_word_features.csv  product_descriptions.csv     train.csv
bow_features.csv	  relevance_instructions.docx  w2v_features.csv
bow_features_test.csv	  sample_submission.csv        w2v_features_test.csv
