# High accuracy recall

In [1]:
import pandas as pd
import re
import random
import math

FILES_ROOT_PATH = '~/OneDrive - eBay Inc/High accuracy recall/'
MIN_WORD_COUNT = 5
USE_PHRASES = False
MIN_SIMILARITY_FOR_SYNONYMS = 0.7

## Synonyms
Could be used later on, not really used now

In [2]:
#pd_all_synonyms = pd.read_csv('~/OneDrive - eBay Inc/experiments/synonyms_from_searchpaths_split_keywords.csv')
pd_all_synonyms = pd.read_csv(FILES_ROOT_PATH + '/synonyms_english.csv')
#pd_all_synonyms = pd.read_csv('~/OneDrive - eBay Inc/experiments/casint 3005 Synonyms/synonyms_odw_checked.csv')
#pd_all_synonyms = pd.read_csv('~/OneDrive - eBay Inc/experiments/synonyms_twh/synonyms_ga_levenstein_nongrouped.csv')

#output_path = '~/OneDrive - eBay Inc/experiments/synonyms_twh/synonyms_ga_levenstein_grouped.csv'
output_path = FILES_ROOT_PATH + '/synonyms_english_90pct.csv'

MIN_SIMILARITY = 0.9
# ((pd_all_synonyms.loc[(pd_all_synonyms['CANDIDATE_STATUS']=='APPROVED')])
# .to_csv('~/OneDrive - eBay Inc/experiments/casint 3005 Synonyms/synonym_export_cleaned.csv'
#         , encoding='UTF8'))


In [3]:
pd_all_synonyms = pd_all_synonyms.loc[(pd_all_synonyms['CANDIDATE_STATUS']=='APPROVED')]
pd_all_synonyms = pd_all_synonyms.loc[(pd_all_synonyms['searches']>=10)]
pd_all_synonyms = pd_all_synonyms.loc[pd_all_synonyms['prev_keyword'].apply(lambda x: re.match('.*[0-9][a-z]?$', x) is None)]
pd_all_synonyms['keyword'] = pd_all_synonyms['prev_keyword']
pd_all_synonyms.head(5)

Unnamed: 0,CANDIDATE_STATUS,prev_keyword,synonyms,searches,similarities,keyword
1,APPROVED,sec,sel slc,33,"{sel=0.980449022031111, slc=0.9894410287758298}",sec
2,APPROVED,nail,nailer,228,{nailer=0.9501043566039408},nail
3,APPROVED,mororcycle,motorcycle,33,{motorcycle=0.9818345560015685},mororcycle
4,APPROVED,cortina,corona,56,{corona=0.9851060539554997},cortina
6,APPROVED,tun,tub,77,{tub=0.813100311719324},tun


In [4]:
pd.set_option('display.max_colwidth', -1)
pd_all_synonyms.loc[pd_all_synonyms.keyword=='ebike'].similarities

12622    {bikes=0.8487290907659067, e-bike=0.9849325735139168, bike=0.9105030980084756}
Name: similarities, dtype: object

In [5]:
import json
from json import JSONDecodeError

def extract_from_json(similarities, min_similarity):
    cleaned = similarities.replace('"', '_').replace('\\', '_')
    cleaned = re.sub(r'=([^0-9])', r'-\1', cleaned) 
    similarities = re.sub(r'([{]|, )([^=]*)=([0-9])', r'\1"\2":\3', cleaned) 
    try:
        d_synonyms = json.loads(similarities)
    except JSONDecodeError as e:
        print(cleaned, similarities)
        raise e
    return [key for key in d_synonyms if d_synonyms[key]>= min_similarity and re.match('.*[0-9][a-z]?$', key) is None]

extract_from_json(pd_all_synonyms.iloc[0]['similarities'], MIN_SIMILARITY)


['sel', 'slc']

In [6]:
pd_all_synonyms['filtered_synonyms'] = pd_all_synonyms['similarities'].apply(lambda x: extract_from_json(x, MIN_SIMILARITY))

In [7]:
pd_all_synonyms.loc[(pd_all_synonyms['prev_keyword']=='iphonex')]

Unnamed: 0,CANDIDATE_STATUS,prev_keyword,synonyms,searches,similarities,keyword,filtered_synonyms
3171,APPROVED,iphonex,iphone,69,{iphone=0.9979961841033886},iphonex,[iphone]


In [8]:
def collapse_synonyms(pd_all_synonyms, word_column='keyword', synonyms_column='filtered_synonyms'):
    word_to_cluster = {}
    cluster_to_synonyms = {}
    for i,row in pd_all_synonyms.iterrows():
        found_cluster = None
        word = row[word_column] # keyword
        #synonyms = row.synonyms.split(' ')
        synonyms = row[synonyms_column] # .filtered_synonyms
        #if 'fietsendrag' in row.keyword:
        #print(word, synonyms)
        if isinstance(synonyms,str):
            synonyms = [synonyms]
        
        for synonym in synonyms:
            found_cluster = word_to_cluster.get(synonym)
            if found_cluster:
                break
        if not found_cluster:
            found_cluster = word_to_cluster.get(word)

        if not found_cluster:
            # Totally new
            word_to_cluster.update({synonym:word for synonym in synonyms})
            word_to_cluster.update({word:word})
            cluster_to_synonyms.update({word:set([word] + synonyms)})
        else:
            # Existing cluster
            # print('Merging', word, found_cluster)
            word_to_cluster.update({synonym:found_cluster for synonym in synonyms})
            word_to_cluster.update({word:found_cluster})
            cluster_to_synonyms.update({found_cluster:set([word] + synonyms 
                                                 + list(cluster_to_synonyms[found_cluster]))})
    return word_to_cluster, cluster_to_synonyms

word_to_cluster, cluster_to_synonyms = collapse_synonyms(pd_all_synonyms)
pd_collapsed = pd.DataFrame([(word, list(cluster_to_synonyms[word])) for word in cluster_to_synonyms.keys()]
                           , columns=['keyword', 'synonyms'])        

In [9]:
import csv
pd_collapsed['flat_synonyms'] = pd_collapsed.apply(lambda r: ','.join(r.synonyms),axis=1)
pd_collapsed.to_csv(output_path
                    , index_label=False
                    , index=False
                    , header=False
                    , quoting = csv.QUOTE_NONE
                    , sep='|'
                    , quotechar="",  escapechar="\\"
                    , columns=['flat_synonyms']
                    , encoding='UTF8')

## NLTK
Only using stopwords for now :)

In [10]:
# First, you're going to need to import wordnet: 
import nltk
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('alpino')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cpieterse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package alpino to
[nltk_data]     C:\Users\cpieterse\AppData\Roaming\nltk_data...
[nltk_data]   Package alpino is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cpieterse\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cpieterse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cpieterse\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Actual work
### Start reading the data

In [11]:
# Define input data
from gensim.utils import simple_preprocess
X, y = [], []
documents = pd.read_csv(FILES_ROOT_PATH + 'documents.tsv', sep='\t')
#with open(FILES_ROOT_PATH + 'documents.tsv', "r") as infile:
#    for line in infile:
#        doc_id,price,title,category_breadcrumb,itemid_image_url = line.split("\t")
#        label = '0'
#        X.append(simple_preprocess(title, deacc=True))
#        y.append(label)
#X, y = np.array(X), np.array(y)



Additional processing

In [12]:
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
import unidecode

STOPWORDS = set(stopwords.words("english"))
# STEMMER = PorterStemmer()
STEMMER = SnowballStemmer("english")
TOKENIZER = RegexpTokenizer('[a-zA-Z0-9]+[a-zA-Z0-9-/]?[a-zA-Z0-9]+|[#$][\d\.]+|[a-z.]+')


def remove_periods_in_acronyms(x):
    """Remove the . in acronyms like d.c. or u.s.a -> dc / usa """
    if re.match('^([a-z][.])+[a-z]?$', x):    
        return x.replace('.', '')
    return x

def remove_hyphens_in_words(x):
    """Remove the hyphen in the middel of a word, like ka-zar -> kazar, x-men -> xmen"""
    if re.match('^[a-z]+[-][a-z]+$', x):    
        return x.replace('-', '')
    return x

print(remove_periods_in_acronyms('a.b'))
    
def tokenize_and_strip_accents(x, phrase_model=None):
    """
    Tokenize the sentence and remove stopwords. 
    Optionally: apply phrase detection with the supplied model
    """
    try:
        norm_string = ' ' + unidecode.unidecode(x).lower() + ' '
    except AttributeError as e:
        norm_string = ' ' + str(x) + ' '
        
    
    #tokenizer = TreebankWordTokenizer()
    words = TOKENIZER.tokenize(norm_string)
    filtered_words = [STEMMER.stem(word) for word in words if word not in STOPWORDS]
    filtered_words = [remove_periods_in_acronyms(word) for word in filtered_words]
    filtered_words = [remove_hyphens_in_words(word) for word in filtered_words]
    if phrase_model:
        return phrase_model[filtered_words]
    return filtered_words

tknzr = TweetTokenizer()
tst = "the citroën ka-zar comic #3423423 Ka-Zar #1 (Jan 1974, Marvel) Comic First Issue Return To The Savage Land dc d.c."
#tknzr.tokenize(str_strip_accents(tst))
tokenize_and_strip_accents(tst)
#[str_strip_accents(accented_string) for accented_string in tknzr.tokenize(tst)]

ab


['citroen',
 'kazar',
 'comic',
 '#3423423',
 'kazar',
 '#1',
 'jan',
 '1974',
 'marvel',
 'comic',
 'first',
 'issu',
 'return',
 'savag',
 'land',
 'dc',
 'dc']

In [13]:
def trim_path_to_L(path, n):
    split_path = path.split(' > ')
    return ' > '.join(split_path[:n])
    
def dynamic_split_path(path):
    split_path = path.split(' > ')
    n = math.floor(len(split_path)/2)+1
    return ' > '.join(split_path[:n])

def L_minus1_split_path(path):
    split_path = path.split(' > ')
    n = len(split_path)-1
    return ' > '.join(split_path[:n])

print(dynamic_split_path("a > b > c > d"), L_minus1_split_path("a > b > c"), " > ".join("a > b".split(' > ')[:1]))

a > b > c a > b a


In [14]:

documents['doc'] = documents['title'].apply(lambda x:tokenize_and_strip_accents(x))
# documents['doc_smiple'] = documents['title'].apply(lambda x:simple_preprocess(x))
documents['L1'] = documents['category_breadcrumb'].apply(lambda path: " > ".join(path.split(' > ')[:1]))
documents['L2'] = documents['category_breadcrumb'].apply(lambda path: " > ".join(path.split(' > ')[:2]))
documents['L3'] = documents['category_breadcrumb'].apply(lambda path: " > ".join(path.split(' > ')[:3]))
documents['L4'] = documents['category_breadcrumb'].apply(lambda path: " > ".join(path.split(' > ')[:4]))
documents['L5'] = documents['category_breadcrumb'].apply(lambda path: " > ".join(path.split(' > ')[:5]))
documents['L~'] = documents['category_breadcrumb'].apply(dynamic_split_path)
documents['L-1'] = documents['category_breadcrumb'].apply(L_minus1_split_path)



In [15]:
from gensim.models.phrases import Phrases, Phraser
#phrase_model = Phrases(documents['doc'], min_count=MIN_WORD_COUNT)
phrase_model = Phrases(documents['doc'], min_count=MIN_WORD_COUNT, scoring='npmi', threshold=0.7)
documents['phrases'] = [phrase_model[words] for words in documents['doc']]
# tokenize_and_strip_accents(documents['title'], phrase_model)



In [16]:
X = documents['phrases'] if USE_PHRASES else documents['doc']
y = documents['category_breadcrumb']

In [17]:
documents.head()

Unnamed: 0,doc_id,price,title,category_breadcrumb,itemid_image_url,doc,L1,L2,L3,L4,L5,L~,L-1,phrases
0,1000001,24.95,CASE CUTLERY Single Blade Pocket Knife USMC MARINES ~ Great Gift,"Collectibles > Knives, Swords & Blades > Collectible Folding Knives > Modern Folding Knives > Factory Manufactured",https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/0UQAAOSw-2hb7KOd/$_57.JPG?set_id=880000500F,"[case, cutleri, singl, blade, pocket, knife, usmc, marin, great, gift]",Collectibles,"Collectibles > Knives, Swords & Blades","Collectibles > Knives, Swords & Blades > Collectible Folding Knives","Collectibles > Knives, Swords & Blades > Collectible Folding Knives > Modern Folding Knives","Collectibles > Knives, Swords & Blades > Collectible Folding Knives > Modern Folding Knives > Factory Manufactured","Collectibles > Knives, Swords & Blades > Collectible Folding Knives","Collectibles > Knives, Swords & Blades > Collectible Folding Knives > Modern Folding Knives","[case, cutleri, singl, blade, pocket, knife, usmc, marin, great, gift]"
1,1000002,62.95,NYC SUBWAY SIGN NY TRANSIT ART 22X6 238 STREET WHITE PLAINS ROAD BRONX ROLL SIGN,Collectibles > Transportation > Subways,https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/is8AAOSwTM5Y1m3U/$_1.JPG?set_id=8800005007,"[nyc, subway, sign, ny, transit, art, 22x6, 238, street, white, plain, road, bronx, roll, sign]",Collectibles,Collectibles > Transportation,Collectibles > Transportation > Subways,Collectibles > Transportation > Subways,Collectibles > Transportation > Subways,Collectibles > Transportation,Collectibles > Transportation,"[nyc_subway, sign, ny, transit, art, 22x6, 238, street, white, plain, road, bronx, roll, sign]"
2,1000003,39.99,Disney Store Japan Pin 40695 JDS Face Expressions Stitch Lilo & Stitch,"Collectibles > Disneyana > Contemporary (1968-Now) > Pins, Patches & Buttons > Disney Characters & Movies > Lilo & Stitch",https://i.ebayimg.com/00/s/MTYwMFgxNTIx/z/Au8AAOSwD~Fb5NqB/$_57.JPG?set_id=8800005007,"[disney, store, japan, pin, 40695, jds, face, express, stitch, lilo, stitch]",Collectibles,Collectibles > Disneyana,Collectibles > Disneyana > Contemporary (1968-Now),"Collectibles > Disneyana > Contemporary (1968-Now) > Pins, Patches & Buttons","Collectibles > Disneyana > Contemporary (1968-Now) > Pins, Patches & Buttons > Disney Characters & Movies","Collectibles > Disneyana > Contemporary (1968-Now) > Pins, Patches & Buttons","Collectibles > Disneyana > Contemporary (1968-Now) > Pins, Patches & Buttons > Disney Characters & Movies","[disney, store, japan, pin, 40695_jds, face, express, stitch, lilo_stitch]"
3,1000004,34.39,HOUSTON TEXAS *ICON CITY SERIES 2015* NASA SPACE SHUTTLE OIL Hard Rock Cafe PIN,"Collectibles > Pinbacks, Bobbles, Lunchboxes > Pinbacks > Restaurant > Hard Rock",https://i.ebayimg.com/00/s/MTIwMFgxMDQ2/z/groAAOSwF6dZcCq1/$_57.JPG?set_id=8800005007,"[houston, texa, icon, citi, seri, 2015, nasa, space, shuttl, oil, hard, rock, cafe, pin]",Collectibles,"Collectibles > Pinbacks, Bobbles, Lunchboxes","Collectibles > Pinbacks, Bobbles, Lunchboxes > Pinbacks","Collectibles > Pinbacks, Bobbles, Lunchboxes > Pinbacks > Restaurant","Collectibles > Pinbacks, Bobbles, Lunchboxes > Pinbacks > Restaurant > Hard Rock","Collectibles > Pinbacks, Bobbles, Lunchboxes > Pinbacks","Collectibles > Pinbacks, Bobbles, Lunchboxes > Pinbacks > Restaurant","[houston, texa, icon, citi, seri, 2015, nasa, space_shuttl, oil, hard, rock, cafe, pin]"
4,1000005,125.0,*Bottle #2 Prototype/Test Not For Sale Thread H5 Aluminum Coca Cola Bottle Coke,Collectibles > Advertising > Soda > Coca-Cola > Bottles,https://i.ebayimg.com/00/s/MTYwMFgxMzUz/z/6w4AAOSw5npckIo2/$_57.JPG?set_id=8800005007,"[bottl, #2, prototype/test, sale, thread, h5, aluminum, coca, cola, bottl, coke]",Collectibles,Collectibles > Advertising,Collectibles > Advertising > Soda,Collectibles > Advertising > Soda > Coca-Cola,Collectibles > Advertising > Soda > Coca-Cola > Bottles,Collectibles > Advertising > Soda,Collectibles > Advertising > Soda > Coca-Cola,"[bottl, #2, prototype/test, sale, thread, h5, aluminum, coca_cola, bottl, coke]"


In [18]:
tokenize_and_strip_accents(['carol', 'carolers', 'caroller'])

['carol', 'carol', 'carol']

## Start training word2vec model from the titles

In [20]:
import gensim
# let X be a list of tokenized texts (i.e. list of lists of tokens)
model = gensim.models.Word2Vec(X, size=150, min_count=MIN_WORD_COUNT, iter=100)
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [21]:
(model.wv.index2word[0], w2v[model.wv.index2word[0]])

('vintag',
 array([ 0.10349678,  1.2065481 , -0.8270068 ,  0.83537453, -0.17647335,
         0.76849425,  2.1311657 , -0.9952835 , -0.34540427, -1.4363326 ,
         1.1884595 , -0.6201106 ,  1.5649587 , -0.3137793 ,  1.7367187 ,
        -1.1559738 , -0.8888798 , -0.8563675 ,  1.9936068 ,  0.04779917,
         0.7843872 ,  0.5845461 , -3.281599  , -0.17246227,  1.9859278 ,
         0.5101131 , -0.54196095,  0.45935538,  1.8948755 ,  1.1377028 ,
         0.09224258, -0.90300417,  3.0093513 ,  0.433184  , -2.168394  ,
         1.4121349 ,  3.335692  ,  0.25909176, -0.869508  , -1.0435985 ,
        -1.40455   ,  0.5381693 ,  0.5263063 , -0.28969482,  0.83507365,
        -0.92838687,  1.0109774 ,  2.3661687 ,  0.5972409 , -1.870957  ,
         0.07826   ,  0.7922129 ,  1.2302979 , -0.3247182 ,  0.24390344,
        -1.9556469 , -2.2812812 , -0.7489275 , -0.33463502, -2.2861247 ,
        -0.22314475,  1.1886561 ,  0.41866302,  1.1520487 , -0.24018112,
         0.4739796 ,  1.0864992 , -1.265

### TfIdf to get IDF information 

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
vectorizer = TfidfVectorizer(min_df = MIN_WORD_COUNT
                             , stop_words = stopwords.words('english')
                             , analyzer=lambda x: x)
vectorizer = vectorizer.fit(X)

In [23]:
idf_weights = {w:vectorizer.idf_[i] for w, i in vectorizer.vocabulary_.items()}

### DomCat model using word2vec and the TfIdf model

Reusing http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [24]:
import math
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

class W2VTreeClassifier:
    """Class to combine a W2V and a TfIdf vector and learn a tree directly on the W2V vectors and predict the cagegory"""
    
    # Initialize empty set of class labels
    _class_labels = []
    
    def __init__(self, word_vectors, idf_weights, useLogWeights=True, **kwargs):
        self._word_vectors = word_vectors
        self._idf_weights = idf_weights
        self._useLogWeights = useLogWeights
        self._words = len(self._idf_weights)
        self._log_idf_weights = {
            w:self._idfToLogIdf(idf) for w,idf in self._idf_weights.items()
        }
        self._minIdf = min([w for t,w in self._idf_weights.items()])
        self._minLogIdf = min([w for t,w in self._log_idf_weights.items()])
        self._indexToWord = [w for w,idf in self._idf_weights.items()]
        self._wordToIndex = {w:i for i,w in enumerate(self._indexToWord)}
        self._classifier = ExtraTreesClassifier(min_samples_split=10, **kwargs) #,min_samples_leaf=3,n_estimators=100,bootstrap=True)
        
    def _idfToLogIdf(self, idf):
        """Convert regular IDF value to log representation for better results"""
        df = self._words / idf
        return math.log((self._words + 0.5) / (df + 0.5))
    
    def _get_idf(self, word):
        """Get the (log) IDF of a keyword, or the MIN (log) IDF if it's not found"""
        if word in self._wordToIndex:
            return self._log_idf_weights[word]
        if self._useLogWeights:
            return self._minLogIdf
        return self._minIdf
    
    def _docToVector(self, doc):
        """Convert a document (list of words) to a vector using word2vec"""
        valid_words = [w for w in doc if w in self._word_vectors]
        if len(valid_words) == 0:
            return np.zeros(len(self._word_vectors[self._indexToWord[0]]))
        vec = np.mean([self._word_vectors[w] * self._get_idf(w) for w in valid_words]
                      , axis=0)
        if vec.size <= 1:
            vec = np.zeros(len(self._word_vectors[self._indexToWord[0]]))
        return vec
    
    def _docs_to_vectors(self, X, ignore_empty, y=None):
        """Transform the documents to vectors, optionally ignore empty vectors"""
        vectors = np.array([self._docToVector(doc) for doc in X])
        if not ignore_empty:
            return vectors
        non_zeros = [v.sum() > 0 for v in vectors]
        print("Passed documents: ", np.array([1 for b in non_zeros if b]).sum()
             , " out of: ", vectors.shape[0])
        if y is None:
            return vectors[non_zeros]
        return vectors[non_zeros], np.array(y)[non_zeros]

    def fit(self, X, y, ignore_empty=True):
        """Train the ensemble tree on the train examples in X and class labels in y"""
        train, y = self._docs_to_vectors(X, ignore_empty, y)
        self._classifier = self._classifier.fit(train, y)
        self._class_labels = self._classifier.classes_
        return self

    def predict(self, X):
        """
        Predict the class with the highest probability for every example in X
        """
        test = self._docs_to_vectors(X, False)
        return self._classifier.predict(test)

    def predict_proba(self, X, n=5):
        """
        List the top n ranking class labels per example in X
        Returns: List of sets, with for every item a set with the class label and probability
        """
        if len(self._class_labels) < n:
            n = len(self._class_labels)
        test = self._docs_to_vectors(X, False)
        probabilities = self._classifier.predict_proba(test)
        indexes = np.argsort(-probabilities, axis=1)[:,:n]
        # predictedClasses = self._classes[indexes]
        return [[dict(class_label=self._class_labels[indexes[i,j]]
                      , p=probabilities[i,indexes[i,j]]
                      , rank=j+1)
                 for j in range(n)]
                for i in range(indexes.shape[0])]

    def score(self, X, y, ignore_empty=True):
        """
        Test the model on the documents in X and labels y, 
        Ignore documents that can not be coverted
        """
        test, y = self._docs_to_vectors(X, ignore_empty, y)
        return self._classifier.score(test, y)


#### Test to get the top N classes

In [25]:
# classes = w2v_classifier._classifier.classes_
# probabilities = w2v_classifier._classifier.predict_proba([w2v['skywalker'], w2v['ka-zar']])
# indexes = np.argsort(-probabilities, axis=1)[:,:5]

# indexes.shape, classes[indexes]
#np.max(w2v_classifier.predict_log_proba([['vintage']]))
#[[dict(class_label=classes[indexes[i,j]]
#       , p=probabilities[i,indexes[i,j]]
#       , rank=j+1
#      ) for j in range(5)]
#for i in range(indexes.shape[0]) ]

### Fit the model

In [26]:
train_idx = random.sample([i for i in range(len(X))], 400000)


In [28]:
# Test without phrases
import gc
w2v_classifier = None # Free up memory
gc.collect()
#score = (W2VTreeClassifier(w2v, idf_weights,min_samples_leaf=3,n_estimators=15, bootstrap=True)
# .fit(documents['doc'][train_idx],y[train_idx])
# .score(documents['doc'][10000:20000], y[10000:20000])
#)
gc.collect()
# score

0

In [29]:
w2v_classifier = None # Free up memory
gc.collect()
w2v_classifier = (W2VTreeClassifier(w2v, idf_weights,min_samples_leaf=3,n_estimators=15, bootstrap=True)
                  .fit(X[train_idx],y[train_idx]))
w2v_classifier.score(X[10000:20000], y[10000:20000])
# Last run without filterin zeros: 0.6381

Passed documents:  212043  out of:  400000
Passed documents:  5274  out of:  10000


0.6321577550246492

In [30]:
w2v_classifier.predict([tokenize_and_strip_accents(x, phrase_model if USE_PHRASES else None) 
                        for x in ['sword steel', 'mrs queen mary', 'kazar comic'
                                  , 'ka-zar', 'the coca cola bottles'
                                  , 'harley davidson jacket']])

array([' Collectibles > Tobacciana > Lighters > Other Collectible Lighters',
       ' Collectibles > Photographic Images > Contemporary (1940-Now) > Other Contemporary Photographs',
       ' Collectibles > Transportation > Railroadiana & Trains > Paper > Books > 1900-Now',
       ' Collectibles > Animation Art & Characters > Japanese, Anime > Other Anime Collectibles',
       ' Collectibles > Advertising > Soda > Coca-Cola > Bottles',
       ' Collectibles > Transportation > Motorcycles > American > Harley-Davidson > Other Harley Collectibles'],
      dtype=object)

### Test a bit with the queries

In [31]:
queries = pd.read_csv(FILES_ROOT_PATH + 'queries.tsv', sep='\t')
queries['terms'] = queries['query'].apply(lambda x: tokenize_and_strip_accents(x, phrase_model if USE_PHRASES else None))
# queries['query'].apply(lambda x:x.split(' '))
queries['DomCat'] = w2v_classifier.predict(queries['terms'])
queries['DomCat_5'] = [json.dumps(labels) for labels in w2v_classifier.predict_proba(queries['terms'],5)]
queries.sample(50)

Unnamed: 0,query_id,query,terms,DomCat,DomCat_5
4,5,deer sculpture,"[deer, sculptur]",Collectibles > Animals > Farm & Countryside > Deer,"[{""class_label"": "" Collectibles > Animals > Farm & Countryside > Deer"", ""p"": 0.11835497835497835, ""rank"": 1}, {""class_label"": "" Collectibles > Postcards > US States, Cities & Towns > Illinois"", ""p"": 0.06153846153846154, ""rank"": 2}, {""class_label"": "" Collectibles > Decorative Collectibles > Decorative Collectible Brands > Precious Moments > Figurines > Christmas"", ""p"": 0.060000000000000005, ""rank"": 3}, {""class_label"": "" Collectibles > Animation Art & Characters > Japanese, Anime > Other Anime Collectibles"", ""p"": 0.04285714285714285, ""rank"": 4}, {""class_label"": "" Collectibles > Decorative Collectibles > Figurines > Animals"", ""p"": 0.04212121212121212, ""rank"": 5}]"
134,135,old fitzgerald,"[old, fitzgerald]",Collectibles > Historical Memorabilia > Political > US > Presidents & First Ladies > 1961-63 John F. Kennedy,"[{""class_label"": "" Collectibles > Historical Memorabilia > Political > US > Presidents & First Ladies > 1961-63 John F. Kennedy"", ""p"": 0.06190476190476191, ""rank"": 1}, {""class_label"": "" Collectibles > Postcards > US States, Cities & Towns > Virginia"", ""p"": 0.05333333333333334, ""rank"": 2}, {""class_label"": "" Collectibles > Advertising > Gas & Oil > Gas & Oil Companies > Hess > 2000-Now"", ""p"": 0.04444444444444444, ""rank"": 3}, {""class_label"": "" Collectibles > Breweriana, Beer > Playing Cards"", ""p"": 0.04444444444444444, ""rank"": 4}, {""class_label"": "" Collectibles > Historical Memorabilia > Fraternal Organizations > Masonic, Freemasonry > Aprons & Regalia"", ""p"": 0.0375, ""rank"": 5}]"
11,12,mexico carved wood,"[mexico, carv, wood]",Collectibles > Cultures & Ethnicities > Latin American > Mexico > Folk Art,"[{""class_label"": "" Collectibles > Cultures & Ethnicities > Latin American > Mexico > Folk Art"", ""p"": 0.13412698412698412, ""rank"": 1}, {""class_label"": "" Collectibles > Cultures & Ethnicities > Asian > 1900-Now > Chinese > Figures & Statues"", ""p"": 0.03703703703703704, ""rank"": 2}, {""class_label"": "" Collectibles > Tobacciana > Match Holders"", ""p"": 0.03703703703703704, ""rank"": 3}, {""class_label"": "" Collectibles > Animals > Other Animal Collectibles"", ""p"": 0.03333333333333333, ""rank"": 4}, {""class_label"": "" Collectibles > Souvenirs & Travel Memorabilia > United States > New Jersey"", ""p"": 0.02857142857142857, ""rank"": 5}]"
5,6,usmc,[usmc],Collectibles > Militaria > Militaria (Date Unknown) > Marine Corps,"[{""class_label"": "" Collectibles > Militaria > Militaria (Date Unknown) > Marine Corps"", ""p"": 0.10666666666666667, ""rank"": 1}, {""class_label"": "" Collectibles > Militaria > Current Militaria (2001-Now) > Original Items > Challenge Coins"", ""p"": 0.09910126910126911, ""rank"": 2}, {""class_label"": "" Collectibles > Arcade, Jukeboxes & Pinball > Pinball > Replacement Parts"", ""p"": 0.05454545454545455, ""rank"": 3}, {""class_label"": "" Collectibles > Militaria > Current Militaria (2001-Now) > Original Items > Medals & Ribbons"", ""p"": 0.052121212121212124, ""rank"": 4}, {""class_label"": "" Collectibles > Paper > Menus"", ""p"": 0.05, ""rank"": 5}]"
67,68,penny machine,"[penni, machin]","Collectibles > Banks, Registers & Vending > Vending Machines > Gumball","[{""class_label"": "" Collectibles > Banks, Registers & Vending > Vending Machines > Gumball"", ""p"": 0.2120634920634921, ""rank"": 1}, {""class_label"": "" Collectibles > Casino > Slots > Replacement Parts"", ""p"": 0.06666666666666667, ""rank"": 2}, {""class_label"": "" Collectibles > Fantasy, Mythical & Magic > Magic > Tricks"", ""p"": 0.06666666666666667, ""rank"": 3}, {""class_label"": "" Collectibles > Banks, Registers & Vending > Still, Piggy Banks > Other Collectible Still Banks"", ""p"": 0.05333333333333334, ""rank"": 4}, {""class_label"": "" Collectibles > Advertising > Merchandise & Memorabilia > Premiums, Prizes & Charms"", ""p"": 0.0453968253968254, ""rank"": 5}]"
91,92,angel ornaments,"[angel, ornament]",Collectibles > Holiday & Seasonal > Christmas: Current (1991-Now) > Ornaments > Angels,"[{""class_label"": "" Collectibles > Holiday & Seasonal > Christmas: Current (1991-Now) > Ornaments > Angels"", ""p"": 0.22369454619454618, ""rank"": 1}, {""class_label"": "" Collectibles > Decorative Collectibles > Decorative Collectible Brands > Hallmark > Ornaments: by Series > Mary's Angels"", ""p"": 0.10284391534391535, ""rank"": 2}, {""class_label"": "" Collectibles > Holiday & Seasonal > Christmas: Current (1991-Now) > Ornaments > Other Current Tree Ornaments"", ""p"": 0.08299015799015799, ""rank"": 3}, {""class_label"": "" Collectibles > Decorative Collectibles > Ornaments"", ""p"": 0.06601565101565102, ""rank"": 4}, {""class_label"": "" Collectibles > Non-Sport Trading Cards > Wacky Packages > Trading Card Singles"", ""p"": 0.05714285714285714, ""rank"": 5}]"
84,85,military police patches,"[militari, polic, patch]",Collectibles > Militaria > Current Militaria (2001-Now) > Original Items > Patches > Army,"[{""class_label"": "" Collectibles > Militaria > Current Militaria (2001-Now) > Original Items > Patches > Army"", ""p"": 0.16511360099595393, ""rank"": 1}, {""class_label"": "" Collectibles > Historical Memorabilia > Police > Patches > International"", ""p"": 0.1343005143005143, ""rank"": 2}, {""class_label"": "" Collectibles > Historical Memorabilia > Police > Hats & Caps"", ""p"": 0.10443463943463943, ""rank"": 3}, {""class_label"": "" Collectibles > Militaria > Surplus > Patches"", ""p"": 0.10200077700077699, ""rank"": 4}, {""class_label"": "" Collectibles > Historical Memorabilia > Police > Other Police Collectibles"", ""p"": 0.04444444444444444, ""rank"": 5}]"
88,89,absolut vodka bottle,"[absolut, vodka, bottl]",Collectibles > Advertising > Food & Beverage > Distillery > Absolut,"[{""class_label"": "" Collectibles > Advertising > Food & Beverage > Distillery > Absolut"", ""p"": 0.13491871962460197, ""rank"": 1}, {""class_label"": "" Collectibles > Religion & Spirituality > Christianity > Medals"", ""p"": 0.06666666666666667, ""rank"": 2}, {""class_label"": "" Collectibles > Radio, Phonograph, TV, Phone > Radios > Transistor Radios"", ""p"": 0.06666666666666667, ""rank"": 3}, {""class_label"": "" Collectibles > Transportation > Bicycles > Bicycle Parts"", ""p"": 0.06666666666666667, ""rank"": 4}, {""class_label"": "" Collectibles > Barware > Shot Glasses"", ""p"": 0.03333333333333333, ""rank"": 5}]"
124,125,national cash register,"[nation, cash, regist]","Collectibles > Banks, Registers & Vending > Cash Registers","[{""class_label"": "" Collectibles > Banks, Registers & Vending > Cash Registers"", ""p"": 0.9486964886964887, ""rank"": 1}, {""class_label"": "" Collectibles > Advertising > Computers & High Tech"", ""p"": 0.019047619047619046, ""rank"": 2}, {""class_label"": "" Collectibles > Pens & Writing Instruments > Pens > Fountain Pens > Montblanc"", ""p"": 0.01818181818181818, ""rank"": 3}, {""class_label"": "" Collectibles > Autographs > Historical"", ""p"": 0.014074074074074074, ""rank"": 4}, {""class_label"": "" Collectibles > Advertising > Agriculture"", ""p"": 0.0, ""rank"": 5}]"
92,93,piggy piggy banks,"[piggi, piggi, bank]","Collectibles > Banks, Registers & Vending > Still, Piggy Banks > Animals > Pigs","[{""class_label"": "" Collectibles > Banks, Registers & Vending > Still, Piggy Banks > Animals > Pigs"", ""p"": 0.31990009990009993, ""rank"": 1}, {""class_label"": "" Collectibles > Decorative Collectibles > Figurines > Clowns, Circuses"", ""p"": 0.09444444444444446, ""rank"": 2}, {""class_label"": "" Collectibles > Banks, Registers & Vending > Still, Piggy Banks > Other Collectible Still Banks"", ""p"": 0.07599021893139539, ""rank"": 3}, {""class_label"": "" Collectibles > Banks, Registers & Vending > Still, Piggy Banks > Characters"", ""p"": 0.07371794871794872, ""rank"": 4}, {""class_label"": "" Collectibles > Banks, Registers & Vending > Still, Piggy Banks > Animals > Dogs"", ""p"": 0.05555555555555556, ""rank"": 5}]"


In [32]:
queries.to_csv(FILES_ROOT_PATH + 'queries_with_labels.tsv', sep='\t', encoding='utf-8')

In [33]:
exploded_domcat = pd.DataFrame([(r['query'], labels['class_label'], labels['p'])
                                for i,r in queries.iterrows()
                                for labels in json.loads(r['DomCat_5'])
                               ], columns=['query', 'category_breadcrumb', 'score'])
exploded_domcat.to_csv(FILES_ROOT_PATH + 'queries_with_domcat_5.tsv', sep='\t', encoding='utf-8')
exploded_domcat.head()

Unnamed: 0,query,category_breadcrumb,score
0,kazar comic,Collectibles > Transportation > Railroadiana & Trains > Paper > Books > 1900-Now,0.086667
1,kazar comic,"Collectibles > Animation Art & Characters > Japanese, Anime > Other Anime Collectibles",0.070833
2,kazar comic,Collectibles > Historical Memorabilia > Political > US > Presidents & First Ladies > 1961-63 John F. Kennedy,0.068889
3,kazar comic,Collectibles > Sewing (1930-Now) > Spools & Thread,0.053333
4,kazar comic,"Collectibles > Radio, Phonograph, TV, Phone > Radios > Transistor Radios",0.033333


### Synoyms for the queries

In [40]:
query_terms = set([term for terms in queries.terms for term in terms])
synonyms = pd.DataFrame([(word, match[0], match[1]) 
            #for word in model.wv.index2word 
                         for word in query_terms
            for match in model.wv.most_similar(word) if match[1] > MIN_SIMILARITY_FOR_SYNONYMS]
                        , columns=['word', 'synonym', 'similarity'])
    
synonyms[0:20]

Unnamed: 0,word,synonym,similarity
0,gnome,cairn,0.736252
1,grinder,herb,0.722598
2,zippo,lighter,0.72383
3,cgc,cbcs,0.879096
4,vader,vadar,0.735752
5,vader,maul,0.735582
6,vader,sidious,0.7246
7,hmmwv,m998,0.911901
8,hmmwv,m1151,0.701889
9,precious,prescious,0.784125


In [35]:
#pd.DataFrame([json.dumps(s) for s in synonyms], columns=['synonyms']).to_csv(FILES_ROOT_PATH + 'synonyms_90.csv')
synonyms.to_csv(FILES_ROOT_PATH + 'synonyms_80.csv')

In [36]:
word_to_cluster, cluster_to_synonyms = collapse_synonyms(synonyms, 'word', 'synonym')

In [37]:
syns_for_output = [','.join([e for e in synset if not re.match('^[#a-z]?[0-9]+|[0-9]+[a-z]?[a-z]?$', e)]) 
                   for k,synset in cluster_to_synonyms.items()]
pd.DataFrame([synset for synset in syns_for_output if synset and ',' in synset]
             , columns=['flat_synonyms']).to_csv(FILES_ROOT_PATH + 'flat_synonyms.txt'
                    , index_label=False
                    , index=False
                    , header=False
                    , quoting = csv.QUOTE_NONE
                    , sep='|'
                    , quotechar="",  escapechar="\\"
                    , columns=['flat_synonyms']
                    , encoding='UTF8')

#### Loop it together to create different category level predictions

In [38]:
train_idx = random.sample([i for i in range(len(X))], 800000)
levels = ['category_breadcrumb', 'L1', 'L2', 'L3', 'L4', 'L5', 'L~', 'L-1']
# levels = ['L5', 'L~', 'L-1']
w2v_classifier = None # Free up memory
gc.collect()
w2v_classifier = W2VTreeClassifier(w2v, idf_weights,min_samples_leaf=3,n_estimators=15, bootstrap=True)
# Last run without filterin zeros: 0.6381
for ilevel, level in enumerate(levels):
    Y = documents[level]
    print(level)
    w2v_classifier = w2v_classifier.fit(X[train_idx],Y[train_idx])
    columnName = 'DomCat_10_' + level
    queries[columnName] = [json.dumps(labels) for labels in w2v_classifier.predict_proba(queries['terms'],10)]
    print("Score at level {}: {}".format(level, w2v_classifier.score(X[10000:20000], Y[10000:20000])))
    
    # Write column to separate file
    exploded_domcat = pd.DataFrame([(r['query'], labels['class_label'], labels['p'])
                                for i,r in queries.iterrows()
                                for labels in json.loads(r[columnName])
                               ], columns=['query', level, 'score'])
    exploded_domcat.to_csv(FILES_ROOT_PATH + 'queries_with_' + columnName.replace('~', '_') + '.tsv'
                           , sep='\t', encoding='utf-8')

queries.to_csv(FILES_ROOT_PATH + 'queries_with_labels.tsv', sep='\t', encoding='utf-8')

category_breadcrumb
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level category_breadcrumb: 0.6414486158513463
L1
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level L1: 1.0
L2
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level L2: 0.7950322335987865
L3
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level L3: 0.7178612059158134
L4
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level L4: 0.6710276829730755
L5
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level L5: 0.643344709897611
L~
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level L~: 0.7237390974592339
L-1
Passed documents:  211898  out of:  400000
Passed documents:  5274  out of:  10000
Score at level L-1: 0.69984

In [2]:
levels = ['category_breadcrumb', 'L1', 'L2', 'L3', 'L4', 'L5', 'L~', 'L-1']

for level in levels:
    columnName = 'DomCat_10_' + level
    exploded_domcat = pd.DataFrame([(r['query'], labels['class_label'], labels['p'])
                                for i,r in queries.iterrows()
                                for labels in json.loads(r[columnName])
                               ], columns=['query', level, 'score'])
    exploded_domcat.to_csv(FILES_ROOT_PATH + 'queries_with_' + columnName.replace('~', '_') + '.tsv'
                           , sep='\t', encoding='utf-8')


NameError: name 'pd' is not defined

## Work in progress

In [None]:
import elasticsearch

In [None]:
queries

In [29]:
# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of 
# known idf's
max_idf = max(vectorizer.idf_)

In [30]:
#classifier = classifier.fit(doc_vectors[0:10000],y[0:10000])

In [31]:
import math
from scipy import sparse

class TfIdfTreeClassifier:
    """Class to construct a TfIdf vector and learn tree directly on the input vectors"""
    
    def __init__(self, idf_weights, useLogWeights=True):
        self._idf_weights = idf_weights
        self._useLogWeights = useLogWeights
        self._words = len(self._idf_weights)
        self._log_idf_weights = {
            w:self._idfToLogIdf(idf) for w,idf in self._idf_weights.items()
        }
        self._maxIdf = max([w for t,w in self._idf_weights.items()])
        self._maxLogIdf = max([w for t,w in self._log_idf_weights.items()])
        self._indexToWord = [w for w,idf in self._idf_weights.items()]
        self._wordToIndex = {w:i for i,w in enumerate(self._indexToWord)}
        self._classifier = ExtraTreesClassifier(min_samples_split=10,min_samples_leaf=3,n_estimators=100,bootstrap=True)
        
    def _idfToLogIdf(self, idf):
        df = self._words / idf
        return math.log((self._words + 0.5) / (df + 0.5))
    
    def _getIdf(self, word):
        if word in self._wordToIndex:
            idx = self._wordToIndex[word]
            if self._useLogWeights:
                idf = self._log_idf_weights[word]
            return idx, idf
        if self._useLogWeights:
            return self._words, self._maxLogIdf
        return self._words, self._maxIdf
    
    def _docToVector(self, doc):
        """Convert a document (list of words) to a sparse matrix (vector)"""
        a_idx = np.zeros(len(doc))
        a_idf = np.zeros(len(doc))
        for i in range(len(doc)):
            w = doc[i]
            idx, idf = self._getIdf(w)
            a_idx[i] = idx
            a_idf[i] = idf
        return sparse.csc_matrix((a_idf,(a_idx, np.zeros(len(a_idx)))),shape=(self._words+1,1))

    def fit(self, X, y):
        train = [self._docToVector(doc) for doc in X]
        self._classifier = self._classifier.fit(train, y)
        return self

    def predict(self, X):
        test = [self._docToVector(doc) for doc in X]
        return self._classifier.predict(test)

    def predict_log_proba(self, X):
        test = [self._docToVector(doc) for doc in X]
        return self._classifier.predict_log_proba(test)        

    def score(self, X, y):
        test = [self._docToVector(doc) for doc in X]
        return self._classifier.score(test, y)
        

    

In [32]:
tfIdfTreeClassifier = TfIdfTreeClassifier(idf_weights)
tfIdfTreeClassifier = tfIdfTreeClassifier.fit(X[:1000], y[:1000])

ValueError: setting an array element with a sequence.

In [None]:
idf_weights['knive']
[(i,w) for i,w in enumerate([0,1,2,0,1])]