In [2]:
from elasticsearch import Elasticsearch
import pandas as pd
import math
import codecs
import numpy as np


In [3]:
FILES_ROOT_PATH = 'C:/Users/cpieterse/OneDrive - eBay Inc/High accuracy recall/'

## Relevant ads per query

In [4]:
df_labels = pd.read_csv('results/relevant_pairs.tsv', sep='\t')
# df_labels['relevant']=int(1)
df_labels.head()

Unnamed: 0.1,Unnamed: 0,adid,queryid,relevant
0,0,1000015,119,1
1,1,1000055,134,-1
2,2,1000061,24,-1
3,3,1000075,111,1
4,4,1000082,142,-1


## Matched ads per query

In [5]:
# Use same fields as the indexing
QUERY_FIELD_STRATEGIES = {"title.title_shingles":"my_shingle_analyzer"
                          , "title.title_stems":"my_stemmer_analyzer"
                          , "title.title_tokens":"my_analyzer"
                          , "title_category.stems":"my_stemmer_analyzer"
                          , "title_customized.stems":"my_SB_stemmer_analyzer"
                          , "title_category_customized.stems":"my_SB_stemmer_analyzer"}
QUERY_FIELDS = [field_to_use for field_to_use in QUERY_FIELD_STRATEGIES]

JOIN_FIELDS = ['queryid', 'adid', 'category', 'query_tokens']

df_query_match = None
for i, field_to_use in enumerate(QUERY_FIELDS):
    doc_path = "results/found_ads_" + field_to_use + ".tsv"
    if i == 0:
        df_query_match = pd.read_csv(doc_path, sep='\t')
        df_query_match['score_' + field_to_use] = df_query_match['score']
        continue
    df_query_match = pd.merge(left=df_query_match
                              , right=pd.read_csv(doc_path, sep='\t', usecols=JOIN_FIELDS + ['score'])
                              , on=JOIN_FIELDS
                              , how='outer'
                              , suffixes=['', '_' + field_to_use])
    
# df_query_match = pd.read_csv(doc_path, sep='\t')

In [6]:
df_exact_match = pd.read_csv('results/similar_listings_5_7.tsv'
                             , sep='\t'
                            , header=0
                            , names=['queryid', 'adid', 'score', 'doc', 'query', 'category', 'title'])
df_exact_match.head()

Unnamed: 0,queryid,adid,score,doc,query,category,title
0,1,1105337,14.690313,"{'id': '1105337', 'title': 'kazar: flashback #...",kazar comic,"Collectibles > Comics > Graphic Novels, TPBs",kazar: flashback #1 & kazar the savage #1 col...
1,1,1757740,14.537307,"{'id': '1757740', 'title': 'ka-zar kazar #2 ma...",kazar comic,Collectibles > Comics > Full Runs & Sets,ka-zar kazar #2 marvel comics vintage comic bo...
2,1,1276829,14.509985,"{'id': '1276829', 'title': 'ka-zar #1 cgc 9.8 ...",kazar comic,Collectibles > Comics > Other Comic Collectibles,ka-zar #1 cgc 9.8 marvel 1974 investment! kaza...
3,1,1733264,14.49975,"{'id': '1733264', 'title': 'comic kazar #1 c...",kazar comic,Collectibles > Comics > Collections,comic kazar #1 collectibles > comics > colle...
4,1,1480116,14.416349,"{'id': '1480116', 'title': 'bowen designs ka-z...",kazar comic,Collectibles > Comics > Figurines,bowen designs ka-zar marvel comics statue ka-z...


In [247]:
# For now, indexing issues at my machine
# df_query_match = df_exact_match 

In [7]:
df_labeled_results = pd.merge(df_query_match,df_labels, on=['queryid', 'adid'], how='left')
df_labeled_results['relevant'] = df_labeled_results['relevant'].apply(lambda x: 0 if np.isnan(x) else x)

In [9]:
print(df_labeled_results.shape, df_query_match.shape)

(374142, 15) (374142, 13)


In [10]:
df_labeled_results.head()

Unnamed: 0.1,queryid,adid,score,query,query_tokens,category,title,score_title.title_shingles,score_title.title_stems,score_title.title_tokens,score_title_category.stems,score_title_customized.stems,score_title_category_customized.stems,Unnamed: 0,relevant
0,1,1107250,46.6181,kazar comic,2,collectibles > comics > bronze age (1970-83) ...,Kazar Comic Lot Of 12,46.6181,15.481665,15.055187,12.901182,15.762724,13.012967,,0.0
1,1,1514206,38.877388,kazar comic,2,collectibles > comics > modern age (1992-now)...,Kazar Comic Book Lot 2 & 16,38.877388,15.159836,14.702044,12.978188,15.10162,12.945242,,0.0
2,1,1071866,37.764427,kazar comic,2,collectibles > comics > bronze age (1970-83) ...,kazar comic book # 1 MARVEL,37.764427,15.481665,15.055187,12.901182,15.441545,12.869501,,0.0
3,1,1632868,36.679214,kazar comic,2,collectibles > comics > bronze age (1970-83) ...,MARVEL TALES #27 VF- 7.5 SPIDER-MAN KAZAR COMI...,36.679214,14.249121,13.683632,12.621739,14.401587,12.540431,,0.0
4,1,1660096,36.605736,kazar comic,2,"collectibles > comics > graphic novels, tpbs",MARVEL MARVEL SUPERHEROES PRESENTS #19 KAZAR C...,36.605736,14.634363,14.120526,13.306538,14.559256,13.160347,,0.0


In [11]:
def trim_path_to_L(path, n):
    split_path = path.split(' > ')
    return ' > '.join(split_path[:n])
    
def dynamic_split_path(path):
    split_path = path.split(' > ')
    n = math.floor(len(split_path)/2)+1
    return ' > '.join(split_path[:n])

def L_minus1_split_path(path):
    split_path = path.split(' > ')
    n = len(split_path)-1
    return ' > '.join(split_path[:n])

if 'category_L~' not in df_labeled_results.columns:
    print('Regenerating columns')
    df_labeled_results['category_L1'] = df_labeled_results['category'].apply(lambda x: trim_path_to_L(x.lower(), 1))
    df_labeled_results['category_L2'] = df_labeled_results['category'].apply(lambda x: trim_path_to_L(x.lower(), 2))
    df_labeled_results['category_L3'] = df_labeled_results['category'].apply(lambda x: trim_path_to_L(x.lower(), 3))
    df_labeled_results['category_L4'] = df_labeled_results['category'].apply(lambda x: trim_path_to_L(x.lower(), 4))
    df_labeled_results['category_L5'] = df_labeled_results['category'].apply(lambda x: trim_path_to_L(x.lower(), 5))
    df_labeled_results['category_L-1'] = df_labeled_results['category'].apply(lambda x: L_minus1_split_path(x.lower()))
    df_labeled_results['category_L~'] = df_labeled_results['category'].apply(lambda x: dynamic_split_path(x.lower()))


Regenerating columns


## Query features

In [12]:
levels = ['category_breadcrumb', 'L2','L3','L4','L5', 'L~']
# levels = ['L~']

pd_querylkp = pd.read_csv(FILES_ROOT_PATH + 'queries.tsv', sep='\t')

for level in levels:
    columnName = 'DomCat_10_' + level
    pd_domcat = pd.merge(pd.read_csv(FILES_ROOT_PATH + 'queries_with_' + columnName.replace('~', '_') + '.tsv'
                            , sep='\t', encoding='utf-8')
                         , pd_querylkp
                         , on=['query'], how='inner')
    pd_domcat[level + '_score'] = pd_domcat['score']
    target_column = 'category' if level == 'category_breadcrumb' else 'category_' + level
    pd_domcat[target_column] = pd_domcat[level].map(lambda x: x.lower())
    pd_domcat['queryid'] = pd_domcat['query_id']
    pd_domcat = pd_domcat[['queryid', target_column, level + '_score']]
    df_labeled_results = pd.merge(df_labeled_results, pd_domcat
                                  , on=['queryid', target_column], how='left')
    



In [13]:
df_labeled_results.loc[df_labeled_results['queryid']==1].groupby(['relevant']).count()
# 19763/(19763+127952)
# Previous query 1 for Or's set: 0: 3020, 1: 481
# New results: 0:3021, 1:483

Unnamed: 0_level_0,queryid,adid,score,query,query_tokens,category,title,score_title.title_shingles,score_title.title_stems,score_title.title_tokens,...,category_L4,category_L5,category_L-1,category_L~,category_breadcrumb_score,L2_score,L3_score,L4_score,L5_score,L~_score
relevant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1.0,67,67,0,0,67,67,0,0,67,67,...,67,67,67,67,33,66,53,53,29,59
0.0,2954,2954,10,10,2954,2954,10,10,2946,2946,...,2954,2954,2954,2954,1525,2945,2588,2475,1594,2893
1.0,483,483,0,0,483,483,0,0,481,481,...,483,483,483,483,271,483,431,415,286,480


In [14]:
df_labeled_results.to_csv('results/labeled_results.tsv', sep='\t')

# Naive threshold to improve F1

In [15]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def best_threshold(df, column, verbose=False):
    min_t = df[column].min()
    max_t = df[column].max()
    best_score = dict(t=min_t, score=-100)
    real_pos = list(df['relevant'])
    
    for i in range(100):
        t = min_t + i * (max_t-min_t)/(100-1)
        pred_pos = list(df[column].apply(lambda x: 1 if x>=t else 0))
        score = f1_score(real_pos,pred_pos, pos_label=1)
        if score > best_score['score']:
            best_score = dict(t=t
                              , score=score
                              , recall=recall_score(real_pos,pred_pos)
                              , accuracy=accuracy_score(real_pos,pred_pos)
                              , precision=precision_score(real_pos,pred_pos))
            if verbose:
                print(best_score)
    return best_score
    


In [16]:
best_threshold(df_labeled_results, 'score', True)

ValueError: Target is multiclass but average='binary'. Please choose another average setting.

In [None]:
best_threshold(df_labeled_results, 'L2_score', True)

In [None]:
df_labeled_results['norm_title_category_customized.stems'] = df_labeled_results['score_title_category_customized.stems'] / df_labeled_results['query_tokens']
best_threshold(df_labeled_results, 'norm_title_category_customized.stems', True)

In [261]:
df_labeled_results.columns

Index(['queryid', 'adid', 'score', 'query', 'query_tokens', 'category',
       'title', 'score_title.title_shingles', 'score_title.title_stems',
       'score_title.title_tokens', 'score_title_category.stems',
       'score_title_customized.stems', 'score_title_category_customized.stems',
       'Unnamed: 0', 'relevant', 'category_L1', 'category_L2', 'category_L3',
       'category_L4', 'category_L5', 'category_L-1', 'category_L~',
       'category_breadcrumb_score', 'L2_score', 'L3_score', 'L4_score',
       'L5_score', 'L~_score'],
      dtype='object')

In [258]:
df_labeled_results['relevant'].sum(), df_labeled_results['relevant'].count()

(27688, 374142)

In [259]:
list(df_labeled_results['relevant'].apply(lambda x: 'POS' if x > 0 else 'NEG').unique())

['NEG', 'POS']