In [1]:
from __future__ import print_function, unicode_literals

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc='progress')

from sklearn.preprocessing import LabelEncoder, MaxAbsScaler, QuantileTransformer
from sklearn.model_selection import train_test_split

import os
#import pickle
#import hashlib
import string
import unicodedata
import re
import gc
import time
import math
import torchtext
import wordbatch
from wordbatch.extractors import WordSeq

os.environ['OMP_NUM_THREADS'] = '4'

from collections import defaultdict, OrderedDict, Counter
#from nltk.corpus import stopwords
#from spacy.lang.en.stop_words import STOP_WORDS
from itertools import chain
num_partitions = 8
num_cores = 4
from multiprocessing import Pool, cpu_count

In [2]:
import gc
gc.collect()

0

### Text processing and tokenization

In [3]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 
              'there', 'about', 'once', 'during', 'out', 'very', 'having', 
              'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 
              'its', 'yours', 'such', 'into', 'most', 'itself', 'other', 
              'off', 'is', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 
              'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 
              'through', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 
              'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 
              'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them',
              'same', 'and', 'been', 'have', 'in', 'will', 'does', 'yourselves', 
              'then', 'that', 'because', 'what', 'over', 'why’, ‘so', 'can', 'did',
              'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only',
              'myself', 'which', 'those', 'i','after', 'few', 'whom', 'being', 'if', 
              'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']

def unicodeToAscii(s):
    return  unicodedata.normalize('NFKC', s)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"'", r"", s)
    s = re.sub(r"[.!?':;,]", r" ", s)
    s = re.sub(r"-", r"", s)
    s = re.sub(r"[^0-9a-zA-Z.!?]+", r" ", s)
    s = re.sub(r" 0 ", r"zero", s)
    s = re.sub(r" 1 ", r"one", s)
    s = re.sub(r" 2 ", r"two", s)
    s = re.sub(r" 3 ", r"three", s)
    s = re.sub(r" 4 ", r"four", s)
    s = re.sub(r" 5 ", r"five", s)
    s = re.sub(r" 6 ", r"six", s)
    #s = re.sub(r"7", r"seven", s)
    #s = re.sub(r"8", r"eight", s)
    #s = re.sub(r"/s/s", r"/s", s)
    return s

def _normalize_and_ngrams(sent, ngram):
    input_list = normalizeString(sent).split()
    input_list = [word for word in input_list if word not in stop_words]
    #s = input_list.copy()
    #for i in range(2, ngrams+1):
    #    s += [' '.join(input_list[j:j+i]) for j in range(len(input_list)-i + 1)]
        #s += list((zip(*[input_list[j:] for j in range(i)])))
    s = [''.join(input_list[i:i+ngram]) for i in range(len(input_list))]
    return ' '.join(s[:-1])

#tmp = "I am not a dance'r and i am a 6ixy   c-o:d;er programmer"
#print(normalizeString(tmp))
#print(_normalize_and_ngrams(tmp, 3))

class Vocab_topwords():
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.index2word = {}
        
    def fit_data(self, data, col, ngrams=3, max_features=50000):
        c = Counter(list(chain.from_iterable(data[col].tolist())))
        for i, (w, count) in enumerate(c.most_common(max_features)):
            self.word2index[w] = i
        return
    

            
            
def prepareVocab(name, data, max_features):
    vocab = Vocab_topwords(name)
    vocab.fit_data(data, name, max_features=max_features)
    
    print("Counted words:")
    print(vocab.name, len(vocab.word2index))
    return vocab


# Extra features

In [4]:
def get_cat_1(x): return str(x).split('/')[0]
def get_cat_2(x): return str(x).split('/')[1] if len(str(x).split('/')) > 1 else -1
def get_cat_3(x): return ' '.join(str(x).split('/')[2:]) if len(str(x).split('/')) > 2 else -1

def applycat1(df): 
    return df['category_name'].progress_apply(get_cat_1)
    

def applycat2(df): 
    return df['category_name'].progress_apply(get_cat_2)
    

def applycat3(df): 
    return df['category_name'].progress_apply(get_cat_3)

def get_words(series): return series.progress_apply(lambda x: len(str(x).split()))

def get_chars(series): return series.progress_apply(lambda x: len(str(x)))

def get_tokens(series): return np.sum(np.array(series.tolist()) > 0, axis=1)

def isphonecase(series): return series.str.contains(' case ', flags=re.IGNORECASE).astype(int)

def isiphone6(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('6|six', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone6p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('6|six', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone5(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('5|five', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone5p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('5|five', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone7(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('7|seven', flags=re.IGNORECASE) &
                        ~(series.str.contains('plus|\+', flags=re.IGNORECASE)) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isiphone7p(series): return (series.str.contains('iphone', flags=re.IGNORECASE) & 
                        series.str.contains('7|seven', flags=re.IGNORECASE) &
                        series.str.contains('plus|\+', flags=re.IGNORECASE) &
                                ~(series.str.contains('case', flags=re.IGNORECASE)) ).astype(int)

def isunlocked(series): return series.str.contains('unlocked', flags=re.IGNORECASE).astype(int)

def plussigns(series): return series.apply(lambda x: sum([(s == '+') | (s == '➕') for s in str(x)]))

def andsigns(series): return series.apply(lambda x: sum([(s == '&') | (s == ' and ') for s in str(x)]))

def commas(series): return series.apply(lambda x: sum([s == ',' for s in str(x)]))

def add_ngrams(text, ngram=2):
    word_list = normalizeString(text).split(' ')
    out_list = [''.join(word_list[i:i+ngram]) for i in range(len(word_list))]
    return ' '.join(out_list[:-1])

def get_2grams(series): return series.apply(lambda x: _normalize_and_ngrams(str(x), 2))

def norm3grams(s): return _normalize_and_ngrams(s, 3)

def applyname(series): return series.progress_apply(norm3grams)

def index2sent1(x, name_vocab): return indexesFromSentence(name_vocab, x, 3, 10)

def name2index(series): return series.progress_apply(lambda x: index2sent1(x, name_vocab))

def norm2grams(s): return _normalize_and_ngrams(s, 1)

def applydesc(series):return series.progress_apply(norm2grams)

def index2sent2(x, desc_vocab): return indexesFromSentence(desc_vocab, x, 1, 80)

def desc2index(series): return series.progress_apply(lambda x: index2sent2(x, desc_vocab))

def indexesFromSentence(vocab, tokens, ngrams, max_len):
    num_list = []
    for i, item in enumerate(tokens):
        if len(num_list) == max_len:
            break
        elif item in vocab.word2index:
            num_list.append(vocab.word2index[item])
        else:
            continue
        
    if len(num_list) < max_len :
        num_list += [0]*(max_len - len(num_list) )
        
    return num_list

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


def read_data(in_path, out_path):
    if os.path.exists(os.path.join(out_path, 'train_2.pkl')) and os.path.exists(os.path.join(out_path, 'test_2.pkl')):
        train_data = pd.read_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data  = pd.read_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data
    
    else:
        train_data = pd.read_table(os.path.join(in_path, 'train.tsv'))
        test_data  = pd.read_table(os.path.join(in_path, 'test.tsv'))
    
        train_rows = len(train_data)
        data = pd.concat([train_data, test_data], ignore_index=True)
    
        data['cat1'] = parallelize_dataframe(data[['category_name']], applycat1)
        data['cat2'] = parallelize_dataframe(data[['category_name']], applycat2)
        data['cat3'] = parallelize_dataframe(data[['category_name']], applycat3)
        data.fillna(-1, inplace=True)
        
        print("Getting word/char len features")
        data['desc_words'] = parallelize_dataframe(data['item_description'], get_words)
        data['desc_chars'] = parallelize_dataframe(data['item_description'], get_chars)
        data['name_words'] = parallelize_dataframe(data['name'], get_words)
        data['name_chars'] = parallelize_dataframe(data['name'], get_chars)
        
        
        print("Get iphone features")
        data['iphone_case'] = parallelize_dataframe(data['name'], isphonecase)
        data['iphone6'] = parallelize_dataframe(data['name'], isiphone6)
        data['iphone6p'] = parallelize_dataframe(data['name'], isiphone6p)
        data['iphone5'] = parallelize_dataframe(data['name'], isiphone5)
        data['iphone5p'] = parallelize_dataframe(data['name'], isiphone5p)
        data['iphone7'] = parallelize_dataframe(data['name'], isiphone7)
        data['iphone7p'] = parallelize_dataframe(data['name'], isiphone7p)
        data['unlocked_phone'] = parallelize_dataframe(data['name'], isunlocked)
        
        print("Label encoding features")
        cat_cols = ['category_name', 'brand_name', 'cat1', 'cat2', 'cat3', 'item_condition_id']
        for col in cat_cols:
            data[col] = LabelEncoder().fit_transform(data[col].astype(str)) + 1
            
        print("Get count features")
        data['brand_counts'] = data.brand_name.map(data["brand_name"].value_counts()).fillna(0).astype(int)

        data['cat_counts'] = data.brand_name.map(data["category_name"].value_counts()).fillna(0).astype(int)
        
        data['cat1_counts'] = data.brand_name.map(data["cat1"].value_counts()).fillna(0).astype(int)

        data['cat2_counts'] = data.brand_name.map(data["cat2"].value_counts()).fillna(0).astype(int)

        data['cat3_counts'] = data.brand_name.map(data["cat3"].value_counts()).fillna(0).astype(int)
  
        
        print("Getting punct related features")
        data["plus_counts"] = parallelize_dataframe(data["item_description"], plussigns)
        data["ands_counts"] = parallelize_dataframe(data["item_description"], andsigns)
        data["comma_counts"] = parallelize_dataframe(data["item_description"], commas)
        data["all_counts"] = data["plus_counts"] + data["ands_counts"] + data["comma_counts"]
        
        #for col in ["name", "item_description"]:
        #    data[col] = data[col].str.replace("'", '').replace('-', '').progress_apply(unicodeToAscii)
        #    data[col] = data[col].progress_apply(remove_puncts)
        
        
        num_cols =  ["desc_words", "desc_chars", "name_words", "name_chars", "plus_counts", 
                    "ands_counts", "comma_counts", "all_counts", "brand_counts", "cat1_counts", 
                   "cat2_counts", "cat3_counts"]
        data[num_cols]  = MaxAbsScaler().fit_transform(data[num_cols])
            
        data["brand_cat"] = data["brand_name"].astype(str) + ' ' + data["category_name"].astype(str)
        data["brand_cat"] = LabelEncoder().fit_transform(data["brand_cat"])
        
        data['item_desc2gram'] = parallelize_dataframe(data["item_description"], get_2grams)
        
        print("Name to sequences")
        wb_name = wordbatch.WordBatch(normalizeString, n_words=50000)
        wb_name.fit(data["name"])
        
        seq_name = WordSeq(wb_name, {"seq_maxlen": 7,  "seq_truncstart":True, "remove_oovs":True})
        seq_name_desc = WordSeq(wb_name, {"seq_maxlen": 20,  "seq_truncstart":False, "remove_oovs":True})
        data["item_name"] = list(zip(seq_name_desc.transform(wb_name.transform(data["item_description"].astype(str)))))
        data["name"] = list(zip(seq_name.transform(wb_name.transform(data["name"].astype(str)))))
        del wb_name, seq_name, seq_name_desc
        
        print("Desc to sequences")
        wb_desc = wordbatch.WordBatch(normalizeString, n_words=50000, extractor=(WordSeq, {"seq_maxlen": 70,
                                                                                           "seq_truncstart":False,
                                                                                           "remove_oovs":True
                                                                            } ))
        #wb_desc.fit(data["item_description"].astype(str))
        data["item_description"] = list(zip(wb_desc.fit_transform(data["item_description"].astype(str))))
        del wb_desc
        
        print("Desc 2gram to sequences")
        wb_desc2 = wordbatch.WordBatch(normalizeString, n_words=20000, extractor=(WordSeq, {"seq_maxlen": 20,
                                                                                            "seq_truncstart":False,
                                                                                            "remove_oovs":True
                                                                            } ))
        #wb_desc2.fit(data["item_desc2gram"].astype(str))
        #data["item_desc2gram"] = list(zip(wb_desc2.fit_transform(data["item_desc2gram"].astype(str))))
        del wb_desc2
        
        print("split train test")
        train_data = data.loc[: train_rows - 1, :].reset_index(drop=True)
        train_data = train_data.loc[(train_data.price >= 3) & (train_data.price <= 2000), :].reset_index(drop=True)
        test_data  = data.loc[train_rows: , :].reset_index(drop=True)
        
        del train_data['test_id']
        del test_data['train_id']
        del data 
        test_data['test_id'] = test_data['test_id'].astype(int)
        train_data.to_pickle(os.path.join(out_path, 'train_2.pkl'))
        test_data.to_pickle(os.path.join(out_path, 'test_2.pkl'))
        
        return train_data, test_data


In [5]:
%%time
train, test = read_data('../input/', './')

progress: 100%|██████████| 271987/271987 [00:00<00:00, 729573.98it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 696840.48it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 745604.76it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 702346.95it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 737820.18it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 727172.47it/s]
progress: 100%|██████████| 271986/271986 [00:00<00:00, 749138.08it/s]
progress: 100%|██████████| 271986/271986 [00:00<00:00, 741312.18it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 535222.71it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 551287.43it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 543523.30it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 514410.19it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 554447.46it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 551296.48it/s]
progress: 100%|█████

Getting word/char len features


progress: 100%|██████████| 271987/271987 [00:00<00:00, 408450.22it/s]
progress:  63%|██████▎   | 171773/271987 [00:00<00:00, 426196.38it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 411040.10it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 382811.85it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 399770.59it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 376824.23it/s]
progress: 100%|██████████| 271986/271986 [00:00<00:00, 407980.70it/s]
progress: 100%|██████████| 271986/271986 [00:00<00:00, 423028.50it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 714223.83it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 673474.74it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 705906.33it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 661952.38it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 738175.38it/s]
progress: 100%|██████████| 271987/271987 [00:00<00:00, 745762.68it/s]
progress: 100%|█████

Get iphone features
Label encoding features
Get count features
Getting punct related features
Name to sequences
Normalize text
Normalize text
Extract wordseqs
Normalize text
Extract wordseqs
Desc to sequences
Normalize text
Extract wordseqs
Desc 2gram to sequences
split train test
CPU times: user 1min 56s, sys: 38.9 s, total: 2min 35s
Wall time: 3min 52s


In [6]:
train

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,train_id,cat1,cat2,...,cat1_counts,cat2_counts,cat3_counts,plus_counts,ands_counts,comma_counts,all_counts,brand_cat,item_desc2gram,item_name
0,3,831,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([2579, 4682, 5037, 119, 19, 5, 51],)",10.0,1,0.0,6,104,...,0.046362,0.045164,0.001408,0.000000,0.000000,0.000000,0.000000,22240,nodescription descriptionyet,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,3891,88,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 5720, 12985, 10463, 1589],)",52.0,0,1.0,2,32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31503,keyboardgreat greatcondition conditionworks wo...,"([1058, 1589, 971, 99, 823, 938, 10, 241, 249,..."
2,4590,1279,1,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 0, 0, 0, 193],)",10.0,1,2.0,10,105,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.008264,0.008264,37612,adorabletop tophint hintof oflace lacekey keyh...,"([1343, 18, 71, 167, 22548, 27, 78, 10, 167, 6..."
3,3,505,1,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 0, 113, 1580, 12102],)",35.0,1,3.0,4,57,...,0.046362,0.045164,0.001408,0.000000,0.000000,0.000000,0.000000,21883,newtags tagsleather leatherhorses horsesretail...,"([4, 71, 759, 113, 5256, 2568, 7, 138, 2150, 9..."
4,3,1206,1,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 3471, 41, 910, 139],)",44.0,0,4.0,10,60,...,0.046362,0.045164,0.001408,0.000000,0.000000,0.000000,0.000000,21352,completecertificate certificateof ofauthenticity,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
5,3,1218,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 2185, 953, 4289, 7, 49987],)",59.0,0,5.0,10,74,...,0.046362,0.045164,0.001408,0.000000,0.000000,0.024793,0.024793,21365,bananarepublic republicbottoms bottomscandies ...,"([0, 0, 0, 633, 786, 602, 2733, 114, 71, 1426,..."
6,86,1278,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 2576, 5632, 21520, 12741, 18],)",64.0,0,6.0,10,100,...,0.000000,0.003136,0.000681,0.000000,0.000000,0.016529,0.016529,49308,sizesmall smallstraps strapsslightly slightlys...,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 38, 3024, 2885..."
7,4343,910,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([66, 1940, 10, 22655, 3, 27, 37],)",6.0,1,7.0,8,4,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,35337,getthree threepairs pairsof ofsophie sophieche...,"([525, 2349, 995, 304, 27, 5101, 1940, 21, 5, ..."
8,3339,910,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 0, 0, 66, 9, 169, 21],)",19.0,0,8.0,8,4,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,26545,girlssize sizesmall smallplus plusgreen greent...,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 66, 5, 38, ..."
9,3,1047,3,"([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","([0, 3217, 5205, 208, 9236, 56, 1768],)",8.0,0,9.0,9,31,...,0.046362,0.045164,0.001408,0.000000,0.000000,0.008264,0.008264,21176,realizedpants pantson onbackwards backwardspic...,"([503, 2901, 56, 2224, 59, 2512, 29, 1393, 101..."


In [7]:
#import spacy
#nlp = spacy.load("en")

#%%time
#for doc in nlp.pipe(train["name"], n_threads=4, batch_size=10000):
#    print(list(doc.noun_chunks))

#%%time
#wb_name = wordbatch.WordBatch(normalizeString, n_words=50000)
#wb_name.fit(train["name"])

#wb_name.dictionary.get('this')

#data = pd.concat([train, test], ignore_index=True)

#(data["brand_name"].astype(str) + data["category_name"].astype(str)).nunique()

#%%time
#seq_name = WordSeq(wb_name, {"seq_maxlen": 20,  "seq_truncstart":False, "remove_oovs":True})
#print(seq_name.transform(wb_name.transform(train["item_description"].fillna("missing").astype(str)))[:15])

#test = pd.read_table(os.path.join('../input/', 'test.tsv'))
#test.head()

#train.loc[train.price > 1500]

#wb_name.dft

In [8]:
class ToTensor():
    """Convert numpy arrays to tensors"""
    def __call__(self, sample):
        
        name = sample['name']
        text_description = sample['text_description']
        brand_name = sample['brand_name']
        category = sample['category']
        item_condition_id = sample['item_condition_id']
        shipping_flag = sample['shipping_flag']
        cat1 = sample['cat1']
        cat2 = sample['cat2']
        cat3 = sample['cat3']
        target = sample['target']
        
        return {
                'name': torch.from_numpy(np.asarray(name).astype(int)), 
                'text_description': torch.from_numpy(np.asarray(text_description).astype(int)), 
                'brand_name': torch.from_numpy(np.asarray(brand_name).astype(int)).long().view(-1), 
                'category': torch.from_numpy(np.asarray(category).astype(int)).long().view(-1),
                'item_condition_id': torch.from_numpy(np.asarray(item_condition_id).astype(int)).long().view(-1), 
                'shipping_flag': torch.from_numpy(np.asarray(shipping_flag).astype(int)).type(torch.FloatTensor).view(-1),
                'cat1': torch.from_numpy(np.asarray(cat1).astype(int)).long().view(-1),
                'cat2': torch.from_numpy(np.asarray(cat2).astype(int)).long().view(-1),
                'cat3': torch.from_numpy(np.asarray(cat3).astype(int)).long().view(-1),
                'target': torch.log1p(torch.from_numpy(np.asarray(target))).type(torch.FloatTensor).view(-1)
               }

class MercariDataset(Dataset):
    """Mercari item price prediction dataset"""
    def __init__(self, df, transform=None):
        """
        Args:
            df = dataframe with required columns
            train_file : file name for training data
            test_file: file name for test data
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data = df
        self.transform = transform
            
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        name = [self.data['name'].iloc[idx]]
        #print(item_name.shape)
        text_description = [self.data['item_description'].iloc[idx]]
        brand_name = [self.data['brand_name'].iloc[idx]]
        category = [self.data['category_name'].iloc[idx]]
        cat1 = [self.data['cat1'].iloc[idx]]
        cat2 = [self.data['cat2'].iloc[idx]]
        cat3 = [self.data['cat3'].iloc[idx]]
        item_condition_id = [self.data['item_condition_id'].iloc[idx]]
        shipping_flag = [self.data['shipping'].iloc[idx]]
        target = [self.data['price'].iloc[idx]]
        
        sample = {'name':name,
                 'text_description': text_description,
                 'brand_name': brand_name,
                 'category': category,
                 'item_condition_id': item_condition_id,
                 'shipping_flag': shipping_flag,
                 'cat1': cat1,
                 'cat2': cat2,
                 'cat3': cat3,
                 'target': target}
        #print(sample.shape)
        if self.transform:
            sample = self.transform(sample)
        
        return sample
    

# Some Useful Time functions
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [9]:
class MercariNet(nn.Module):
    def __init__(self, input_sizes):
        super(MercariNet, self).__init__()
        
        self.nameEmbedding = nn.EmbeddingBag(input_sizes[0][0], input_sizes[0][1], mode='mean')
        self.textEmbedding = nn.EmbeddingBag(input_sizes[1][0], input_sizes[1][1], mode='mean')
        self.brandEmbedding = nn.Embedding(input_sizes[2][0], input_sizes[2][1])
        self.categoryEmbedding = nn.Embedding(input_sizes[3][0], input_sizes[3][1])
        self.conditionEmbedding = nn.Embedding(input_sizes[4][0], input_sizes[4][1])
        self.cat1Embedding = nn.Embedding(input_sizes[5][0], input_sizes[5][1])
        self.cat2Embedding = nn.Embedding(input_sizes[6][0], input_sizes[6][1])
        self.cat3Embedding = nn.Embedding(input_sizes[7][0], input_sizes[7][1])
        
        all_dims_sum = sum([dim[1] for dim in input_sizes]) + 1
        self.bn1 = nn.BatchNorm1d(all_dims_sum, momentum=0.01)
        self.fc1 = nn.Linear(all_dims_sum, 150)
        self.relu = nn.SELU()
        self.bn2 = nn.BatchNorm1d(150, momentum=0.01)
        self.fc2 = nn.Linear(150, 1)
        
    def forward(self, inputs):
        item_name = inputs['name']
        text_description = inputs['text_description']
        brand_name = inputs['brand_name']
        category = inputs['category']
        item_condition_id = inputs['item_condition_id']
        shipping_flag = inputs['shipping_flag']
        cat1 = inputs['cat1']
        cat2 = inputs['cat2']
        cat3 = inputs['cat3']
        
        batch_size = item_name.size()[0]
        #print(item_name.size())
        nameEmbeds = self.nameEmbedding(item_name.view(batch_size, -1)).view(batch_size, 1, -1)
        #nameEmbeds = torch.mean(nameEmbeds, dim=1).view(batch_size, 1, -1)
        #print(nameEmbeds.size())
        textEmbeds = self.textEmbedding(text_description.view(batch_size, -1)).view(batch_size, 1, -1)
        #textEmbeds = torch.mean(textEmbeds, dim=1).view(batch_size, 1, -1)
        #print(textEmbeds.size())
        brandEmbeds = self.brandEmbedding(brand_name).view(batch_size, 1, -1)
        
        categoryEmbeds = self.categoryEmbedding(category).view(batch_size, 1, -1)
        
        conditionEmbeds = self.conditionEmbedding(item_condition_id).view(batch_size, 1, -1)
        
        cat1Embeds = self.cat1Embedding(cat1).view(batch_size, 1, -1)
        cat2Embeds = self.cat2Embedding(cat2).view(batch_size, 1, -1)
        cat3Embeds = self.cat3Embedding(cat3).view(batch_size, 1, -1)
        
        shipping_flag = shipping_flag.view(batch_size, 1, -1)
        #print(conditionEmbeds.size())
        #print(shipping_flag.size())
        #Concat all embeddings and input
        combined = torch.cat((nameEmbeds, textEmbeds, brandEmbeds, categoryEmbeds, 
                              conditionEmbeds,  shipping_flag, cat1Embeds, cat2Embeds, cat3Embeds), dim=2)
        combined = combined.view(batch_size, -1)
        x = self.bn1(combined)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.relu(self.fc1(x))
        x = self.bn2(x)
        x = F.dropout(x, p=0.02, training=self.training)
        output = self.fc2(x)
        #x = F.leaky_relu(x)
        #output = F.dropout(x, p=0.05, training=self.training)
        return output
        

In [10]:
# Training model function that uses the dataloader to load the data by Batch
def train_model(model, criterion, optimizer, num_epochs=5, print_every = 100):
    start = time.time()

    best_acc = 0.0
    print_loss_total = 0  # Reset every print_every

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                #scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            num_batches = dataset_sizes[phase]/BATCH_SIZE
            #running_corrects = 0

            # Iterate over data.
            for i_batch, sample_batched in enumerate(mercari_dataloaders[phase]): 
            # get the inputs
                inputs = {k: Variable(v) for k,v in sample_batched.items() if k != 'target'}
                #inputs = {'name':Variable(sample_batched['name']), 
                #          'item_description':Variable(sample_batched['item_desc']), \
                #    'brand_name':Variable(sample_batched['brand_name']), \
                #    'cat_name':Variable(sample_batched['cat_name']), \
                #    'general_category':Variable(sample_batched['general_category']), \
                #    'subcat1_category':Variable(sample_batched['subcat1_category']), \
                #    'subcat2_category':Variable(sample_batched['subcat2_category']), \
                #    'item_condition':Variable(sample_batched['item_condition']), \
                #    'shipping':Variable(sample_batched['shipping'].float())}
                #print(inputs)
                prices = Variable(sample_batched['target'])   
                batch_size = len(sample_batched['shipping_flag'])   
                

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                outputs = model(inputs)
                #_, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, prices)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.data[0]
                print_loss_total += loss.data[0]
                #running_corrects += torch.sum(preds == labels.data)
                
                
                if (i_batch+1) % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    #print (i_batch / num_batches, i_batch, num_batches)
                    print('%s (%d %d%%) %.4f' % (timeSince(start, i_batch / num_batches), \
                                                 i_batch, i_batch / num_batches*100, print_loss_avg))
                
                # I have put this just so that the Kernel will run and allow me to publish
                #if (i_batch) > 500:
                #    break

            epoch_loss = running_loss / num_batches
            #epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
            
        print()

    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    # load best model weights
    #model.load_state_dict(best_model_wts)
    return model
    
def predict(model, test_loader):
    preds = []
    for batch in test_loader:
        inputs = {k: Variable(v) for k,v in batch.items() if k != 'target'}
        #prices = Variable(batch['target'])
        outputs = model(inputs).data.numpy()
        preds.extend(outputs)
    return preds
    
    


In [11]:
BATCH_SIZE = 2048

dtrain, dvalid = train_test_split(train, test_size=0.05)
print("loading datasets")
mercari_datasets = {
                    'train': MercariDataset(dtrain,transform=transforms.Compose([ToTensor()])), 
                    'val': MercariDataset(dvalid,transform=transforms.Compose([ToTensor()]))
                   }
dataset_sizes = {
           x: len(mercari_datasets[x]) for x in ['train', 'val']
                }

mercari_dataloaders = {
           'train': torch.utils.data.DataLoader(mercari_datasets['train'], batch_size=BATCH_SIZE, shuffle=True, num_workers=4),
           'val': torch.utils.data.DataLoader(mercari_datasets['val'], batch_size=BATCH_SIZE, shuffle=True, num_workers=1)                                           
                }

loading datasets


In [12]:
mrnet = MercariNet([(50001, 50), (50001, 50), (6000, 50), (1600, 40), (6, 6), (16,6), (121, 10), (900, 20)])
print(mrnet)

MercariNet(
  (nameEmbedding): EmbeddingBag(50001, 50, mode=mean)
  (textEmbedding): EmbeddingBag(50001, 50, mode=mean)
  (brandEmbedding): Embedding(6000, 50)
  (categoryEmbedding): Embedding(1600, 40)
  (conditionEmbedding): Embedding(6, 6)
  (cat1Embedding): Embedding(16, 6)
  (cat2Embedding): Embedding(121, 10)
  (cat3Embedding): Embedding(900, 20)
  (bn1): BatchNorm1d(233, eps=1e-05, momentum=0.01, affine=True)
  (fc1): Linear(in_features=233, out_features=150)
  (relu): SELU
  (bn2): BatchNorm1d(150, eps=1e-05, momentum=0.01, affine=True)
  (fc2): Linear(in_features=150, out_features=1)
)


In [13]:
#torch.from_numpy(np.asarray(train['brand_name']).astype(int))

In [14]:
optimizer = optim.Adam(mrnet.parameters(), lr=0.005, weight_decay=0.001)
criterion = nn.MSELoss()
train_model(mrnet,criterion,optimizer, 5)

Epoch 0/4
----------
0m 16s (- 1m 38s) (99 14%) 1.9173
0m 35s (- 1m 27s) (199 28%) 0.4078
0m 50s (- 1m 5s) (299 43%) 0.3713
1m 5s (- 0m 47s) (399 58%) 0.3423
1m 21s (- 0m 30s) (499 72%) 0.3181
1m 36s (- 0m 14s) (599 87%) 0.3011
train Loss: 0.5688
val Loss: 0.3844

Epoch 1/4
----------
2m 23s (- 14m 14s) (99 14%) 0.6641
2m 43s (- 6m 40s) (199 28%) 0.2684
2m 58s (- 3m 51s) (299 43%) 0.2649
3m 13s (- 2m 19s) (399 58%) 0.2609
3m 28s (- 1m 18s) (499 72%) 0.2596
3m 43s (- 0m 32s) (599 87%) 0.2602
train Loss: 0.2638
val Loss: 0.2528

Epoch 2/4
----------
4m 31s (- 26m 54s) (99 14%) 0.5753
4m 50s (- 11m 53s) (199 28%) 0.2591
5m 5s (- 6m 37s) (299 43%) 0.2565
5m 21s (- 3m 51s) (399 58%) 0.2558
5m 36s (- 2m 6s) (499 72%) 0.2569
5m 50s (- 0m 51s) (599 87%) 0.2542
train Loss: 0.2569
val Loss: 0.2521

Epoch 3/4
----------
6m 39s (- 39m 32s) (99 14%) 0.5721
6m 57s (- 17m 5s) (199 28%) 0.2566
7m 13s (- 9m 22s) (299 43%) 0.2562
7m 28s (- 5m 23s) (399 58%) 0.2557
7m 43s (- 2m 54s) (499 72%) 0.2545
7m 5

MercariNet(
  (nameEmbedding): EmbeddingBag(50001, 50, mode=mean)
  (textEmbedding): EmbeddingBag(50001, 50, mode=mean)
  (brandEmbedding): Embedding(6000, 50)
  (categoryEmbedding): Embedding(1600, 40)
  (conditionEmbedding): Embedding(6, 6)
  (cat1Embedding): Embedding(16, 6)
  (cat2Embedding): Embedding(121, 10)
  (cat3Embedding): Embedding(900, 20)
  (bn1): BatchNorm1d(233, eps=1e-05, momentum=0.01, affine=True)
  (fc1): Linear(in_features=233, out_features=150)
  (relu): SELU
  (bn2): BatchNorm1d(150, eps=1e-05, momentum=0.01, affine=True)
  (fc2): Linear(in_features=150, out_features=1)
)

In [15]:
#del existing data loaders
del mercari_datasets
del mercari_dataloaders

In [16]:
del train

In [7]:



del train

#Make test predictions
test = pd.read_pickle("test_2.pkl")
test_dataset =  MercariDataset(test,transform=transforms.Compose([ToTensor()]))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_preds = predict(mrnet, test_loader)

print("Write out submission")
submission: pd.DataFrame = test[['test_id']]
submission['price'] = np.expm1(test_preds)
submission.price = submission.price.clip(1, 2000)
submission.to_csv("embedding_nn_v3.csv", index=False)


Preparing data
loading datasets
MercariNet(
  (nameEmbedding): EmbeddingBag(100000, 100, mode=mean)
  (textEmbedding): EmbeddingBag(100000, 100, mode=mean)
  (brandEmbedding): Embedding(6000, 30)
  (categoryEmbedding): Embedding(1600, 20)
  (conditionEmbedding): Embedding(5, 3)
  (cat1Embedding): Embedding(15, 4)
  (cat2Embedding): Embedding(120, 10)
  (cat3Embedding): Embedding(900, 20)
  (bn1): BatchNorm1d(288, eps=1e-05, momentum=0.1, affine=True)
  (fc1): Linear(in_features=288, out_features=150)
  (bn2): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True)
  (fc2): Linear(in_features=150, out_features=1)
)
Epoch 0/7
----------


RuntimeError: index out of range at /opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/TH/generic/THTensorMath.c:277

In [7]:
import wordbatch
from wordbatch import WordBatch
from wordbatch.extractors import WordSeq
import pandas as pd
import re

In [8]:
train = pd.read_table("../input/train.tsv")

In [9]:
# Define helpers for text normalization
from nltk.corpus import stopwords
stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')
def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])

In [15]:
wb = wordbatch.WordBatch(normalize_text, n_words=100000, extractor=(WordSeq, {"seq_maxlen": 70,
                                                                            "seq_truncstart":False,
                                                                             } ))

In [16]:
%%time
#wb.dictionary_freeze= True
X_name = wb.fit_transform(train['item_description'].astype(str).fillna("missing"))
#del wb

Normalize text
Extract wordseqs
CPU times: user 10.9 s, sys: 5.98 s, total: 16.9 s
Wall time: 19.7 s

Wall time: 19.7 s


In [17]:
X_name

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  22,
  26],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3105,
  10,
  4,
  111,
  15,
  705,
  24,
  2579,
  477,
  218,
  465,
  1183,
  6784,
  837,
  14104,
  44623,
  1214,
  1164],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [12]:
normalize_text("I am .a dic78 789")

'dic78 789'

In [92]:
train.name.loc[160]

"New Xlg woman's sexy summer halter top"