In [1]:
# Programming by Mojtaba Valipour @ SUTest-V1.0.0, vpcom.ir
# Copyright 2019
# Title: Deep Hierarchical Persian Text Classification based on hdlTex

# Information about the environments
# Environment: hdlTex, vpcomDesk -> hdlTex.yml
# Anaconda
# Python:3.5.6
# Tensorflow: 1.10.0
# Keras: 2.2.2
# Pandas: 0.23.4
# nltk: 3.3.0
# numpy: 1.15.2
# Cuda:9.0

# github.com/mvpcom/ddCh2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Dataset Visualization

In [4]:
# https://research.cafebazaar.ir/visage/divar_datasets/
dataset = pd.read_csv("./data/divar_posts_dataset.csv")
dataset.shape

(947635, 17)

In [6]:
# dataset[dataset['cat3'].str.contains("art")==True] # search query

In [7]:
dataset.iloc[1:5,[3,4,5,6,8,13,14]]

Unnamed: 0,cat1,cat2,cat3,city,desc,price,title
1,for-the-home,furniture-and-home-decore,antiques-and-art,Mashhad,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",30000,گلدون مصنوعی نخل
2,vehicles,cars,heavy,Mashhad,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,-1,لودر کاتر پیلار 950
3,for-the-home,furniture-and-home-decore,sofa-armchair,Tehran,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,600000,مبل راحتی هفت نفره بامیز جلو مبلی
4,personal,baby-and-toys,personal-toys,Karaj,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,450000,ماشین شارژی


In [8]:
print(dataset.cat1.unique(), len(dataset.cat1.unique()))
print(dataset.cat2.unique(), len(dataset.cat2.unique()))
print(dataset.cat3.unique(), len(dataset.cat3.unique()))

['for-the-home' 'vehicles' 'personal' 'electronic-devices' 'businesses'
 'leisure-hobbies'] 6
['furniture-and-home-decore' 'cars' 'baby-and-toys' 'parts-accessories'
 'utensils-and-appliances' 'clothing-and-shoes' 'mobile-tablet'
 'childrens-clothing-and-shoe' 'game-consoles-and-video-games'
 'audio-video' 'building-and-garden' 'jewelry-and-watches'
 'equipments-and-machinery' 'bicycle' 'animals' nan 'batch'
 'musical-instruments' 'health-beauty' 'motorcycles' 'computers'
 'sport-leisure' 'book-student-literature' 'utility' 'travel-packages'
 'hobby-collectibles' 'leisure-hobbies-toys' 'phone'] 28
['sofa-armchair' 'antiques-and-art' 'heavy' 'personal-toys' nan
 'cookware-tableware' 'light' 'clothing' 'mobile-phones' 'tv-projector'
 'garden-and-patio' 'watches' 'offices' 'kitchen' 'farm-animals' 'cat'
 'video-dvdplayer' 'shoes-belt-bag' 'industrial' 'tv-and-stereo-furniture'
 'birds' 'guitar-bass-amplifier' 'beds-bedroom' 'carpets'
 'mobile-tablet-accessories' 'fridge-and-freezer' 'ligh

In [9]:
# cat1 informations
cat1Classes = dataset.cat1.value_counts()
print(cat1Classes)
numClassesCat1 = cat1Classes.count()
print('Num of classes, Cat1: ', numClassesCat1)

for-the-home          290313
vehicles              206260
electronic-devices    166507
personal              139164
leisure-hobbies        83513
businesses             61878
Name: cat1, dtype: int64
Num of classes, Cat1:  6


In [10]:
# cat1 informations
print(dataset.cat2.unique())
cat2Classes = dataset.cat2.value_counts()
print(cat2Classes)
numClassesCat2 = cat2Classes.count()
print('Num of classes, Cat2: ', numClassesCat2)

['furniture-and-home-decore' 'cars' 'baby-and-toys' 'parts-accessories'
 'utensils-and-appliances' 'clothing-and-shoes' 'mobile-tablet'
 'childrens-clothing-and-shoe' 'game-consoles-and-video-games'
 'audio-video' 'building-and-garden' 'jewelry-and-watches'
 'equipments-and-machinery' 'bicycle' 'animals' nan 'batch'
 'musical-instruments' 'health-beauty' 'motorcycles' 'computers'
 'sport-leisure' 'book-student-literature' 'utility' 'travel-packages'
 'hobby-collectibles' 'leisure-hobbies-toys' 'phone']
furniture-and-home-decore        204445
cars                             130443
clothing-and-shoes                87096
mobile-tablet                     76307
utensils-and-appliances           58714
parts-accessories                 55986
animals                           50694
equipments-and-machinery          50101
game-consoles-and-video-games     31964
audio-video                       29176
computers                         26536
baby-and-toys                     20502
motorcycles 

In [11]:
# cat1 informations
cat3Classes = dataset.cat3.value_counts()
print(cat3Classes)
numClassesCat3 = cat3Classes.count()
print('Num of classes, Cat3: ', numClassesCat3)

light                            120451
mobile-phones                     62176
clothing                          53758
sofa-armchair                     49363
birds                             38511
shoes-belt-bag                    33338
tables-and-chairs                 30742
antiques-and-art                  29057
storage                           27702
cookware-tableware                24433
carpets                           22472
industrial                        19985
shop-and-cash                     16660
beds-bedroom                      16650
tv-projector                      16118
tv-and-stereo-furniture           13688
fridge-and-freezer                11152
stove-and-heating                 11128
childrens-furniture               10798
parts-and-accessories             10179
heavy                              9992
mobile-tablet-accessories          9790
training                           7720
instrument-cleaning-tailoring      7387
lighting                           7298


In [12]:
# Hierarchical
catClassesB = dataset.groupby(['cat1']).cat2.value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(catClassesB)

cat1                cat2                         
businesses          equipments-and-machinery          50101
                    batch                             11777
electronic-devices  mobile-tablet                     76307
                    game-consoles-and-video-games     31964
                    audio-video                       29176
                    computers                         26536
                    phone                              1193
for-the-home        furniture-and-home-decore        204445
                    utensils-and-appliances           58714
                    building-and-garden               18915
                    utility                            8239
leisure-hobbies     animals                           50694
                    bicycle                           11283
                    sport-leisure                     10374
                    musical-instruments                3854
                    book-student-literature       

In [13]:
# Hierarchical
catClasses = dataset.groupby(['cat1','cat2']).cat3.value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(catClasses)

cat1                cat2                       cat3                         
businesses          equipments-and-machinery   industrial                        19985
                                               shop-and-cash                     16660
                                               offices                            5148
                                               cafe-and-restaurant                4670
                                               barbershop-and-beautysalon         3117
electronic-devices  audio-video                tv-projector                      16118
                                               stereo-surround                    5726
                                               camera-camcoders                   5147
                                               mp3-player                          908
                                               video-dvdplayer                     856
                    computers                  parts-

In [14]:
# cat1Num 
# Hint: This will fix the NAN problem -> (NAN : -1)
dataset['cat1'] = dataset['cat1'].astype('category')
dataset['cat2'] = dataset['cat2'].astype('category')
dataset['cat3'] = dataset['cat3'].astype('category')

In [15]:
dataset.iloc[1:5,[3,4,5,6,8,13,14]]

Unnamed: 0,cat1,cat2,cat3,city,desc,price,title
1,for-the-home,furniture-and-home-decore,antiques-and-art,Mashhad,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",30000,گلدون مصنوعی نخل
2,vehicles,cars,heavy,Mashhad,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,-1,لودر کاتر پیلار 950
3,for-the-home,furniture-and-home-decore,sofa-armchair,Tehran,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,600000,مبل راحتی هفت نفره بامیز جلو مبلی
4,personal,baby-and-toys,personal-toys,Karaj,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,450000,ماشین شارژی


In [16]:
dataset[dataset.cat2.isnull()].iloc[1:15,[3,4,5,6,8,13,14]]

Unnamed: 0,cat1,cat2,cat3,city,desc,price,title
99,personal,,,Tehran,جنس چتر از ساتنه سایزش بزرگه\nدسته چتر با بدنه...,15000,چتر ساتن بنفش
164,personal,,,Tehran,رنگ آبی اکبند دانشجویی,20000,خودکار یوروپن اصل
619,personal,,,Tehran,گوی اعصاب کاملا تمیز و نو خیلی کم استفاده شده,40000,گوی اعصاب
710,electronic-devices,,,Tehran,ترموستات 900\nدر جعبه نو نو,-1,ترموستات 900
1391,electronic-devices,,,Tehran,کولر:موتورژن -3500,-1,دینام کولروماشین لباسشویی
1432,electronic-devices,,,Tehran,وارد کننده دوربینهای اوکای ویژن زیر قیمت واردات,60000,دوربین فروش کلی وجزعی
2213,electronic-devices,,,Tehran,لطفا قبل از تماس گرفتن اس ام اس بدهید,-1,یو پی اس سینوسی online
3610,electronic-devices,,,Mashhad,کیفیت 1mp\nدیددرشب20m,49000,دوربین دید درشب AHD
3858,personal,,,Tehran,مارک فابل کستل \nبا جعبه چوبی و فلزی \nاصل المان,120000,مداد رنگی حرفه ای ٢٥ رنگ پلی کروم
4557,electronic-devices,,,Mashhad,دو عدد ای سی 100w واقعی و تا 150 هم میرسه به ش...,25900,ای سی 100w امپلی فایر


In [19]:
cat1Classes = dataset['cat1'].cat.categories
cat2Classes = dataset['cat2'].cat.categories
cat3Classes = dataset['cat3'].cat.categories
dataset['Y1'] = dataset['cat1'].cat.codes
dataset['Y2'] = dataset['cat2'].cat.codes
dataset['Y3'] = dataset['cat3'].cat.codes

In [20]:
print(dataset.Y3.unique())

[50  1 26 42 -1 16 33 14 36 61 24 64 40 31 21 11 62 48 28 60  6 25  5 10
 37 23 34 55 54 38 30 51  8 59 52 53  3 18 41  7 35 63 20 13 43 17 49 32
 46  4 39 57 56 27 29 22  0  9 58 12 44 47 15  2 45 65 19]


In [21]:
newDataset = dataset.copy()

In [22]:
# save categories for later usage
cat1Classes = newDataset['cat1'].cat.categories
cat2Classes = newDataset['cat2'].cat.categories
cat3Classes = newDataset['cat3'].cat.categories
import json
with open('categoriesDivar.json', 'w', encoding='utf8') as outfile:
    json.dump({'cat1':cat1Classes.values.tolist(),'cat2':cat2Classes.values.tolist(),'cat3':cat3Classes.values.tolist()}, outfile, ensure_ascii=False)

In [23]:
newDataset['Y1'] = newDataset['cat1'].cat.codes
newDataset['Y2'] = newDataset['cat2'].cat.codes
newDataset['Y3'] = newDataset['cat3'].cat.codes

In [24]:
newDataset.iloc[1:5,[3,17,4,18,5,19,6,8,13,14]]

Unnamed: 0,cat1,Y1,cat2,Y2,cat3,Y3,city,desc,price,title
1,for-the-home,2,furniture-and-home-decore,12,antiques-and-art,1,Mashhad,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",30000,گلدون مصنوعی نخل
2,vehicles,5,cars,7,heavy,26,Mashhad,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,-1,لودر کاتر پیلار 950
3,for-the-home,2,furniture-and-home-decore,12,sofa-armchair,50,Tehran,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,600000,مبل راحتی هفت نفره بامیز جلو مبلی
4,personal,4,baby-and-toys,2,personal-toys,42,Karaj,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,450000,ماشین شارژی


In [25]:
#TODO: max, avg and min sequence length (title and desc)
#TODO: Build a dictionary, unique words
newDataset['descLength'] = newDataset['desc'].apply(len)
newDataset['titleLength'] = newDataset['title'].apply(len)

In [26]:
print('Desc = Mean:',newDataset['descLength'].mean(), 'Min:',newDataset['descLength'].min(), 'Max:',newDataset['descLength'].max())
print('Title = Mean:',newDataset['titleLength'].mean(), 'Min:',newDataset['titleLength'].min(), 'Max:',newDataset['titleLength'].max())

Desc = Mean: 76.45529238578145 Min: 4 Max: 995
Title = Mean: 17.69796071272167 Min: 3 Max: 50


In [27]:
#newDataset['context'] = newDataset.title + ' ' + newDataset.desc + ' ' + newDataset.desc + ' ' + newDataset.city + ' ' + newDataset.price.astype(str)
#TODO: Check differenet combinations of hacks
newDataset['context'] = newDataset.title + ' ' + newDataset.desc + ' ' + newDataset.city + ' ' + newDataset.price.astype(str)

In [28]:
newDataset.context = newDataset.context.str.replace('\n',' ')

In [32]:
# Helper funcitons
import re
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", " ", string)
    string = re.sub(r"\'", " ", string)
    string = re.sub(r"\"", " ", string)
    string = re.sub(r"/", " ", string)
    string = re.sub(r"//", " ", string)
    string = re.sub(r"$NUM", " ", string)
    #string = re.sub(r'[^\w\s]', '', string, re.UNICODE)
    string = re.sub(r'([a-z])\1+', r'\1', string, re.UNICODE)
    return string.strip().lower()

def text_cleaner(text):
    text = text.replace(":", " ")
    text = text.replace(";", " ")
    text = text.replace(".", " ")
    text = text.replace("&", " ")
    text = text.replace("%", " ")
    text = text.replace("$", " ")
    text = text.replace("#", " ")
    text = text.replace("%", " ")
    text = text.replace("@", " ")
    text = text.replace("!", " ")
    text = text.replace("+", " ")
    text = text.replace("-", " ")
    text = text.replace("_", " ")
    text = text.replace("[", " ")
    text = text.replace(",", " ")
    text = text.replace("،", " ")
    text = text.replace("]", " ")
    text = text.replace("(", " ")
    text = text.replace(")", " ")
    text = text.replace("{", " ")
    text = text.replace("}", " ")
    text = text.replace("\"", "")
    text = text.replace("-", " ")
    text = text.replace("=", " ")
    text = text.replace("~", " ")
    text = text.replace("<", " ")
    text = text.replace(">", " ")
    text = text.replace("«", " ")
    text = text.replace("*", " ")
    text = text.replace("❌", " ")
    text = text.replace("⚽", " ")
    text = text.replace("✅", " ")
    text = text.replace("⌛", " ")
    text = text.replace("⑤", " ")
    text = text.replace("•", " ")
    text = text.replace("♧", " ")
    text = text.replace("num", " ")
    text = text.replace(u'\u2013','')    
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]
    for rule in rules:
        for (k, v) in rule.items():
            regex = re.compile(k)
            text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()
    text.lower().replace("num", " ")
    text = re.sub(r'-?\d+\.?\d*', ' ', text)
    text = re.sub(u'\u200c',' ', text)
    text = re.sub(u'\u200e',' ', text)
    text = re.sub(u'\xad',' ', text)
    return text

In [33]:
sample = newDataset.context[282]
price = sample.split()[-1]
sample = clean_str(sample)
sample = text_cleaner(sample)
sample = sample + price
sample = re.split(r'([a-zA-Z]+)', sample)
sample = " ".join(str(item) for item in sample)
sample.split()

['پراید',
 'مدل',
 'پراید',
 'فنی',
 'سالم',
 'یکسال',
 'بیمه',
 'دوگانه',
 'سوز',
 'کارخانه',
 'لاستیک',
 'ماشین',
 'فوق',
 'به',
 'نرخ',
 'دور',
 'رنگ',
 'میباشد',
 'شماره',
 'تماس',
 'kermanshah',
 '8900000']

In [34]:
# build a dictionary
wordDict = {}
for idx,row in enumerate(newDataset.context):
    price = row.split()[-1]
    row = clean_str(row)
    row = text_cleaner(row)
    row = row + price
    row = re.split(r'([a-zA-Z]+)', row)
    row = " ".join(str(item) for item in row)
    words = row.split()
    for wrd in words:
        if wrd in wordDict:
            wordDict[wrd] += 1
        else:
            wordDict[wrd] = 1
    #if idx > 5000:
    #    break

In [35]:
len(wordDict)

256492

In [36]:
# save dictionary to file
import json
with open('wordDict.json', 'w', encoding='utf8') as outfile:
    json.dump(wordDict, outfile, ensure_ascii=False)

In [37]:
# load dictionary
with open("wordDict.json", "r") as read_file:
    wordDict = json.load(read_file)

In [38]:
def sanitize(x):
    price = x.split()[-1]
    x = clean_str(x)
    x = text_cleaner(x)
    x = x + price
    x = re.split(r'([a-zA-Z]+)', x)
    x = " ".join(str(item) for item in x)
    return x

In [39]:
newDataset['contextProcessed'] = newDataset.context.apply(lambda row: sanitize(row))

In [40]:
newDataset.iloc[11:15,[22,23]]

Unnamed: 0,context,contextProcessed
11,نوکیا6303 سلام.یه گوشیه6303سالم که فقط دوتا خط...,نوکیا سلام یه گوشیه سالم که فقط دوتا خط کوچیک...
12,لباس های دخترانه 2تا9ساله لباس های دخترانه از3...,لباس های دخترانه تا ساله لباس های دخترانه از ...
13,کمک فنر روغنی کمک فنر روغنی تعمیری Tehran 50000,کمک فنر روغنی کمک فنر روغنی تعمیری tehran 5...
14,کلش اف کلنز لول ۳۳ تاون حال لول ۶دیوارها لول ۴...,کلش اف کلنز لول تاون حال لول دیوارها لول ...


In [42]:
# export dataset to text files
import os
outputPath = './dataset/'

if not os.path.exists(outputPath):
    os.makedirs(outputPath)
    
for c in newDataset.columns:
    newDataset[c].to_csv(outputPath + c + '.txt', index=False)

In [43]:
# free memory
import gc
#del dataset, datasetNOW
gc.collect()

61

In [44]:
import pandas as pd

In [45]:
pathDataset = './dataset'
fname = os.path.join(pathDataset,"contextProcessed.txt")
fnamek = os.path.join(pathDataset,"Y1.txt")
fnameL2 = os.path.join(pathDataset,"Y2.txt")
fnameL3 = os.path.join(pathDataset,"Y3.txt")

In [46]:
content = pd.read_table(fname, header=None)
content = content[0].apply(str.strip)

contentk = pd.read_table(fnamek, header=None).values

contentL2 = pd.read_table(fnameL2, header=None).values

In [47]:
# (X_train, y_train, X_test, y_test, content_L2_Train, L2_Train, content_L2_Test, L2_Test, number_of_classes_L2,word_index,embeddings_index,number_of_classes_L1)

In [48]:
Label = contentk
Label_L2 = contentL2

number_of_classes_L1 = np.max(Label)+1 #number of classes in Level 1
number_of_classes_L2 = np.zeros(number_of_classes_L1,dtype=int)

np.random.seed(7)

Label = np.column_stack((Label, Label_L2))

 #number of classes in Level 2 that is 1D array with size of (number of classes in level one,1)

In [50]:
from keras.preprocessing.text import Tokenizer

MAX_NB_WORDS = 55000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(content)
sequences = tokenizer.texts_to_sequences(content)
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 256364 unique tokens.


In [52]:
from keras.preprocessing.sequence import pad_sequences
MAX_SEQUENCE_LENGTH = 500
content = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [53]:
indices = np.arange(content.shape[0])
np.random.shuffle(indices)
content = content[indices]
Label = Label[indices]
print(content.shape)

(947635, 500)


In [56]:
from sklearn.model_selection import train_test_split, cross_val_score

#X_train, X_test, y_train, y_test = train_test_split(content, Label, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test  = train_test_split(content, Label, test_size=0.5,random_state= 0)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

In [57]:
print(X_train.shape, X_val.shape, X_test.shape)

(473817, 500) (236909, 500) (236909, 500)


In [67]:
content = pd.read_table(fname, header=None)
content = content[0].apply(str.strip)
contentk = pd.read_table(fnamek, header=None).values
contentL2 = pd.read_table(fnameL2, header=None).values
contentL3 = pd.read_table(fnameL3, header=None).values

Label = contentk
Label_L2 = contentL2
Label_L3 = contentL3

np.random.seed(7)
Label = np.column_stack((Label, Label_L2, Label_L3))
LabelDF = pd.DataFrame(Label)    

labelsL1 = LabelDF[0].unique()
labelsL2 = LabelDF[1].unique()
labelsL3 = LabelDF[2].unique()

number_of_classes_L1 = len(labelsL1) #number of classes in Level 1
number_of_classes_L2 = len(labelsL2)
number_of_classes_L3 = len(labelsL3)

classes_L1 = np.zeros((number_of_classes_L1,))
classes_L2 = np.zeros((number_of_classes_L2,))
classes_L3 = np.zeros((number_of_classes_L3,))

print(classes_L2.shape)

(28,)


In [68]:
tmp = pd.DataFrame(y_val)
len(tmp[tmp[0]==0].values)

15422

In [69]:
number_of_classes_L2

28

In [70]:
LabelDF = pd.DataFrame(Label)
labelsL2 = []
labelsL2.append(LabelDF[LabelDF[0]==1][1].unique())
len(labelsL2[0])

6

In [71]:
labelsL1 = LabelDF[0].unique()

In [73]:
labelsL2 = []
for idx in range(number_of_classes_L1):
    print(idx,labelsL1[idx])
    LabelDF = pd.DataFrame(Label)    
    labelsL2.append(LabelDF[LabelDF[0]==labelsL1[idx]][1].unique())
    classes_L2[idx] = len(labelsL2[idx])

0 2
1 5
2 4
3 1
4 0
5 3


In [74]:
labelsL2

[array([12, 25,  6, 26]),
 array([ 7, 21, 19]),
 array([ 2,  9,  8, 16, 14, -1]),
 array([18, 13,  1, -1, 10, 22]),
 array([11,  3]),
 array([ 4,  0, 20, 23,  5, 24, 15, 17])]

In [75]:
embedder = 'fastTextEn'
embeddings_index = {}
'''
For CNN and RNN, we used the text vector-space models using $100$ dimensions as described in Glove. A vector-space model is a mathematical mapping of the word space
'''
if embedder == 'glove':
    Glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
    print(Glove_path)
    f = open(Glove_path, encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            print("Warnning"+str(values)+" in" + str(line))
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
elif embedder == 'fastTextEn':
    fastTextDir = './fastText/'
    fastText_path = os.path.join(fastTextDir, 'cc.fa.300.vec')
    print(fastText_path)
    embeddings_index = {}
    with open(fastText_path, encoding='utf8') as infile:
        #for idx,line in enumerate(infile):
        for line in infile:
            #if idx > 1: # skip the first line
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except:
                print("Warnning"+str(values)+" in" + str(line))
            if word in wordDict: # need only embedding for words that are in corpus
                embeddings_index[word] = coefs
    #f = open(fastText_path, encoding="utf8")
    #fLines = f.readlines()
    #firstLine = fLines.pop(0) # remove the first line
    #f.close()
    gc.collect()
    print('Total %s word vectors.' % len(embeddings_index))

./fastText/cc.fa.300.vec
Total 91286 word vectors.


In [77]:
# convert Keras model to layer based models
EMBEDDING_DIM = 300 #embedding dimension you can change it to {25, 100, 150, and 300} but need to change glove version

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [80]:
from keras.layers import Dense, Input, Flatten
input1 = Input((MAX_SEQUENCE_LENGTH,))

In [82]:
from keras.layers import Conv1D, MaxPooling1D, Embedding, concatenate, Dropout, LSTM, GRU, Bidirectional,SimpleRNN
layerM1Embedding = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)(input1)

In [83]:
layerM1 = GRU(100,dropout=0.2, recurrent_dropout=0.2)(layerM1Embedding)

In [84]:
layerM1

<tf.Tensor 'gru_1/TensorArrayReadV3:0' shape=(?, 100) dtype=float32>

In [85]:
input2 = Input((1,)) # price

In [86]:
layerM2 = Dense(100, activation='relu')(input2)

In [87]:
layerM2

<tf.Tensor 'dense_1/Relu:0' shape=(?, 100) dtype=float32>

In [88]:
from keras.layers import Add
layer = Add()([layerM1,layerM2])

In [89]:
layer

<tf.Tensor 'add_1/add:0' shape=(?, 100) dtype=float32>

In [92]:
nClasses = 3
from keras.models import Model
out = Dense(nClasses, activation='softmax')(layer)
model = Model(inputs=[input1,input2], outputs=out)

In [93]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 500, 300)     76909500    input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
gru_1 (GRU)                     (None, 100)          120300      embedding_1[0][0]                
__________________________________________________________________________________________________
dense_1 (D

In [94]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Persian Embedding (FastText)

#### You can ingore all the previous parts or use the main python code (dataDaysChallenge_BIGNet.py)
#### This file is only for your reference. some parts are not compatible with the latest changes
#### You need to prepare files like wordDict.json and all the preprocessed files

# Start: Prepare Dataset for the Model

In [2]:
# load dictionary
import json
with open("wordDict.json", "r") as read_file:
    wordDict = json.load(read_file)

In [3]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

import re
import numpy as np
from keras.models import Sequential
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [5]:
# Helper funcitons

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", " ", string)
    string = re.sub(r"\'", " ", string)
    string = re.sub(r"\"", " ", string)
    string = re.sub(r"/", " ", string)
    string = re.sub(r"//", " ", string)
    string = re.sub(r"$NUM", " ", string)
    #string = re.sub(r'[^\w\s]', '', string, re.UNICODE)
    string = re.sub(r'([a-z])\1+', r'\1', string, re.UNICODE)
    return string.strip().lower()

def text_cleaner(text):
    text = text.replace(":", " ")
    text = text.replace(";", " ")
    text = text.replace(".", " ")
    text = text.replace("&", " ")
    text = text.replace("%", " ")
    text = text.replace("$", " ")
    text = text.replace("#", " ")
    text = text.replace("%", " ")
    text = text.replace("@", " ")
    text = text.replace("!", " ")
    text = text.replace("+", " ")
    text = text.replace("-", " ")
    text = text.replace("_", " ")
    text = text.replace("[", " ")
    text = text.replace(",", " ")
    text = text.replace("،", " ")
    text = text.replace("]", " ")
    text = text.replace("(", " ")
    text = text.replace(")", " ")
    text = text.replace("{", " ")
    text = text.replace("}", " ")
    text = text.replace("\"", "")
    text = text.replace("-", " ")
    text = text.replace("=", " ")
    text = text.replace("~", " ")
    text = text.replace("<", " ")
    text = text.replace(">", " ")
    text = text.replace("«", " ")
    text = text.replace("*", " ")
    text = text.replace("❌", " ")
    text = text.replace("⚽", " ")
    text = text.replace("✅", " ")
    text = text.replace("⌛", " ")
    text = text.replace("⑤", " ")
    text = text.replace("•", " ")
    text = text.replace("♧", " ")
    text = text.replace("num", " ")
    text = text.replace(u'\u2013','')    
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]
    for rule in rules:
        for (k, v) in rule.items():
            regex = re.compile(k)
            text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()
    text.lower().replace("num", " ")
    text = re.sub(r'-?\d+\.?\d*', ' ', text)
    text = re.sub(u'\u200c',' ', text)
    text = re.sub(u'\u200e',' ', text)
    text = re.sub(u'\xad',' ', text)
    return text

In [6]:
import gc
def loadData_Tokenizer_Efficient(MAX_NB_WORDS,MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = 100, embedder = 'fastTextEn'):

    pathDataset = './dataset'
    fname = os.path.join(pathDataset,"contextProcessed.txt")
    fname2 = os.path.join(pathDataset,"price.txt")
    fnamek = os.path.join(pathDataset,"Y1.txt")
    fnameL2 = os.path.join(pathDataset,"Y2.txt")
    fnameL3 = os.path.join(pathDataset,"Y3.txt")

    content = pd.read_table(fname, header=None)
    content = content[0].apply(str.strip)
    content2 = pd.read_table(fname2, header=None, dtype='int64') # read price as integer

    contentk = pd.read_table(fnamek, header=None).values
    contentL2 = pd.read_table(fnameL2, header=None).values
    contentL3 = pd.read_table(fnameL3, header=None).values

    Label_L1 = contentk
    Label_L2 = contentL2
    Label_L3 = contentL3

    np.random.seed(7)
    Label = np.column_stack((Label_L1, Label_L2, Label_L3))
    LabelDF = pd.DataFrame(Label)    

    labelsL1 = LabelDF[0].unique()
    labelsL2 = LabelDF[1].unique()
    labelsL3 = LabelDF[2].unique()

    number_of_classes_L1 = len(labelsL1) #number of classes in Level 1
    number_of_classes_L2 = len(labelsL2)
    number_of_classes_L3 = len(labelsL3)    
    
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(content)
    sequences = tokenizer.texts_to_sequences(content)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    content = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    indices = np.arange(content.shape[0])
    np.random.shuffle(indices)
    content = content[indices]
    Label = Label[indices]
    print(content.shape)

    # join two inputs 
    content = np.concatenate((content,content2),axis=1)

    #TODO: Balance dataset
    X_train, X_test, y_train, y_test  = train_test_split(content, Label, test_size=0.3,random_state= 0, stratify=Label, shuffle=True)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0, stratify=y_train, shuffle=True)
    print(X_train.shape, X_val.shape, X_test.shape)

    embeddings_index = {}
    '''
    For CNN and RNN, we used the text vector-space models using $100$ dimensions as described in Glove. A vector-space model is a mathematical mapping of the word space
    '''
    if embedder == 'glove':
        Glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
        print(Glove_path)
        f = open(Glove_path, encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except:
                print("Warnning"+str(values)+" in" + str(line))
            embeddings_index[word] = coefs
        f.close()
        print('Total %s word vectors.' % len(embeddings_index))
    elif embedder == 'fastTextEn':
        fastTextDir = './fastText/'
        embedderName = 'cc.fa.' + str(EMBEDDING_DIM) + '.vec'
        fastText_path = os.path.join(fastTextDir, embedderName)
        print(fastText_path)
        embeddings_index = {}
        with open(fastText_path, encoding='utf8') as infile:
            #for idx,line in enumerate(infile):
            for line in infile:
                #if idx > 1: # skip the first line
                values = line.split()
                word = values[0]
                try:
                    coefs = np.asarray(values[1:], dtype='float32')
                except:
                    print("Warnning"+str(values)+" in" + str(line))
                if word in wordDict: # need only embedding for words that are in corpus
                    embeddings_index[word] = coefs
        gc.collect()
        print('Total %s word vectors.' % len(embeddings_index))
    return (tokenizer,LabelDF,X_train,y_train,X_val,y_val,X_test,y_test,labelsL1,labelsL2,labelsL3,number_of_classes_L1,number_of_classes_L2,number_of_classes_L3,word_index,embeddings_index)

# Model

In [7]:
from keras.models import Sequential
from keras.models import Model
import numpy as np
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, concatenate, Dropout, LSTM, GRU, Bidirectional,SimpleRNN

In [8]:
'''
buildModel_DNN(nFeatures, nClasses, nLayers=3,Numberof_NOde=100, dropout=0.5)
Build Deep neural networks Model for text classification
Shape is input feature space
nClasses is number of classes
nLayers is number of hidden Layer
Number_Node is number of unit in each hidden layer
dropout is dropout value for solving overfitting problem
'''
def buildModel_DNN(Shape, nClasses, nLayers=3,Number_Node=100, dropout=0.5):
    model = Sequential()
    model.add(Dense(Number_Node, input_dim=Shape))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(Number_Node, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='RMSprop',
                  metrics=['accuracy'])

    return model

In [9]:
from keras.layers import multiply
def attention_3d_block(inputs):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
    return output_attention_mul

In [10]:
'''
def buildModel_RNN(word_index, embeddings_index, nClasses, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM):
word_index in word index , 
embeddings_index is embeddings index, 
nClasses is number of classes, 
MAX_SEQUENCE_LENGTH is maximum lentgh of text sequences, 
EMBEDDING_DIM is an int value for dimention of word embedding 
'''
import keras.backend as K
from keras.layers import Concatenate
from attention_utils import get_activations, get_data_recurrent
def buildModel_RNN(word_index, embeddings_index, nClasses, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, type=0):
    model = Sequential()
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    '''
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    if type==0:
        model.add(GRU(100,dropout=0.2, recurrent_dropout=0.2))
    elif type==1:
        model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(LSTM(200,dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(nClasses, activation='softmax'))
    '''
    input1 = Input((MAX_SEQUENCE_LENGTH,),name='context')
    layerM1Embedding = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)(input1)
    input2 = Input((1,),name='price') # price
    if type==0:
        layer = GRU(100,dropout=0.2, recurrent_dropout=0.2)(layerM1Embedding)
    elif type==1:
        layer = GRU(100,dropout=0.2, recurrent_dropout=0.2)(layerM1Embedding)
        layer = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(layer)
        layer = MaxPooling1D(pool_size=2)(layer)
        layer = LSTM(200,dropout=0.2, recurrent_dropout=0.2)(layer)
    elif type==2:
        layerM1 = GRU(100,dropout=0.2, recurrent_dropout=0.2)(layerM1Embedding)
        layerM1 = Dense(nClasses, activation='softmax')(layerM1)
        layerM2 = Dense(nClasses, activation='softmax')(input2)
        layer = Concatenate()([layerM1,layerM2])     
    elif type==3:
        # attention
        attentionMul = attention_3d_block(layerM1Embedding)
        layer = GRU(100,dropout=0.2, recurrent_dropout=0.2)(attentionMul)
    out = Dense(nClasses, activation='softmax')(layer)
    model = Model(inputs=[input1,input2], outputs=out)
    model.summary()
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

In [11]:
'''
def buildModel_CNN(word_index,embeddings_index,nClasses,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,Complexity=0):
word_index in word index , 
embeddings_index is embeddings index,
nClasses is number of classes, 
MAX_SEQUENCE_LENGTH is maximum length of text sequences, 
EMBEDDING_DIM is an int value for dimention of word embedding, 
Complexity we have two different CNN model as follows 
Complexity=0 is simple CNN with 3 hidden layer
Complexity=2 is more complex model of CNN with filter_length of [3, 4, 5, 6, 7]
'''
def buildModel_CNN(word_index,embeddings_index,nClasses,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,Complexity=1):
    if Complexity==0:
        embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        embedding_layer = Embedding(len(word_index) + 1,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=True)
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
        embedded_sequences = embedding_layer(sequence_input)

        x = Conv1D(256, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(256, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(256, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)  # global max pooling
        x = Flatten()(x)
        x = Dense(256, activation='relu')(x)
        preds = Dense(nClasses, activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
    else:
        embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector

        embedding_layer = Embedding(len(word_index) + 1,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=True)

        convs = []
        filter_sizes = [3, 4, 5, 6, 7]

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)

        for fsz in filter_sizes:
            l_conv = Conv1D(128, filter_length=fsz, activation='relu')(embedded_sequences)
            l_pool = MaxPooling1D(5)(l_conv)
            convs.append(l_pool)

        l_merge = concatenate(convs,axis=1) # Merge(mode='concat', concat_axis=1)(convs)
        l_cov1 = Conv1D(128, 5, activation='relu')(l_merge)
        l_pool1 = MaxPooling1D(5)(l_cov1)
        l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
        l_pool2 = MaxPooling1D(30)(l_cov2)
        l_flat = Flatten()(l_pool2)
        l_dense = Dense(128, activation='relu')(l_flat)
        preds = Dense(nClasses, activation='softmax')(l_dense)
        model = Model(sequence_input, preds)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

    return model

# Train

In [12]:
#TODO: balance dataset with weighted loss 
MEMORY_MB_MAX = 1200000 # maximum memory you can use
MAX_SEQUENCE_LENGTH = 100 # Maximum sequance lentgh 500 words
MAX_NB_WORDS = 55000 # Maximum number of unique words
EMBEDDING_DIM = 300 #embedding dimension you can change it to {25, 100, 150, and 300} but need to change glove version
batch_size_L1 = int(3048/2) # batch size in Level 1
batch_size_L2 = int(3048/2) # batch size in Level 2
batch_size_L3 = int(3048/2) # batch size in Level 3
epochs = 10
rnnType = 3

In [13]:
import pandas as pd

#TODO: For now only RNN is working perfectly, need to change others later
L1_model = 2 # 0 is DNN, 1 is CNN, and 2 is RNN for Level 1
L2_model = 2 # 0 is DNN, 1 is CNN, and 2 is RNN for Level 2
L3_model = 2 # 0 is DNN, 1 is CNN, and 2 is RNN for Level 2

np.set_printoptions(threshold=np.inf)
'''
location of input data in two ways 
1: Tokenizer that is using GLOVE or FastText
1: loadData that is using couting words or tf-idf
'''

#X_train, y_train, X_test, y_test, content_L2_Train, L2_Train, content_L2_Test, L2_Test, number_of_classes_L2,word_index, embeddings_index,number_of_classes_L1 =  \
#        loadData_Tokenizer(MAX_NB_WORDS,MAX_SEQUENCE_LENGTH)

#X_train_DNN, y_train_DNN, X_test_DNN, y_test_DNN, content_L2_Train_DNN, L2_Train_DNN, content_L2_Test_DNN, L2_Test_DNN, number_of_classes_L2_DNN, number_of_classes_L1 = loadData()

tokenizer,LabelDF,X_train,y_train,X_val,y_val,X_test,y_test,labelsL1,labelsL2,labelsL3,number_of_classes_L1,number_of_classes_L2,number_of_classes_L3,word_index,embeddings_index = loadData_Tokenizer_Efficient(MAX_NB_WORDS,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)

print("Loading Data is Done")

Found 256364 unique tokens.
(947635, 100)
(530675, 101) (132669, 101) (284291, 101)
./fastText/cc.fa.300.vec
Total 91286 word vectors.
Loading Data is Done


In [16]:
from sklearn.utils import class_weight
classWeights = class_weight.compute_class_weight('balanced',np.unique(y_train[:,0]),y_train[:,0])
print(classWeights)

[2.55240198 0.94855199 0.54403431 1.8911209  1.13492491 0.76572501]


In [18]:
from keras.layers import Add

In [19]:
gc.collect()

0

In [20]:
#######################RNN Level 1########################
from keras.layers.core import *
from sklearn.utils import class_weight
TIME_STEPS = MAX_SEQUENCE_LENGTH
SINGLE_ATTENTION_VECTOR = False
if L1_model == 2:
    print('Create model of RNN')
    model = buildModel_RNN(word_index, embeddings_index,number_of_classes_L1,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,rnnType)
    classWeights = class_weight.compute_class_weight('balanced',np.unique(y_train[:,0]),y_train[:,0])
    model.fit([X_train[:,:-1],X_train[:,-1]], y_train[:,0],
              validation_data=([X_val[:,:-1],X_val[:,-1]], y_val[:,0]),
              epochs=epochs,
              batch_size=batch_size_L1,
              class_weight=classWeights)

Create model of RNN
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     76909500    context[0][0]                    
__________________________________________________________________________________________________
permute_1 (Permute)             (None, 300, 100)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 300, 100)     0           permute_1[0][0]                  
_________________________________________________________________________________________

In [21]:
from keras.models import load_model
modelL1Filename = './models/modelL1.h5'

In [24]:
model.save(modelL1Filename)

In [22]:
# load model
model = load_model(modelL1Filename)

In [27]:
y_train[y_train[:,0]==0,:][1:5,:]

array([[ 0, 11, 28],
       [ 0, 11,  7],
       [ 0, 11, 49],
       [ 0, 11, 40]])

In [28]:
labelsL1

array([2, 5, 4, 1, 0, 3])

In [30]:
X_val[y_val[:,0]==0,:].shape

(8663, 100)

In [31]:
X_val[y_val[:,0]==0,:].shape

(8663, 100)

In [32]:
import keras.backend as K
from sklearn.utils import class_weight
HDLTex = [] # Level 2 models is list of Deep Structure
######################RNN Level 2################################
if L2_model == 2:
    for idx in range(0, number_of_classes_L1):
        print('Create Sub model of ', idx)
        classes = LabelDF[LabelDF[0]==idx][1].unique()
        #classesL2.append(classes)
        numberSamples = len(LabelDF[LabelDF[0]==idx])
        print(classes, ' Number of samples: ', numberSamples)
        #HDLTex.append(Sequential()) # memory limitation
        model = Sequential()
        #HDLTex[idx] = buildModel_RNN(word_index, embeddings_index,len(classes),MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)
        model = buildModel_RNN(word_index, embeddings_index,len(classes),MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)
        labelTrain = y_train[y_train[:,0]==idx,1]
        for clsIdx,cls in enumerate(classes):
            labelTrain[labelTrain==cls] = clsIdx
        labelVal = y_val[y_val[:,0]==idx,1]
        for clsIdx,cls in enumerate(classes):
            labelVal[labelVal==cls] = clsIdx
        #HDLTex[idx].fit(X_train[y_train[:,0]==idx,:], labelTrain,
        classWeights = class_weight.compute_class_weight('balanced',np.unique(labelTrain),labelTrain)
        model.fit(X_train[y_train[:,0]==idx,:], labelTrain,
                      validation_data=(X_val[y_val[:,0]==idx,:], labelVal),
                      epochs=epochs,
                      batch_size=batch_size_L2,
                      class_weight = classWeights)
        # save model
        modelL2Filename = './models/modelL2_'+ str(idx)+'.h5'
        #HDLTex[idx].save(modelL2Filename)
        model.save(modelL2Filename)
        del model
        K.clear_session()
        gc.collect()

Create Sub model of  0
[11  3]  Number of samples:  61878
Train on 34652 samples, validate on 8663 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Create Sub model of  1
[18 13  1 -1 10 22]  Number of samples:  166507
Train on 93243 samples, validate on 23313 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Create Sub model of  2
[12 25  6 26]  Number of samples:  290313
Train on 162574 samples, validate on 40643 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Create Sub model of  3
[ 4  0 20 23  5 24 15 17]  Number of samples:  83513
Train on 46769 samples, validate on 11690 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Create Sub model of  4
[ 2  9  8 16 14 -1]  Number of samples:  139164
Train on 77931 

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [81]:
classesL2 = []
for idx in range(0, number_of_classes_L1):
        classes = LabelDF[LabelDF[0]==idx][1].unique()
        classesL2.append(list(classes))
print(classesL2)        

[[12, 4], [19, 14, 2, 0, 11, 23], [13, 26, 7, 27], [5, 1, 21, 24, 6, 25, 16, 18], [3, 10, 9, 17, 15, 0], [8, 22, 20]]


In [14]:
classesL2 = [[11,3],[18,13,1,-1,10,22],[12,25,6,26],[4,0,20,23,5,24,15,17],[2,9,8,16,14,-1],[7,21,19]],

In [19]:
classes = LabelDF[1].unique()
print(len(classes))
print(number_of_classes_L2)

28
28


In [21]:
LabelDF[LabelDF[1]==0][2].unique()

array([21, 11,  6, 22,  0, 47])

In [164]:
# free memory
from keras import backend as K
K.clear_session()
gc.collect()

760830

In [34]:
classesL3 = []
for idx in range(0, number_of_classes_L2):
        classes = LabelDF[LabelDF[1]==idx][2].unique()
        classesL3.append(list(classes))
print(classesL3)        

[[21, 11, 6, 22, 0, 47], [61, 62, 51, 8, 39, -1], [42, 54, 13, 12], [-1], [-1], [20], [24, 31, 53, 4, -1], [26, 33], [-1], [14, 48], [38, 41, 17, 32, 44], [40, 28, 3, 7, 49, -1], [50, 1, 60, 5, 10, 34, 55, 52, 57, -1], [-1], [-1], [27, 15, -1], [64, 30, 46, -1], [-1], [36, 37, 56], [-1], [25, 43, -1, 58, 19], [-1], [-1], [59, 9, 2, 65], [-1], [16, 23, -1, 18, 35, 63], [-1, 29, 45], []]


In [None]:
# classesL3 = [[21, 11, 6, 22, 0, 47], [61, 62, 51, 8, 39, -1], [42, 54, 13, 12], [-1], [-1], [20], 
# [24, 31, 53, 4, -1], [26, 33], [-1], [14, 48], [38, 41, 17, 32, 44], [40, 28, 3, 7, 49, -1], 
# [50, 1, 60, 5, 10, 34, 55, 52, 57, -1], [-1], [-1], [27, 15, -1], [64, 30, 46, -1], [-1], [36, 37, 56],
# [-1], [25, 43, -1, 58, 19], [-1], [-1], [59, 9, 2, 65], [-1], [16, 23, -1, 18, 35, 63], [-1, 29, 45], []]

In [36]:
epochs = 20
import keras.backend as K
from sklearn.utils import class_weight
HDLTexL3 = [] # Level 3 models is list of Deep Structure
######################RNN Level 3################################
L3_model = 2
if L3_model == 2:
    for idx in range(0, number_of_classes_L2):
        print('Create Sub model of ', idx)
        classes = LabelDF[LabelDF[1]==idx][2].unique()
        if len(classes) < 2:
            continue
        #classesL3.append(list(classes))
        numberSamples = len(LabelDF[LabelDF[1]==idx])
        print(classes, ' Number of samples: ', numberSamples)
        #HDLTexL3.append(Sequential())
        model = Sequential()
        #HDLTexL3[idx] = buildModel_RNN(word_index, embeddings_index,len(classes),MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)
        model = buildModel_RNN(word_index, embeddings_index,len(classes),MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)
        labelTrain = y_train[y_train[:,1]==idx,2]
        for clsIdx,cls in enumerate(classes):
            labelTrain[labelTrain==cls] = clsIdx
        labelVal = y_val[y_val[:,1]==idx,2]
        for clsIdx,cls in enumerate(classes):
            labelVal[labelVal==cls] = clsIdx
        #HDLTexL3[idx].fit(X_train[y_train[:,1]==idx,:], labelTrain,
        classWeights = class_weight.compute_class_weight('balanced',np.unique(labelTrain),labelTrain)
        model.fit(X_train[y_train[:,1]==idx,:], labelTrain,
                      validation_data=(X_val[y_val[:,1]==idx,:], labelVal),
                      epochs=epochs,
                      batch_size=batch_size_L3,
                      class_weight = classWeights)
        # save model
        modelL3Filename = './models/modelL3_'+ str(idx)+'.h5'
        #HDLTexL3[idx].save(modelL3Filename)
        model.save(modelL3Filename)
        del model
        K.clear_session()
        gc.collect()

Create Sub model of  16
[64 30 46 -1]  Number of samples:  10316
Train on 5776 samples, validate on 1445 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Create Sub model of  17
Create Sub model of  18
[36 37 56]  Number of samples:  76307
Train on 42731 samples, validate on 10684 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Create Sub model of  19
Create Sub model of  20
[25 43 -1 58 19]  Number of samples:  3854
Train on 2157 samples, validate on 540 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoc

Epoch 18/20
Epoch 19/20
Epoch 20/20
Create Sub model of  21
Create Sub model of  22
Create Sub model of  23
[59  9  2 65]  Number of samples:  10374
Train on 5809 samples, validate on 1452 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Create Sub model of  24
Create Sub model of  25
[16 23 -1 18 35 63]  Number of samples:  58714
Train on 32879 samples, validate on 8220 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Create Sub model of  26
[-1 29 45]  Number of samples:  8239
Train on 4614 samples, validate on 1153 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Create Sub model of  27


# Time to Test

In [25]:
from keras.models import load_model
index = 120
numberSamples = 5
xSample = X_test[index:index+numberSamples]
ySample = y_test[index:index+numberSamples]
print(xSample.shape, ySample.shape)

(5, 100) (5, 3)


In [26]:
results = -np.ones_like(ySample)
print(results)

[[-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]]


In [110]:
# Level 1 Test
# load model
modelFilename = 'modelL1'+'.h5'
model = load_model(modelFilename)
# test model
yPred = model.predict(xSample)
predL1Class = yPred.argmax(axis=1)
results[:,0] = predL1Class
print(results)
print(ySample)

[[ 2 -1 -1]
 [ 2 -1 -1]
 [ 2 -1 -1]
 [ 4 -1 -1]
 [ 0 -1 -1]]
[[ 2 12 10]
 [ 2 25 63]
 [ 2 12 60]
 [ 4  9 14]
 [ 0 11 40]]


In [112]:
# Level 2 Test
#print(classesL2)
#predL2Class = []
predL1Class = results[:,0]
for idx, smp in enumerate(predL1Class):
    #print(smp)
    if len(classesL2[smp]) < 2:
        # return -1 or the class number
        if classesL2[smp][0]==-1 or classesL2[smp][0] is None:
            #predL2Class.append(-1)
            results[idx,1] = -1
        else:
            #predL2Class.append(classesL2[smp][0])
            results[idx,1] = classesL2[smp][0]
        continue
    # load related model 
    modelFilename = 'modelL2_'+ str(smp)+ '.h5'
    model = load_model(modelFilename)
    yPred = model.predict(xSample[idx:idx+1,:])
    predClass = yPred.argmax()
    #predL2Class.append(classesL2[smp][predClass])
    results[idx,1] = classesL2[smp][predClass]
    print(smp, results[idx,1], end=':', flush=True)
    del model
    K.clear_session()
    gc.collect()
#results[idx,1] = predL2Class

2 12:2 25:2 12:4 9:0 11:

In [113]:
print(results)
print(ySample)

[[ 2 12 -1]
 [ 2 25 -1]
 [ 2 12 -1]
 [ 4  9 -1]
 [ 0 11 -1]]
[[ 2 12 10]
 [ 2 25 63]
 [ 2 12 60]
 [ 4  9 14]
 [ 0 11 40]]


In [114]:
# Level 3 Test
#print(classesL3)
#predL3Class = []
predL2Class = results[:,1]
for idx, smp in enumerate(predL2Class):
    #print(smp)
    if len(classesL3[smp]) < 2:
        # return -1 or the class number
        if classesL3[smp][0]==-1 or classesL3[smp][0] is None:
            #predL3Class.append(-1)
            results[idx,2] = -1
        else:
            #predL3Class.append(classesL3[smp][0])
            results[idx,2] = classesL3[smp][0]
        continue
    # load related model 
    modelFilename = './models/modelL3_'+ str(smp)+ '.h5'
    model = load_model(modelFilename)
    yPred = model.predict(xSample[idx:idx+1,:])
    predClass = yPred.argmax()
    #predL3Class.append(classesL3[smp][predClass])
    results[idx,2] = classesL3[smp][predClass]
    print(smp, results[idx,2], end=':', flush=True)
    del model
    K.clear_session()
    gc.collect()
#results[:,2] = predL2Class

12 10:25 63:12 60:9 14:11 40:

In [115]:
print(results)
print(ySample)

[[ 2 12 10]
 [ 2 25 63]
 [ 2 12 60]
 [ 4  9 14]
 [ 0 11 40]]
[[ 2 12 10]
 [ 2 25 63]
 [ 2 12 60]
 [ 4  9 14]
 [ 0 11 40]]


In [153]:
print(idx,'/',len(predL1Class))

1928 / 236909


In [37]:
import glob
modelExists = glob.glob('./models/*.h5') 
print(modelExists)

['./models/modelL3_20.h5', './models/modelL3_7.h5', './models/modelL3_16.h5', './models/modelL3_2.h5', './models/modelL3_6.h5', './models/modelL2_4.h5', './models/modelL3_9.h5', './models/modelL3_26.h5', './models/modelL3_18.h5', './models/modelL3_25.h5', './models/modelL3_23.h5', './models/modelL3_10.h5', './models/modelL2_5.h5', './models/modelL3_15.h5', './models/modelL1.h5', './models/modelL3_12.h5', './models/modelL2_2.h5', './models/modelL3_1.h5', './models/modelL2_3.h5', './models/modelL3_0.h5', './models/modelL2_1.h5', './models/modelL3_11.h5', './models/modelL2_0.h5']


In [38]:
from tqdm import tqdm_notebook as tqdm
# all test data
#X_test, y_test
x = X_test
y = y_test
print(x.shape,y.shape)
results = -np.ones_like(y)

# Level 1 Test
# load model
modelFilename = './models/modelL1'+'.h5'
model = load_model(modelFilename)
# test model
yPred = model.predict(x, verbose=1, batch_size=256)
predL1Class = yPred.argmax(axis=1)
results[:,0] = predL1Class
print('Level 1 is done')

# TODO: Parallel Computing, batchSize
# Level 2 Test
predL1Class = results[:,0]

(284291, 100) (284291, 3)
Level 1 is done


In [204]:
y[40:50,:]

array([[ 5,  7, 33],
       [ 4,  9, 48],
       [ 5,  7, 33],
       [ 1, 13, -1],
       [ 4, 16, 64],
       [ 2, 12, 60],
       [ 2, 12, 50],
       [ 1, 18, 36],
       [ 5,  7, 33],
       [ 5,  7, 33]])

In [205]:
results[40:50,:]

array([[ 5,  7, 33],
       [ 4,  9, 48],
       [ 5,  7, 33],
       [ 1,  1, -1],
       [ 4, 16, 64],
       [ 2, 12, 60],
       [ 2, 12, 50],
       [ 1, 18, 36],
       [ 5,  7, 33],
       [ 5,  7, 33]])

In [39]:
from tqdm import tqdm_notebook as tqdm
# all test data
#X_test, y_test
x = X_test
y = y_test
print(x.shape,y.shape)
results = -np.ones_like(y)

# Level 1 Test
# load model
modelFilename = './models/modelL1'+'.h5'
model = load_model(modelFilename)
# test model
yPred = model.predict(x, verbose=1, batch_size=2048)
predL1Class = yPred.argmax(axis=1)
results[:,0] = predL1Class
print('Level 1 is done')

# Level 2 Test
predL1Class = results[:,0]
for cls in np.unique(predL1Class):
    indexes = predL1Class==cls
    print('Selected Indices for class ',cls,': ',len(indexes[indexes]),'/',len(indexes))
    # load related model 
    modelFilename = './models/modelL2_'+ str(cls)+ '.h5'
    if modelFilename in modelExists:
        model = load_model(modelFilename)  
        yPred = model.predict(x[indexes,:], verbose=1, batch_size=2048)
        predClasses = yPred.argmax(axis=1)
        for idx, value in enumerate(classesL2[cls]):
            predClasses[predClasses==idx] = value
        results[indexes,1] = predClasses
        del model
        K.clear_session()
        gc.collect()
    else:
        if len(classesL2[cls]) < 2:
            if classesL2[cls][0]==-1 or classesL2[cls][0] is None:
                results[indexes,1] = -1
            else:
                results[indexes,1] = classesL2[cls][0]
print('Level 2 is done')  

# Level 3 Test
predL2Class = results[:,1]
for cls in np.unique(predL2Class):
    indexes = predL2Class==cls
    print('Selected Indices for class ',cls,': ',len(indexes[indexes]),'/',len(indexes))
    # load related model 
    modelFilename = './models/modelL3_'+ str(cls)+ '.h5'
    if modelFilename in modelExists:
        model = load_model(modelFilename)  
        yPred = model.predict(x[indexes,:], verbose=1, batch_size=2048)
        predClasses = yPred.argmax(axis=1)
        for idx, value in enumerate(classesL3[cls]):
            predClasses[predClasses==idx] = value
        results[indexes,2] = predClasses
        del model
        K.clear_session()
        gc.collect()
    else:
        if cls == -1:
            results[indexes,2] = -1
            continue
        if len(classesL3[cls]) < 2:
            if classesL3[cls][0]==-1 or classesL3[cls][0] is None:
                results[indexes,2] = -1
            else:
                results[indexes,2] = classesL3[cls][0]
print('Level 3 is done')

(284291, 100) (284291, 3)
Level 1 is done
Selected Indices for class  0 :  16350 / 284291
Selected Indices for class  1 :  49927 / 284291
Selected Indices for class  2 :  88507 / 284291
Selected Indices for class  3 :  24650 / 284291
Selected Indices for class  4 :  42747 / 284291
Selected Indices for class  5 :  62110 / 284291
Level 2 is done
Selected Indices for class  -1 :  387 / 284291
Selected Indices for class  0 :  18570 / 284291
Selected Indices for class  1 :  18448 / 284291
Selected Indices for class  3 :  2499 / 284291
Selected Indices for class  6 :  5747 / 284291
Selected Indices for class  7 :  39104 / 284291
Selected Indices for class  8 :  9830 / 284291
Selected Indices for class  9 :  26578 / 284291
Selected Indices for class  10 :  7814 / 284291
Selected Indices for class  11 :  13851 / 284291
Selected Indices for class  12 :  62063 / 284291
Selected Indices for class  14 :  3000 / 284291
Selected Indices for class  15 :  613 / 284291
Selected Indices for class  16 : 

In [40]:
# calculate accuracy
eq = y == results

cat1Acc = len(eq[eq[:,0]==True])/len(eq[:,0])
cat2Acc = len(eq[eq[:,1]==True])/len(eq[:,1])
cat3Acc = len(eq[eq[:,2]==True])/len(eq[:,2])

eqS = eq[eq[:,0]==True,:]
eqS = eqS[eqS[:,1]==True,:]
eqS = eqS[eqS[:,2]==True,:]
totallAcc =  len(eqS)/len(eq)

print(totallAcc,cat1Acc,cat2Acc,cat3Acc)

0.8200224417937958 0.9599846636017321 0.8726445789701398 0.824774614743344


In [None]:
# prevRes = [0.8216361556547028 0.9590982191474363 0.8725037883744391 0.831395177051104]

# Challenge Test

In [41]:
# Load Test data
dataset = pd.read_csv("../data/phase_2_dataset.csv")
dataset.shape 

(200000, 10)

In [42]:
dataset.iloc[1:5,[2,4,8,9]]

Unnamed: 0,city,desc,price,title
1,Tehran,تبلت GALAXY TAB A\r\r\r\n١٠ اینچ ١٦ گیگ فول اچ...,800000,تبلت سامسونگ
2,Karaj,بدلیل جابجایی فروش فوری,200000,میز تلویزیون و نمای دکوری
3,Mashhad,رو دسته های قیچی کارشوده و خیلی خوش دست و تیز ...,90000,قیچی جگوار نو و سالم
4,Tehran,111سالم بی رنگ فنی سالم تخفیف بیمه کامل بیمه ت...,16500000,111سالم نقد و اقساط


In [43]:
newDataset = dataset.copy()

In [44]:
newDataset['context'] = newDataset.title + ' ' + newDataset.desc + ' ' + newDataset.city + ' ' + newDataset.price.astype(str)
#newDataset['context'] = newDataset.title + ' ' + newDataset.desc + ' ' + newDataset.desc + ' ' + newDataset.city + ' ' + newDataset.price.astype(str)

In [45]:
newDataset.iloc[1:5,-1]

1    تبلت سامسونگ تبلت GALAXY TAB A\r\r\r\n١٠ اینچ ...
2    میز تلویزیون و  نمای دکوری بدلیل جابجایی فروش ...
3    قیچی جگوار  نو و سالم رو دسته های قیچی کارشوده...
4    111سالم نقد و اقساط 111سالم بی رنگ فنی سالم تخ...
Name: context, dtype: object

In [46]:
def sanitize(x):
    price = x.split()[-1]
    x = clean_str(x)
    x = text_cleaner(x)
    x = x + price
    x = re.split(r'([a-zA-Z]+)', x)
    x = " ".join(str(item) for item in x)
    return x

In [47]:
newDataset['contextProcessed'] = newDataset.context.apply(lambda row: sanitize(str(row)))

In [48]:
newDataset.iloc[1:5,-1]

1    تبلت سامسونگ تبلت  galaxy   tab   a    اینچ   ...
2    میز تلویزیون و نمای دکوری بدلیل جابجایی فروش ف...
3    قیچی جگوار نو و سالم رو دسته های قیچی کارشوده ...
4     سالم نقد و اقساط  سالم بی رنگ فنی سالم تخفیف ...
Name: contextProcessed, dtype: object

In [49]:
# export dataset to text files
outputPath = './dataChallenge/'

if not os.path.exists(outputPath):
    os.makedirs(outputPath)

for c in newDataset.columns:
    newDataset[c].to_csv(outputPath + c + '.txt', index=False)

In [50]:
fname = os.path.join(outputPath,"contextProcessed.txt")
content = pd.read_table(fname, header=None)
content = content[0].apply(str.strip)
word_index = tokenizer.word_index
print('Utilized %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(content)
content = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Utilized 256364 unique tokens.


In [51]:
content.shape

(200000, 100)

In [52]:
classesL2

[[11, 3],
 [18, 13, 1, -1, 10, 22],
 [12, 25, 6, 26],
 [4, 0, 20, 23, 5, 24, 15, 17],
 [2, 9, 8, 16, 14, -1],
 [7, 21, 19]]

In [53]:
from tqdm import tqdm_notebook as tqdm
# all test data
#X_test, y_test
x = content
y = np.zeros((x.shape[0],3), dtype=int)
print(x.shape,y.shape)
results = -np.ones_like(y)

# Level 1 Test
# load model
modelFilename = './models/modelL1'+'.h5'
model = load_model(modelFilename)
# test model
yPred = model.predict(x, verbose=1, batch_size=256)
predL1Class = yPred.argmax(axis=1)
results[:,0] = predL1Class
print('Level 1 is done')

# Level 2 Test
predL1Class = results[:,0]
for cls in np.unique(predL1Class):
    indexes = predL1Class==cls
    print('Selected Indices for class ',cls,': ',len(indexes[indexes]),'/',len(indexes))
    # load related model 
    modelFilename = './models/modelL2_'+ str(cls)+ '.h5'
    if modelFilename in modelExists:
        model = load_model(modelFilename)  
        yPred = model.predict(x[indexes,:], verbose=1, batch_size=256)
        predClasses = yPred.argmax(axis=1)
        for idx, value in enumerate(classesL2[cls]):
            predClasses[predClasses==idx] = value
        results[indexes,1] = predClasses
        del model
        K.clear_session()
        gc.collect()
    else:
        if len(classesL2[cls]) < 2:
            if classesL2[cls][0]==-1 or classesL2[cls][0] is None:
                results[indexes,1] = -1
            else:
                results[indexes,1] = classesL2[cls][0]
print('Level 2 is done')  

# Level 3 Test
predL2Class = results[:,1]
for cls in np.unique(predL2Class):
    indexes = predL2Class==cls
    print('Selected Indices for class ',cls,': ',len(indexes[indexes]),'/',len(indexes))
    # load related model 
    modelFilename = './models/modelL3_'+ str(cls)+ '.h5'
    if modelFilename in modelExists:
        model = load_model(modelFilename)  
        yPred = model.predict(x[indexes,:], verbose=1, batch_size=256)
        predClasses = yPred.argmax(axis=1)
        for idx, value in enumerate(classesL3[cls]):
            predClasses[predClasses==idx] = value
        results[indexes,2] = predClasses
        del model
        K.clear_session()
        gc.collect()
    else:
        if cls == -1:
            results[indexes,2] = -1
            continue
        if len(classesL3[cls]) < 2:
            if classesL3[cls][0]==-1 or classesL3[cls][0] is None:
                results[indexes,2] = -1
            else:
                results[indexes,2] = classesL3[cls][0]
print('Level 3 is done')

(200000, 100) (200000, 3)
Level 1 is done
Selected Indices for class  0 :  9257 / 200000
Selected Indices for class  1 :  31433 / 200000
Selected Indices for class  2 :  49766 / 200000
Selected Indices for class  3 :  17061 / 200000
Selected Indices for class  4 :  26370 / 200000
Selected Indices for class  5 :  66113 / 200000
Level 2 is done
Selected Indices for class  -1 :  259 / 200000
Selected Indices for class  0 :  13168 / 200000
Selected Indices for class  1 :  9953 / 200000
Selected Indices for class  3 :  1448 / 200000
Selected Indices for class  6 :  3859 / 200000
Selected Indices for class  7 :  50614 / 200000
Selected Indices for class  8 :  6086 / 200000
Selected Indices for class  9 :  15712 / 200000
Selected Indices for class  10 :  5263 / 200000
Selected Indices for class  11 :  7809 / 200000
Selected Indices for class  12 :  33959 / 200000
Selected Indices for class  14 :  1756 / 200000
Selected Indices for class  15 :  434 / 200000
Selected Indices for class  16 :  27

In [54]:
results[1:10]

array([[ 1, 18, 56],
       [ 2, 12, 60],
       [ 2, 26, 29],
       [ 5,  7, 33],
       [ 3,  0,  6],
       [ 4, 14, -1],
       [ 1,  1, 61],
       [ 2, 12,  1],
       [ 5,  7, 33]])

In [55]:
# load categories
with open("categoriesDivar.json", "r") as read_file:
    catDict = json.load(read_file)

In [56]:
catDict.keys()

dict_keys(['cat1', 'cat2', 'cat3'])

In [57]:
# Export to the file
resultsDF = pd.DataFrame(results)
print(resultsDF.iloc[1:5,:])
resultsDF['cat1'] = resultsDF[0] 
resultsDF['cat2'] = resultsDF[1] 
resultsDF['cat3'] = resultsDF[2] 

   0   1   2
1  1  18  56
2  2  12  60
3  2  26  29
4  5   7  33


In [58]:
resultsDF.iloc[1:5,:]

Unnamed: 0,0,1,2,cat1,cat2,cat3
1,1,18,56,1,18,56
2,2,12,60,2,12,60
3,2,26,29,2,26,29
4,5,7,33,5,7,33


In [59]:
for idx,value in enumerate(catDict['cat1']):
    resultsDF.loc[resultsDF[0]==idx, 'cat1'] = catDict['cat1'][idx]
for idx,value in enumerate(catDict['cat2']):
    resultsDF.loc[resultsDF[1]==idx, 'cat2'] = catDict['cat2'][idx]
for idx,value in enumerate(catDict['cat3']):
    resultsDF.loc[resultsDF[2]==idx, 'cat3'] = catDict['cat3'][idx]
resultsDF.loc[resultsDF.cat1==-1,'cat1'] = ''
resultsDF.loc[resultsDF.cat2==-1,'cat2'] = ''
resultsDF.loc[resultsDF.cat3==-1,'cat3'] = ''

In [60]:
resultsDF.iloc[1:10,:]

Unnamed: 0,0,1,2,cat1,cat2,cat3
1,1,18,56,electronic-devices,mobile-tablet,tablet
2,2,12,60,for-the-home,furniture-and-home-decore,tv-and-stereo-furniture
3,2,26,29,for-the-home,utility,instrument-cleaning-tailoring
4,5,7,33,vehicles,cars,light
5,3,0,6,leisure-hobbies,animals,birds
6,4,14,-1,personal,health-beauty,
7,1,1,61,electronic-devices,audio-video,tv-projector
8,2,12,1,for-the-home,furniture-and-home-decore,antiques-and-art
9,5,7,33,vehicles,cars,light


In [61]:
resultsDF['title'] = dataset['title']
resultsDF['desc'] = dataset['desc']

In [64]:
resultsDF.iloc[200:300,[3,4,5,6,7]]

Unnamed: 0,cat1,cat2,cat3,title,desc
200,vehicles,motorcycles,,ویو مدل 94 انژکتور,هندا ویو انژکتور\r\r\r\nتمیزومرتب سفارشی\r\r\r...
201,vehicles,cars,light,سمندef7دوگانه مدل90در نی ریز,***این خودرو در نی ریز هست***\r\r\r\nخوش قیمت\...
202,vehicles,parts-accessories,,لوازم یدکى پیکان,تعدادى لوازم یدکى پیکان \r\r\r\r\nساخت انگلیس ...
203,vehicles,cars,light,پرایددوگانه,پرایدسفید مدل 84 بیمه تا اخرسال دوگانه دستی ول...
204,vehicles,cars,light,تیپ 2 - سفارشی کارخانه,تیپ 2 - داخل چرم و سینه قهوه ای - بدون رنگ - خ...
205,vehicles,cars,light,پراید81 مدادی دوگانه,رنگ مدادی بدونه ضربه . بیمه 18 همین برج تمام ش...
206,electronic-devices,mobile-tablet,mobile-phones,اپل ۶. گری. ۶۴ گیگ,گوشی در حد آک هست.گوشی سالم به شرط میدم که هرج...
207,for-the-home,furniture-and-home-decore,tv-and-stereo-furniture,میزال سی دی,تولیدوپخش وپذیرش نمایندگی
208,for-the-home,building-and-garden,stove-and-heating,بخاری گازی,هخامنشی بزرگ بسیار سالم
209,electronic-devices,audio-video,camera-camcoders,Ps3اسلیم 250 گیگ,"سلام ps3اسلیم 250 گیگ ابدیت 4/80,با حدود 20 با..."


In [63]:
# export to file
resultsDF.to_csv('./resultsChallenge2.csv',columns=['cat1','cat2','cat3'])