# Load Libraries

In [1]:
# Data Mungling Block
import numpy as np
import pandas as pd
import os.path


# Keras Block
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


# Scikit Learn Block
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from scipy import sparse


# Data Visulation Block
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline 

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


# Read Dataset

In [2]:
X_train_cust  = pd.read_csv("Data/Koc_Yaz_Okulu_Data_Train_Cust.txt", sep =";")
X_train_agent = pd.read_csv("Data/Koc_Yaz_Okulu_Data_Train_Agent.txt", sep =";")

X_train_cust["LIST_TXT"]  = X_train_cust["CUST_TXT"].apply(lambda x: x.split(" "))
X_train_agent["LIST_TXT"] = X_train_agent["AGENT_TXT"].apply(lambda x: x.split(" "))

#label
y_train = pd.read_csv("Data/Koc_Yaz_Okulu_Data_Train_Target.txt", sep =";")

X_train = pd.merge(X_train_agent, X_train_cust, how="inner", on=["ID"], suffixes=["_agent", "_cust"])
df      = pd.merge(X_train, y_train, how = "inner", on=["ID"])

# Columns start with "LABEL_"
label_cols = []
for col in df.columns:
    if("LABEL" in col):
        label_cols.append(col)

# Test Data For Submission

In [3]:
test_agent = pd.read_csv("Data/Koc_Yaz_Okulu_Data_Test_Agent.txt", sep =";")
test_cust  = pd.read_csv("Data/Koc_Yaz_Okulu_Data_Test_Cust.txt",  sep =";")

test_cust["LIST_TXT"]  = test_cust["CUST_TXT"].apply(lambda x: x.split(" "))
test_agent["LIST_TXT"] = test_agent["AGENT_TXT"].apply(lambda x: x.split(" "))

test = pd.merge(test_agent, test_cust, how="inner", on=["ID"], suffixes=["_agent", "_cust"])
print(test_agent.shape, test_cust.shape, test.shape)

(12721, 3) (12721, 3) (12721, 5)


# Split Data and create TF-IDF

In [4]:
# Split Train Test
train, valid = train_test_split(df, random_state=21, test_size=0.30, shuffle=True)


# TFIDF
'''

vectorizer = TfidfVectorizer(strip_accents='unicode',
                             analyzer='word',
                             ngram_range=(1,3),
                             norm='l2',
                             lowercase=False,
                             max_df = 0.5)

'''

vectorizer = TfidfVectorizer(ngram_range=(1,2), lowercase=False,
               min_df=3, max_df=0.5, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )


train_text      = train["AGENT_TXT"] + " " + train["CUST_TXT"]
valid_text      = valid["AGENT_TXT"] + " " + valid["CUST_TXT"]
test_cust_text  = test["CUST_TXT"]
test_agent_text = test["AGENT_TXT"]
drop_cols       = ["ID", "AGENT_TXT", "LIST_TXT_agent", "CUST_TXT", "LIST_TXT_cust"]

vectorizer.fit(train_text)

train_term_doc      = vectorizer.transform(train_text)
valid_term_doc      = vectorizer.transform(valid_text)
test_term_agent_doc = vectorizer.transform(test_agent_text)
test_term_cust_doc  = vectorizer.transform(test_cust_text)

y_train = train.drop(labels = drop_cols, axis=1)
y_valid = valid.drop(labels = drop_cols, axis=1)

# Train SVM with TF-IDF Features

In [5]:
# Using pipeline for applying SVM and one vs rest classifier
LogSVM_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LinearSVC(max_iter=3000), n_jobs=-1)),
            ])


data_trains = [train_term_doc]
data_valids = [valid_term_doc]
models      = [LogSVM_pipeline]

submission_cust  = pd.DataFrame()
submission_agent = pd.DataFrame()

for category in label_cols:
    print('**Processing {} ...**'.format(category))
    for data_train, data_valid in zip(data_trains, data_valids):
        for model in models:
            model.fit(data_train, train[category])
            
            # Train Predict
            prediction = model.predict(data_train)
            accuracy   = accuracy_score(train[category], prediction)
            
            print(category)
            print('Train accuracy is {}'.format(accuracy))
            
            # Valid Predict
            prediction = model.predict(data_valid)
            accuracy   = accuracy_score(valid[category], prediction)
            
            print(category)
            print('Valid accuracy is {}'.format(accuracy))
            
            # Test Predict Cust
            prediction                = model.predict(test_term_cust_doc)
            submission_cust[category] = prediction    
            # Test Predict Agent
            prediction                 = model.predict(test_term_agent_doc)
            submission_agent[category] = prediction    
    print("\n")
    
submission_agent.to_csv("Results/agent_tfidf_svm.csv", index=False)
submission_cust.to_csv("Results/cust_tfidf_svm.csv", index=False)

**Processing LABEL_1 ...**
LABEL_1
Train accuracy is 0.9969285714285714
LABEL_1
Valid accuracy is 0.9294166666666667


**Processing LABEL_2 ...**
LABEL_2
Train accuracy is 0.9907142857142858
LABEL_2
Valid accuracy is 0.8525


**Processing LABEL_3 ...**
LABEL_3
Train accuracy is 0.9952142857142857
LABEL_3
Valid accuracy is 0.9229166666666667


**Processing LABEL_4 ...**
LABEL_4
Train accuracy is 0.9965
LABEL_4
Valid accuracy is 0.9444166666666667


**Processing LABEL_5 ...**
LABEL_5
Train accuracy is 0.9977142857142857
LABEL_5
Valid accuracy is 0.9605


**Processing LABEL_6 ...**
LABEL_6
Train accuracy is 0.99975
LABEL_6
Valid accuracy is 0.97025


**Processing LABEL_7 ...**
LABEL_7
Train accuracy is 0.99575
LABEL_7
Valid accuracy is 0.9425


**Processing LABEL_8 ...**
LABEL_8
Train accuracy is 0.9971428571428571
LABEL_8
Valid accuracy is 0.9511666666666667


**Processing LABEL_9 ...**
LABEL_9
Train accuracy is 0.9972142857142857
LABEL_9
Valid accuracy is 0.9451666666666667


**Processi

In [6]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self
    


preds_agent = np.zeros((len(test_agent), len(label_cols)))
preds_cust  = np.zeros((len(test_cust),  len(label_cols)))

for i, category in enumerate(label_cols):
    print('fit', category)
    
    modelNbSvm = NbSvmClassifier(C=1, dual=True, n_jobs=-1)
    modelNbSvm.fit(train_term_doc, train[category])
    
     # Train Predict
    prediction = modelNbSvm.predict(train_term_doc)
    accuracy   = accuracy_score(train[category], prediction)
    print('Train accuracy is {}'.format(accuracy))

    # Valid Predict
    prediction = modelNbSvm.predict(valid_term_doc)
    accuracy   = accuracy_score(valid[category], prediction)
    print('Valid accuracy is {}'.format(accuracy))

    # Test Predict Agent
    prediction = modelNbSvm.predict(test_term_agent_doc)
    preds_agent[:,i] = prediction
     # Test Predict Cust
    prediction = modelNbSvm.predict(test_term_cust_doc)
    preds_cust[:,i] = prediction


preds_agent = pd.DataFrame(preds_agent)
preds_agent = preds_agent.add_prefix("LABEL_")
preds_agent.to_csv("Results/agent_tfidf_nbsvm.csv", index=False)

preds_cust = pd.DataFrame(preds_cust)
preds_cust = preds_cust.add_prefix("LABEL_")
preds_cust.to_csv("Results/cust_tfidf_nbsvm.csv", index=False)

fit LABEL_1





'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9437857142857143
Valid accuracy is 0.9280833333333334
fit LABEL_2



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.8764285714285714
Valid accuracy is 0.8640833333333333
fit LABEL_3



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9334285714285714
Valid accuracy is 0.9208333333333333
fit LABEL_4



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9499285714285715
Valid accuracy is 0.9431666666666667
fit LABEL_5



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9690714285714286
Valid accuracy is 0.95875
fit LABEL_6



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9788928571428571
Valid accuracy is 0.9685
fit LABEL_7



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.95125
Valid accuracy is 0.94325
fit LABEL_8



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9635357142857143
Valid accuracy is 0.9485833333333333
fit LABEL_9



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9591785714285714
Valid accuracy is 0.9445833333333333
fit LABEL_10



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9032857142857142
Valid accuracy is 0.8879166666666667
fit LABEL_11



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9878214285714285
Valid accuracy is 0.9765833333333334
fit LABEL_12



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9685357142857143
Valid accuracy is 0.9638333333333333
fit LABEL_13



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9481785714285714
Valid accuracy is 0.9349166666666666
fit LABEL_14



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9508928571428571
Valid accuracy is 0.9424166666666667
fit LABEL_15



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9875
Valid accuracy is 0.9851666666666666
fit LABEL_16



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9917857142857143
Valid accuracy is 0.9841666666666666
fit LABEL_17



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9253571428571429
Valid accuracy is 0.917
fit LABEL_18



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9439285714285715
Valid accuracy is 0.93125
fit LABEL_19



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9843214285714286
Valid accuracy is 0.9698333333333333
fit LABEL_20



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9723928571428572
Valid accuracy is 0.9585
fit LABEL_21



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9906785714285714
Valid accuracy is 0.9799166666666667
fit LABEL_22



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9690357142857143
Valid accuracy is 0.95475
fit LABEL_23



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9851428571428571
Valid accuracy is 0.9784166666666667
fit LABEL_24



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9956428571428572
Valid accuracy is 0.9906666666666667
fit LABEL_25



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9761428571428571
Valid accuracy is 0.9724166666666667
fit LABEL_26



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9810714285714286
Valid accuracy is 0.9723333333333334
fit LABEL_27



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9955714285714286
Valid accuracy is 0.9884166666666667
fit LABEL_28



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9899642857142857
Valid accuracy is 0.9831666666666666
fit LABEL_29



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9950714285714286
Valid accuracy is 0.9941666666666666
fit LABEL_30



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9817857142857143
Valid accuracy is 0.9709166666666667
fit LABEL_31



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9914642857142857
Valid accuracy is 0.9863333333333333
fit LABEL_32



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9824285714285714
Valid accuracy is 0.9731666666666666
fit LABEL_33



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9966785714285714
Valid accuracy is 0.99175
fit LABEL_34



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9939285714285714
Valid accuracy is 0.9871666666666666
fit LABEL_35



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9898214285714285
Valid accuracy is 0.98075
fit LABEL_36



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9836785714285714
Valid accuracy is 0.9780833333333333
fit LABEL_37



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9970357142857142
Valid accuracy is 0.9948333333333333
fit LABEL_38



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.97125
Valid accuracy is 0.9699166666666666
fit LABEL_39



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9932857142857143
Valid accuracy is 0.9858333333333333
fit LABEL_40



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9942142857142857
Valid accuracy is 0.9915
fit LABEL_41



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9946071428571429
Valid accuracy is 0.989
fit LABEL_42



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9978214285714285
Valid accuracy is 0.9946666666666667
fit LABEL_43



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9988571428571429
Valid accuracy is 0.9974166666666666
fit LABEL_44



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9983571428571428
Valid accuracy is 0.99575
fit LABEL_45



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9941785714285715
Valid accuracy is 0.98775
fit LABEL_46



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9992857142857143
Valid accuracy is 0.9960833333333333
fit LABEL_47



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9993214285714286
Valid accuracy is 0.99675
fit LABEL_48



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 1.0
Valid accuracy is 0.9983333333333333
fit LABEL_49



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9995357142857143
Valid accuracy is 0.9969166666666667
fit LABEL_50



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9983928571428572
Valid accuracy is 0.9921666666666666
fit LABEL_51



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 1.0
Valid accuracy is 0.9988333333333334
fit LABEL_52



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9993928571428572
Valid accuracy is 0.999
fit LABEL_53



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9998571428571429
Valid accuracy is 0.9975833333333334
fit LABEL_54



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9997142857142857
Valid accuracy is 0.9971666666666666
fit LABEL_55



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9999642857142857
Valid accuracy is 0.9975833333333334
fit LABEL_56



'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.



Train accuracy is 0.9997142857142857
Valid accuracy is 0.9991666666666666


# Word2Vec and GloVe

In [7]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss))
        self.epoch += 1

word2vec_path = "Models/word2vec.model"

if(os.path.exists(word2vec_path)):
    word2vecModel = Word2Vec.load(word2vec_path)
else:
    word2vecModel = Word2Vec(train["LIST_TXT_agent"] + train["LIST_TXT_cust"],
                     size=300,
                     window=10,
                     min_count=5,
                     workers=12,
                     negative=5,
                     iter=20, #epochs
                     sg=1, # skipgram=1   cbow=0
                     compute_loss=True,
                     callbacks=[callback()]) 
    word2vecModel.save(word2vec_path)

In [8]:
# write txt for train GloVe
with open("GloVe-master/glove_data.txt", 'w') as file_handler:
    for item in train_text:
        file_handler.write("{}\n".format(item))   

# Glove Dizinine Gidilip Aşağıdaki Komutların Çalıştırılması Gerek

In [9]:
# cd GloVe-master/
# ./demo.sh
# cd ..

In [10]:
# convert glove to gensim format
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = "GloVe-master/vectors.txt"
tmp_file   = "GloVe-master/glove_word2vec.txt"

_ = glove2word2vec(glove_file, tmp_file)

gloveModel = KeyedVectors.load_word2vec_format(tmp_file)

# Deep Learning Part
# Create Embedding Matrix and Train Models

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Reshape, Concatenate, Flatten
from keras.layers import GRU, Conv1D
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D

from keras.models import Model
from keras.optimizers import Adam

#  ----> skipgram uncomment
embeddings_index = dict(zip(word2vecModel.wv.index2word, word2vecModel.wv.syn0))
# -----> GloVe uncomment
#embeddings_index = dict(zip(gloveModel.wv.index2word, gloveModel.wv.syn0))



embed_size = 300    # how big is each word vector
max_features = len(embeddings_index) # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100        # max number of words in a comment to use


list_sentences_train      = train_text
list_sentences_valid      = valid_text
list_sentences_test_agent = test_agent_text
list_sentences_test_cust  = test_cust_text


tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train      = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_valid      = tokenizer.texts_to_sequences(list_sentences_valid)
list_tokenized_test_agent = tokenizer.texts_to_sequences(list_sentences_test_agent)
list_tokenized_test_cust  = tokenizer.texts_to_sequences(list_sentences_test_cust)


X_train      = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_valid      = pad_sequences(list_tokenized_valid, maxlen=maxlen)
X_test_agent = pad_sequences(list_tokenized_test_agent,  maxlen=maxlen)
X_test_cust  = pad_sequences(list_tokenized_test_cust,   maxlen=maxlen)



all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()


# Embedding Matrix
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
        
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
y_train = train[label_cols].values


Call to deprecated `syn0` (Attribute will be removed in 4.0.0, use self.vectors instead).


arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.



# Define Deep Learning Models

In [12]:
def biLSTM():
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(56, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


filter_sizes = [1,2,3,5]
num_filters = 32

def CNN():    
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Reshape((maxlen, embed_size, 1))(x)
    
    conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    
    maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
    maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
    maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)
    maxpool_3 = MaxPool2D(pool_size=(maxlen - filter_sizes[3] + 1, 1))(conv_3)
        
    z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])   
    z = Flatten()(z)
    z = Dropout(0.1)(z)
        
    outp = Dense(56, activation="sigmoid")(z)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

def biGRU_CNN():

    sequence_input = Input(shape=(maxlen, ))
    
    x = Embedding(max_features,
                  embed_size,
                  weights=[embedding_matrix],
                  trainable = False)(sequence_input)
    
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    
    x = concatenate([avg_pool, max_pool]) 
    
    preds = Dense(56, activation="sigmoid")(x)
    
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=1e-3),
                  metrics=['accuracy'])
    
    return model

In [13]:
models = [biLSTM(), CNN(), biGRU_CNN()]
names  = ["biLSTM", "CNN", "biGRU_CNN"]

for name, model in zip(names, models):
    model.fit(X_train,
              y_train,
              batch_size=256,
              epochs=8,
              validation_split=0.2)
  
    predictions = model.predict([X_test_cust], batch_size=1024, verbose=1)
    
    predictions = pd.DataFrame(predictions)
    predictions = predictions.add_prefix("LABEL_")
    predictions.to_csv("Results/cust_" + name + ".csv", index=False)
    
    
    predictions = model.predict([X_test_agent], batch_size=1024, verbose=1)
    
    predictions = pd.DataFrame(predictions)
    predictions = predictions.add_prefix("LABEL_")
    predictions.to_csv("Results/agent_" + name + ".csv", index=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 22400 samples, validate on 5600 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Train on 22400 samples, validate on 5600 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Train on 22400 samples, validate on 5600 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [14]:
predictions = model.predict([X_train], batch_size=1024, verbose=1)
for category in range(predictions.shape[1]):

    prediction = predictions[:, category]
    prediction[prediction > 0.5] = 1
    prediction[prediction < 0.5] = 0
    prediction = prediction.astype(int)
    # Train acc
    accuracy   = accuracy_score(y_train[:,category], prediction)
    print('LABEL_{} Train accuracy is {}'.format(category, accuracy))
    
predictions = model.predict([X_valid], batch_size=1024, verbose=1)
for category in range(predictions.shape[1]):

    prediction = predictions[:, category]
    prediction[prediction > 0.5] = 1
    prediction[prediction < 0.5] = 0
    prediction = prediction.astype(int)
    # Val acc
    accuracy   = accuracy_score(y_valid.iloc[:, category], prediction)
    print('LABEL_{} Valid accuracy is {}'.format(category, accuracy))

LABEL_0 Train accuracy is 0.9132142857142858
LABEL_1 Train accuracy is 0.8383571428571429
LABEL_2 Train accuracy is 0.9030714285714285
LABEL_3 Train accuracy is 0.8939285714285714
LABEL_4 Train accuracy is 0.9429642857142857
LABEL_5 Train accuracy is 0.9519642857142857
LABEL_6 Train accuracy is 0.9306428571428571
LABEL_7 Train accuracy is 0.9430357142857143
LABEL_8 Train accuracy is 0.9233928571428571
LABEL_9 Train accuracy is 0.8798571428571429
LABEL_10 Train accuracy is 0.9652142857142857
LABEL_11 Train accuracy is 0.9470714285714286
LABEL_12 Train accuracy is 0.9258214285714286
LABEL_13 Train accuracy is 0.9276785714285715
LABEL_14 Train accuracy is 0.9693571428571428
LABEL_15 Train accuracy is 0.9786785714285714
LABEL_16 Train accuracy is 0.9049642857142857
LABEL_17 Train accuracy is 0.9263214285714285
LABEL_18 Train accuracy is 0.9632857142857143
LABEL_19 Train accuracy is 0.9471428571428572
LABEL_20 Train accuracy is 0.9730714285714286
LABEL_21 Train accuracy is 0.948571428571428