In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from keras.models import load_model
from keras.initializers import glorot_normal, Zeros, Ones
import keras.backend as K
from keras.optimizers import RMSprop
import tensorflow as tf

In [3]:
%%time
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import random

from sklearn import preprocessing
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from math import sqrt
from scipy import stats
from scipy.stats import norm, skew #for some statistics

Wall time: 453 ms


In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [1]:
import anago

Using TensorFlow backend.


In [5]:
import os
os.listdir()

['.ipynb_checkpoints',
 'Elmo with tensorflow hub.ipynb',
 'innoplexus_NN.ipynb',
 'kernel56398426dd.ipynb',
 'memorization_kernel.ipynb',
 'sample_submission.csv',
 'sample_submission_usrypCc.zip',
 'submission.csv',
 'submission.zip',
 'submission_lgb.csv',
 'submission_lgb.zip',
 'submission_lgb1.csv',
 'submission_lgb1.zip',
 'test.csv',
 'test_XEV14AD.zip',
 'train.csv',
 'train_3PIRKSI.zip',
 'Untitled3.ipynb']

In [6]:
# Load the labelled training tokens, the test tokens and the sample
# submission template from CSV files in the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
subm = pd.read_csv('sample_submission.csv')

In [7]:
# Forward-fill missing values: every NaN cell takes the value of the closest
# preceding non-missing row in the same column. DataFrame.ffill() is used
# instead of fillna(method="ffill") because the `method` argument of fillna is
# deprecated in recent pandas releases; the result is the same.
# NOTE(review): for the "Word" column this copies the previous token into the
# gap — confirm that is intended (e.g. that literal tokens such as "NA" were
# not parsed as missing values by read_csv).
train = train.ffill()
test = test.ffill()

In [8]:
# Lower-case every token in both frames; the "Word" column is overwritten.
# NOTE(review): once tokens are lower-cased, the case-based features computed
# from this column further down (str.isupper / str.istitle) can presumably no
# longer fire for ordinary text — confirm this is intended.
train['Word'] = train['Word'].apply(lambda x:x.lower())
test['Word'] = test['Word'].apply(lambda x:x.lower())

In [9]:
test.head(5)

Unnamed: 0,id,Doc_ID,Sent_ID,Word
0,4543834,30001,191283,cccva
1,4543835,30001,191283,","
2,4543836,30001,191283,manova
3,4543837,30001,191283,","
4,4543838,30001,191283,my


In [10]:
words = list(set(train["Word"].values))
n_words = len(words); n_words

158163

In [11]:
tags = list(set(train["tag"].values))

In [12]:
n_tags = len(tags); n_tags

3

In [13]:
class SentenceGetter(object):
    """Step through a token-level frame one sentence at a time.

    ``data`` is indexed by the "Sent_ID", "Word" and "tag" columns. Sentences
    are visited in increasing ``Sent_ID`` order, starting at 1.
    """

    def __init__(self, data):
        self.n_sent = 1       # Sent_ID returned by the next get_next() call
        self.data = data
        self.empty = False    # becomes True once the data is exhausted

    def get_next(self):
        """Return ``(words, tags)`` of the next sentence.

        Returns ``(None, None)`` and sets ``self.empty`` once ``n_sent`` has
        moved past the largest ``Sent_ID`` in the data (or when the data
        cannot be read). A ``Sent_ID`` that is skipped inside the range
        yields two empty lists.
        """
        try:
            sent_ids = self.data["Sent_ID"]
            # A boolean mask that matches nothing produces an empty frame
            # instead of raising, so exhaustion must be detected explicitly.
            if len(sent_ids) == 0 or self.n_sent > sent_ids.max():
                self.empty = True
                return None, None
            s = self.data[sent_ids == self.n_sent]
            self.n_sent += 1
            return s["Word"].values.tolist(), s["tag"].values.tolist()
        except Exception:
            # Keep the best-effort contract (unreadable data counts as
            # exhausted) without swallowing KeyboardInterrupt / SystemExit.
            self.empty = True
            return None, None

In [14]:
class TestSentenceGetter(object):
    """Step through the unlabelled test frame one sentence at a time.

    ``data`` is indexed by the "Sent_ID" and "Word" columns. Iteration starts
    at ``Sent_ID`` 191283 and proceeds in increasing order.
    """

    def __init__(self, data):
        self.n_sent = 191283  # Sent_ID returned by the next get_next() call
        self.data = data
        self.empty = False    # becomes True once the data is exhausted

    def get_next(self):
        """Return the list of words of the next sentence.

        Returns ``None`` and sets ``self.empty`` once ``n_sent`` has moved
        past the largest ``Sent_ID`` in the data (or when the data cannot be
        read). A ``Sent_ID`` that is skipped inside the range yields an empty
        list.
        """
        try:
            sent_ids = self.data["Sent_ID"]
            # A boolean mask that matches nothing produces an empty frame
            # instead of raising, so exhaustion must be detected explicitly.
            if len(sent_ids) == 0 or self.n_sent > sent_ids.max():
                self.empty = True
                return None
            s = self.data[sent_ids == self.n_sent]
            self.n_sent += 1
            return s["Word"].values.tolist()
        except Exception:
            # Keep the best-effort contract (unreadable data counts as
            # exhausted) without swallowing KeyboardInterrupt / SystemExit.
            self.empty = True
            return None

In [15]:
getter = SentenceGetter(train)

In [18]:
getter.sentences

AttributeError: 'SentenceGetter' object has no attribute 'sentences'

In [16]:
testgetter = TestSentenceGetter(test)
testgetter.get_next()

['cccva', ',', 'manova', ',', 'my', 'black', 'hen', '.']

In [17]:
sent, tag = getter.get_next()

In [16]:
print(sent)
print(tag)

['Obesity', 'in', 'Low-', 'and', 'Middle-Income', 'Countries', ':', 'Burden', ',', 'Drivers', ',', 'and', 'Emerging', 'Challenges', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Memorization

In [70]:
from sklearn.base import BaseEstimator, TransformerMixin


class MemoryTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that memorises the most frequent tag of every word."""

    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.

        Counts how often each word occurs with each tag and stores, per word,
        the tag seen most often in ``self.memory`` (ties go to the tag that
        was seen first). ``self.tags`` lists the distinct tags in order of
        first appearance.

        Returns self, following the scikit-learn estimator convention, so
        calls can be chained.
        '''
        voc = {}
        seen_tags = {}  # dict used as an insertion-ordered set of tags
        for x, t in zip(X, y):
            seen_tags.setdefault(t, None)
            tag_counts = voc.setdefault(x, {})
            tag_counts[t] = tag_counts.get(t, 0) + 1
        self.tags = list(seen_tags)
        self.memory = {word: max(counts, key=counts.get)
                       for word, counts in voc.items()}
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag of every word in X from memory. If a word is unknown,
        predict 'O'. ``y`` is ignored.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [18]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Flatten the training frame into parallel per-row token / tag lists.
# NOTE: `words` and `tags` were bound earlier to the de-duplicated vocabulary
# and tag set; from here on they hold one entry per training row.
words = train["Word"].values.tolist()
tags = train["tag"].values.tolist()

# Out-of-fold predictions of the memorisation baseline using 5 folds.
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)


In [19]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

               precision    recall  f1-score   support

B-indications       0.72      0.40      0.52     53003
I-indications       0.59      0.37      0.46     44624
            O       0.99      1.00      0.99   4446206

    micro avg       0.98      0.98      0.98   4543833
    macro avg       0.77      0.59      0.66   4543833
 weighted avg       0.98      0.98      0.98   4543833



In [20]:
tagger = MemoryTagger()

In [21]:
tagger.fit(words, tags)

In [22]:
# print(tagger.predict(testgetter.get_next()))

In [23]:
testwords = test["Word"].values.tolist()

In [24]:
pred = tagger.predict(testwords)

In [25]:
set(pred)

{'B-indications', 'I-indications', 'O'}

In [26]:
len(pred)
set(pred)
test.shape

2994463

{'B-indications', 'I-indications', 'O'}

(2994463, 4)

In [27]:
set(subm['tag'])

{'O'}

In [28]:
# Replace the tags of the sample submission with the current predictions and
# write the result without the index column.
# NOTE(review): `pred` is a plain list, so the assignment is positional and
# assumes `subm` rows are in the same order as the `test` rows — confirm.
subm['tag']=pred
subm.to_csv('submission.csv',index=False)

In [29]:
set(subm['tag'])
subm.head()

{'B-indications', 'I-indications', 'O'}

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O


Machine learning approach

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
def feature_map(word):
    """Encode a single token as a five-element numeric feature vector.

    The entries are, in order: is the token all lower-case, is it all
    upper-case, its length, does it consist only of digits, does it consist
    only of letters.
    """
    casing = (word.islower(), word.isupper())
    length = len(word)
    char_class = (word.isdigit(), word.isalpha())
    return np.array([*casing, length, *char_class])

In [32]:
words = [feature_map(str(w)) for w in train["Word"].values.tolist()]

In [57]:
words

[array([0, 0, 7, 0, 1]),
 array([1, 0, 2, 0, 1]),
 array([0, 0, 4, 0, 0]),
 array([1, 0, 3, 0, 1]),
 array([ 0,  0, 13,  0,  0]),
 array([0, 0, 9, 0, 1]),
 array([0, 0, 1, 0, 0]),
 array([0, 0, 6, 0, 1]),
 array([0, 0, 1, 0, 0]),
 array([0, 0, 7, 0, 1]),
 array([0, 0, 1, 0, 0]),
 array([1, 0, 3, 0, 1]),
 array([0, 0, 8, 0, 1]),
 array([ 0,  0, 10,  0,  1]),
 array([0, 0, 1, 0, 0]),
 array([0, 0, 2, 0, 1]),
 array([1, 0, 4, 0, 1]),
 array([1, 0, 8, 0, 1]),
 array([1, 0, 3, 0, 1]),
 array([ 1,  0, 11,  0,  1]),
 array([1, 0, 8, 0, 1]),
 array([1, 0, 2, 0, 1]),
 array([1, 0, 6, 0, 1]),
 array([1, 0, 6, 0, 1]),
 array([0, 0, 1, 0, 0]),
 array([1, 0, 3, 0, 1]),
 array([1, 0, 6, 0, 1]),
 array([0, 0, 1, 0, 0]),
 array([1, 0, 3, 0, 1]),
 array([1, 0, 7, 0, 1]),
 array([ 1,  0, 10,  0,  1]),
 array([1, 0, 3, 0, 1]),
 array([ 1,  0, 10,  0,  1]),
 array([1, 0, 7, 0, 1]),
 array([0, 0, 1, 0, 0]),
 array([1, 0, 2, 0, 1]),
 array([1, 0, 4, 0, 1]),
 array([1, 0, 2, 0, 1]),
 array([1, 0, 4, 0, 1]),


In [33]:
# Out-of-fold predictions (5 folds) of a 20-tree random forest trained on the
# features currently bound to `words`.
# NOTE(review): no random_state is passed, so results may differ between runs.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20),
                         X=words, y=tags, cv=5)

In [34]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

               precision    recall  f1-score   support

B-indications       0.00      0.00      0.00     53003
I-indications       0.00      0.00      0.00     44624
            O       0.98      1.00      0.99   4446206

    micro avg       0.98      0.98      0.98   4543833
    macro avg       0.33      0.33      0.33   4543833
 weighted avg       0.96      0.98      0.97   4543833



In [73]:
from sklearn.preprocessing import LabelEncoder

class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Turn a token frame into per-token feature vectors with tag context.

    Each token is described by six surface features (title-case, lower-case,
    upper-case, length, all digits, all letters) followed by three encoded
    tags from the memory tagger: for the token itself, for the next token and
    for the previous token.
    """

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()

    def fit(self, X, y=None):
        """Fit the memory tagger and the tag encoder on X["Word"] / X["tag"].

        ``y`` is accepted and ignored so that the transformer also works as a
        step of a scikit-learn Pipeline, which passes the target along when
        fitting its steps. Returns self.
        """
        words = X["Word"].values.tolist()
        tags = X["tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        return self

    def transform(self, X, y=None):
        """Return a list with one 9-element feature array per row of X.

        Only the "Word" column of X is read; ``y`` is ignored. The encoded
        'O' tag is used as the "next" tag of the last token, and as the
        "previous" tag of the first token and of any token that follows a ".".
        """
        words = X["Word"].values.tolist()
        if not words:
            return []

        outside = self.tag_encoder.transform(['O'])[0]
        texts = [str(w) for w in words]
        # Predict and encode in two batched calls instead of up to three
        # calls per token. The token's own tag is looked up by its string
        # form, the neighbouring tags by the raw cell value.
        own_codes = self.tag_encoder.transform(self.memory_tagger.predict(texts))
        ctx_codes = self.tag_encoder.transform(self.memory_tagger.predict(words))

        last = len(words) - 1
        out = []
        for i, w in enumerate(texts):
            wp = ctx_codes[i + 1] if i < last else outside
            # A preceding "." marks a sentence boundary: no left context.
            wm = ctx_codes[i - 1] if i > 0 and words[i - 1] != "." else outside
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                 own_codes[i], wp, wm]))
        return out

In [44]:
from sklearn.pipeline import Pipeline

In [46]:
# Out-of-fold predictions (3 folds) of a pipeline that builds the context
# features and then fits a 20-tree random forest. The whole `train` frame is
# passed as X because FeatureTransformer reads both its "Word" and "tag"
# columns.
pred = cross_val_predict(Pipeline([("feature_map", FeatureTransformer()), 
                                   ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3))]),
                         X=train, y=tags, cv=3)

KeyboardInterrupt: 

In [None]:
'ss'

In [62]:
# Fit a LightGBM classifier with default hyper-parameters on the features
# currently bound to `words`; `mod` holds the value returned by fit().
clf = lgb.LGBMClassifier()
mod = clf.fit(words, tags)

In [63]:
testwords = [feature_map(str(w)) for w in test["Word"].values.tolist()]

In [77]:
# testwords = [FeatureTransformer().transform(w) for w in test["Word"].values.tolist()]
aa = FeatureTransformer().fit(train)
testw = aa.transform(test)

In [64]:
# testwords

In [65]:
pred = mod.predict(testwords)

In [66]:
set(pred)

{'O'}

In [50]:
subm['tag']=pred
subm.to_csv('submission_lgb.csv',index=False)

In [19]:
### Tfidf with lightgbm

In [90]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn each entry of the "Word" column into a TF-IDF vector over unigrams and
# bigrams, with English stop words removed. The vocabulary and IDF weights are
# learned from the training tokens only and then applied to the test tokens.
# NOTE(review): every row is vectorised as its own document — check whether
# bigrams and stop-word removal are meaningful for single tokens.
vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1, 2))
training_features = vectorizer.fit_transform(train["Word"])    
test_features = vectorizer.transform(test["Word"])

In [91]:
from sklearn import preprocessing 
# Encode the string tags as integer class ids; `le` is kept so predictions can
# be mapped back to tag strings with inverse_transform.
# NOTE: this rebinds `aa`, which earlier held the fitted FeatureTransformer.
le = preprocessing.LabelEncoder()
aa = le.fit_transform(train['tag'])

In [92]:
# Fit a default LightGBM classifier on the TF-IDF features against the encoded
# tags, then predict the encoded tag of every test row.
model = lgb.LGBMClassifier()
model.fit(training_features, aa)
y_pred = model.predict(test_features)
# Show which encoded classes actually occur among the predictions.
print(set(y_pred))

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

{0, 1, 2}


In [93]:
# Map the predicted class ids back to their tag strings, store them in the
# submission frame and write it without the index column.
subm['tag']=le.inverse_transform(y_pred)
subm.to_csv('submission_lgb1.csv',index=False)

In [94]:
subm.head()

Unnamed: 0,id,Sent_ID,tag
0,4543834,191283,O
1,4543835,191283,O
2,4543836,191283,O
3,4543837,191283,O
4,4543838,191283,O
