In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from keras.models import load_model
from keras.initializers import glorot_normal, Zeros, Ones
import keras.backend as K
from keras.optimizers import RMSprop
import tensorflow as tf

In [None]:
%%time
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import random

from sklearn import preprocessing
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from math import sqrt
from scipy import stats
from scipy.stats import norm, skew #for some statistics

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os
# Show the entries available under the input directory.
os.listdir('../input/')

In [None]:
# Read the training data, the test data and the sample submission CSVs into DataFrames.
train = pd.read_csv('../input/train_3pirksi/train.csv')
test = pd.read_csv('../input/test_xev14ad/test.csv')
subm = pd.read_csv('../input/sample_submission_usrypcc/sample_submission.csv')

In [None]:
# Forward-fill missing cells: each NaN takes the closest preceding non-missing
# value in the same column. `DataFrame.ffill()` is the direct equivalent of
# `fillna(method="ffill")`, whose `method` argument is deprecated in pandas >= 2.1.
train = train.ffill()
test = test.ffill()

In [None]:
# Preview the first five rows of the test data.
test.head(5)

In [None]:
# Distinct values of the training "Word" column (order is arbitrary: it comes from a set).
words = list(set(train["Word"].values))
# Number of distinct words; the trailing bare name displays it in the notebook.
n_words = len(words); n_words

In [None]:
# Distinct values of the training "tag" column (order is arbitrary: it comes from a set).
tags = list(set(train["tag"].values))

In [None]:
# Number of distinct tags; the trailing bare name displays it in the notebook.
n_tags = len(tags); n_tags

In [None]:
class SentenceGetter(object):
    """Step through a tagged DataFrame one sentence at a time.

    A sentence is the set of rows sharing one ``Sent_ID`` value. Ids are
    visited in increasing integer order starting at 1.

    Attributes:
        n_sent: ``Sent_ID`` that the next ``get_next`` call will return.
        data: DataFrame with ``Sent_ID``, ``Word`` and ``tag`` columns.
        empty: True once ``get_next`` has been asked for an id beyond the
            largest ``Sent_ID`` in ``data``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

    def get_next(self):
        """Return ``(words, tags)`` for the next sentence.

        Returns:
            Two parallel lists (words, tags) for sentence ``n_sent``, after
            which ``n_sent`` is advanced by one. An id inside the id range
            that has no rows yields two empty lists. Once ``n_sent`` exceeds
            the largest ``Sent_ID`` (or ``data`` has no rows), ``empty`` is
            set to True and ``(None, None)`` is returned.
        """
        sent_ids = self.data["Sent_ID"]
        # A boolean-mask selection never raises for a missing id (it just
        # yields zero rows), so exhaustion has to be detected explicitly.
        if len(sent_ids) == 0 or self.n_sent > sent_ids.max():
            self.empty = True
            return None, None
        s = self.data[sent_ids == self.n_sent]
        self.n_sent += 1
        return s["Word"].values.tolist(), s["tag"].values.tolist()

In [None]:
class TestSentenceGetter(object):
    """Step through an untagged DataFrame one sentence at a time.

    A sentence is the set of rows sharing one ``Sent_ID`` value. Ids are
    visited in increasing integer order starting at ``n_sent``.

    Attributes:
        n_sent: ``Sent_ID`` that the next ``get_next`` call will return.
        data: DataFrame with ``Sent_ID`` and ``Word`` columns.
        empty: True once ``get_next`` has been asked for an id beyond the
            largest ``Sent_ID`` in ``data``.
    """

    def __init__(self, data, n_sent=191283):
        """Store ``data`` and the first sentence id to serve.

        Args:
            data: DataFrame with ``Sent_ID`` and ``Word`` columns.
            n_sent: First ``Sent_ID`` to return. Defaults to 191283, the
                value that was previously hard-coded here.
        """
        self.n_sent = n_sent
        self.data = data
        self.empty = False

    def get_next(self):
        """Return the list of words of the next sentence.

        Returns:
            The words of sentence ``n_sent``, after which ``n_sent`` is
            advanced by one. An id inside the id range that has no rows
            yields an empty list. Once ``n_sent`` exceeds the largest
            ``Sent_ID`` (or ``data`` has no rows), ``empty`` is set to True
            and ``None`` is returned.
        """
        sent_ids = self.data["Sent_ID"]
        # A boolean-mask selection never raises for a missing id (it just
        # yields zero rows), so exhaustion has to be detected explicitly.
        if len(sent_ids) == 0 or self.n_sent > sent_ids.max():
            self.empty = True
            return None
        s = self.data[sent_ids == self.n_sent]
        self.n_sent += 1
        return s["Word"].values.tolist()

In [None]:
# Sentence-by-sentence reader over the training data.
getter = SentenceGetter(train)

In [None]:
# Sentence-by-sentence reader over the test data; the bare call below
# displays what the first `get_next()` returns.
testgetter = TestSentenceGetter(test)
testgetter.get_next()

In [None]:
# Pull the next training sentence as a pair (words, tags).
sent, tag = getter.get_next()

In [None]:
# Show the words and the tags of the sentence fetched above.
print(sent)
print(tag)

Memorization baseline: tag each word with the tag it most often had in the training data

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class MemoryTagger(BaseEstimator, TransformerMixin):
    '''
    Baseline tagger that memorizes, for every word seen in training, the tag
    it was most frequently paired with.

    Attributes set by fit:
        tags: distinct tags in order of first appearance.
        memory: dict mapping each training word to its most frequent tag.
    '''

    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.

        X and y are consumed pairwise; if their lengths differ, the extra
        items of the longer one are ignored (zip semantics).
        Returns self so calls can be chained, as scikit-learn estimators do.
        '''
        voc = {}
        # A dict doubles as an insertion-ordered set: first-seen order is kept
        # while membership checks stay constant-time.
        seen_tags = {}
        for x, t in zip(X, y):
            seen_tags.setdefault(t, None)
            per_word = voc.setdefault(x, {})
            per_word[t] = per_word.get(t, 0) + 1
        self.tags = list(seen_tags)
        # On a count tie, max() keeps the tag that was seen first for the word.
        self.memory = {word: max(counts, key=counts.get)
                       for word, counts in voc.items()}
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag from memory. If word is unknown, predict 'O'.

        X is an iterable of words; y is accepted and ignored.
        Returns a list with one tag per word of X.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Flat token-level lists, one entry per training row. NOTE: this rebinds
# `words` and `tags`, which earlier cells bound to the de-duplicated values.
words = train["Word"].values.tolist()
tags = train["tag"].values.tolist()

# Out-of-fold predictions of the memorization tagger using 5-fold cross-validation.
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)


In [None]:
# Text report of per-tag classification metrics for the cross-validated
# predictions against the true training tags.
report = classification_report(y_pred=pred, y_true=tags)
print(report)

In [None]:
# New, not-yet-fitted memorization tagger.
tagger = MemoryTagger()

In [None]:
# Fit on the `words` / `tags` lists currently bound in the notebook
# (the full training columns, if the cells were run in order).
tagger.fit(words, tags)

In [None]:
# print(tagger.predict(testgetter.get_next()))

In [None]:
# Flat list of the test "Word" column, one entry per test row.
testwords = test["Word"].values.tolist()

In [None]:
# Tag every test word from memory. NOTE: rebinds `pred`, which previously
# held the cross-validated training predictions.
pred = tagger.predict(testwords)

In [None]:
# Distinct tags among the predictions.
set(pred)

In [None]:
# Display the number of predictions, the distinct predicted tags and the
# shape of the test frame (all three are shown because
# ast_node_interactivity is set to "all").
len(pred)
set(pred)
test.shape

In [None]:
# Distinct values of the sample submission's "tag" column.
set(subm['tag'])

In [None]:
# Overwrite the submission's "tag" column with the predictions and write the
# frame to submission.csv without the index column.
# NOTE(review): assumes `subm` rows are in the same order as `test` rows — confirm.
subm['tag']=pred
subm.to_csv('submission.csv',index=False)

In [None]:
# Distinct tags now in the submission, followed by a preview of its first rows.
set(subm['tag'])
subm.head()

Machine learning approach: classifiers trained on per-word features

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def feature_map(word):
    '''Simple feature map.

    Turns one word into a numeric vector of six surface features, in order:
    is title-cased, is all lower-case, is all upper-case, length, is all
    digits, is all alphabetic.

    The input is converted with str() once, up front, so non-string values
    (e.g. numbers) are handled for every feature rather than only the first.

    Returns a 1-D numpy array of six integers (booleans become 0/1).
    '''
    word = str(word)
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(), word.isalpha()])

In [None]:
# One feature vector per training row. NOTE: rebinds `words`, which from here
# on holds feature arrays instead of word strings.
words = [feature_map(w) for w in train["Word"].values.tolist()]

In [None]:
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions of a 20-tree random forest on the per-word features
# using 3-fold cross-validation.
# NOTE(review): no random_state is passed, so results can differ between runs.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20),
                         X=words, y=tags, cv=3)

In [None]:
# Text report of per-tag classification metrics for the random-forest
# cross-validated predictions against the true training tags.
report = classification_report(y_pred=pred, y_true=tags)
print(report)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Turn a token DataFrame into one numeric feature vector per row.

    Each vector has nine entries: six surface features of the word (title
    case, lower case, upper case, length, all digits, all alphabetic)
    followed by the label-encoded memorized tag of the word itself, of the
    next word and of the previous word.
    """

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()

    def fit(self, X, y=None):
        """Fit the memory tagger and the tag encoder.

        Args:
            X: DataFrame with "Word" and "tag" columns. The tags are read
                from X itself.
            y: Accepted for pipeline compatibility and ignored.

        Returns:
            self
        """
        words = X["Word"].values.tolist()
        tags = X["tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        return self

    def transform(self, X, y=None):
        """Build the feature vectors for the rows of X.

        Args:
            X: DataFrame with a "Word" column.
            y: Ignored.

        Returns:
            A list with one 1-D numpy array of nine values per row of X.

        Raises:
            ValueError: from the label encoder when X is non-empty and 'O'
                was not among the tags seen in fit.
        """
        raw_words = X["Word"].values.tolist()
        n = len(raw_words)
        if n == 0:
            return []

        # Encode the fallback tag once instead of on every iteration.
        o_code = self.tag_encoder.transform(['O'])[0]

        # Predict and encode in batches: one encoder call per column instead
        # of three calls per token. The word's own tag is looked up by its
        # str() form and neighbours by their raw value, exactly as before.
        str_words = [str(w) for w in raw_words]
        own_codes = self.tag_encoder.transform(self.memory_tagger.predict(str_words))
        neighbour_codes = self.tag_encoder.transform(self.memory_tagger.predict(raw_words))

        out = []
        for i, w in enumerate(str_words):
            # Next word's tag; the last row has no successor and gets 'O'.
            wp = neighbour_codes[i + 1] if i < n - 1 else o_code
            # Previous word's tag; the first row, and any row whose previous
            # word is ".", gets 'O'.
            if i > 0 and raw_words[i - 1] != ".":
                wm = neighbour_codes[i - 1]
            else:
                wm = o_code
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                 own_codes[i], wp, wm]))
        return out

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Out-of-fold predictions (2-fold cross-validation) of a pipeline that feeds
# FeatureTransformer output into a LightGBM classifier. The whole `train`
# frame is passed as X because FeatureTransformer reads its "Word" and "tag"
# columns directly.
pred = cross_val_predict(Pipeline([("feature_map", FeatureTransformer()), 
                                   ("clf",lgb.LGBMClassifier())]),
                         X=train, y=tags, cv=2)

In [None]:
# NOTE(review): bare string literal with no visible purpose — looks like a
# leftover scratch cell; consider deleting it.
'aaa'

In [None]:
# Text report of per-tag classification metrics for the pipeline's
# cross-validated predictions against the true training tags.
report = classification_report(y_pred=pred, y_true=tags)
print(report)