In [1]:
import numpy as np
import pandas as pd
from gensim.models import word2vec
import nltk
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout
from keras.layers import LSTM, GRU
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.optimizers import RMSprop, Adagrad
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from keras import callbacks

from collections import Counter
import tqdm
from tqdm import tqdm_notebook
from functools import reduce
import re

Using TensorFlow backend.


In [2]:
df = pd.concat([
    pd.read_csv('train.csv',index_col='id'),
    pd.read_csv('test.csv',index_col='id')
    ],axis=0)
df['part'] = df['author'].isnull().apply(lambda x: 'test' if x else 'train')
print('Количество примеров:',len(df),', доля обучающей выборки:',"{0:.2f}%".format(100*np.mean(df['part']=='train')))
df.sample(5)

Количество примеров: 27971 , доля обучающей выборки: 70.00%


Unnamed: 0_level_0,author,text,part
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id06505,MWS,"This, O this may be the last time that you wil...",train
id26151,MWS,"I then moved forward, and a murmuring sound ar...",train
id15861,HPL,They believed all sorts of things and maybe th...,train
id03329,EAP,"I accordingly went over to the plantation, and...",train
id12429,,Having perceived at last the hollowness and fu...,test


In [7]:
def CleanData(df):
    c = Counter()

    for ind, row in tqdm_notebook(df.iterrows(), total = df.shape[0], desc = 'Build punctuation dict'):
        c += Counter(re.sub('\w+','',row.text))

    dct = dict(c)
    dct.pop(' ')

    for i in dct.keys():
        dct[i]=''

    for index, row in tqdm_notebook(df.iterrows(), total = df.shape[0], desc = 'Clean texts'):
        df.loc[index,'text'] = reduce(lambda x, y: x.replace(y, dct[y]), dct, row.text)

    return df

In [8]:
data = CleanData(df)







In [9]:
df = data

In [10]:
tkn = Tokenizer(lower=True)
tkn.fit_on_texts(df.text)
df['tokens'] = tkn.texts_to_sequences(df.text)
max_text_len = max(df.tokens.apply(len))
fix_text_len = 861
print('Максимальная длина текста:',max_text_len,' слов')
df['tokens'] = list(sequence.pad_sequences(df['tokens'].values, maxlen=fix_text_len))
df.sample(10)

Максимальная длина текста: 861  слов


Unnamed: 0_level_0,author,text,part,tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id17242,MWS,While there is life there is action and change,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id07258,EAP,I would be an editor I would be a poet I would...,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id07591,,Bribery threats and intrigue soon discovered t...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id06262,,Soon after my arrival my father spoke of my im...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id02077,,Tabitha Turnip indeed Oh the little wretch But...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id17702,EAP,Arose early and to my great joy at length behe...,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id15538,EAP,It seemed to have been constructed for no espe...,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id21030,,There was nothing now to prevent my getting do...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id05992,,She described in vivid terms the ceaseless car...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id06014,HPL,His right hand fell on one of the projecting f...,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
max_features = len(tkn.word_index)
print('Размер словаря:',max_features)

Размер словаря: 28727


In [12]:
test_x = df[df['part']=='test']['tokens'].apply(list).tolist()
train = df[df['part']=='train']

In [13]:
train_x,valid_x,train_y,valid_y = train_test_split(train['tokens'].apply(list).tolist(),pd.get_dummies(train['author']))

In [None]:
train['tokens'].apply(list).tolist()

In [14]:
train_y.columns

Index(['EAP', 'HPL', 'MWS'], dtype='object')

In [23]:
batch_size = 32

print('Build model...')
model = Sequential()
model.add(Embedding(max_features+1, 30 ,mask_zero=True))
#model.add(LSTM(60,return_sequences=True,kernel_initializer='he_normal'))
model.add(LSTM(60))#, dropout=0.3, recurrent_dropout=0.1,kernel_initializer='he_normal'))
model.add(Dropout(0.2))
model.add(Dense(30, activation='relu'))
model.add(Dense(3, activation='softmax'))

###model = Sequential()
###model.add(Embedding(max_features+1, 10 ,mask_zero=True))
#model.add(Bidirectional(LSTM(100, return_sequences=True),input_shape=(5, 10)))
#model.add(Dropout(0.8))
###model.add(GRU(30))
###model.add(Dense(20))
###model.add(Dropout(0.8))
###model.add(Dense(3, activation = 'softmax'))

# try using different optimizers and different optimizer configs

lr = 0.01

model.compile(loss='categorical_crossentropy',
              #optimizer='adam',
              optimizer=Adagrad(lr=lr),
              #optimizer=RMSprop(),
              metrics=['categorical_accuracy'])

print('Train...')
earlystop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=1, mode='auto')
model.fit(
    train_x,
    train_y.values,
    batch_size=batch_size,
    validation_data=(valid_x, valid_y.values),
    verbose=2,
    epochs=10,
    callbacks=[earlystop]
)

Build model...
Train...
Train on 14684 samples, validate on 4895 samples
Epoch 1/10
  128/14684 [..............................] - ETA: 897s - loss: 1.1007 - categorical_accuracy: 0.25 - ETA: 654s - loss: 1.0963 - categorical_accuracy: 0.34 - ETA: 571s - loss: 1.0854 - categorical_accuracy: 0.40 - ETA: 542s - loss: 1.0911 - categorical_accuracy: 0.3984

KeyboardInterrupt: 

In [82]:
for i in range(1):
    print(i,'Train...')
    model.fit(
        train_x,
        train_y.values,
        batch_size=batch_size,
        validation_data=(valid_x, valid_y.values),
        verbose=1,
        epochs=1
    )

0 Train...
Train on 14684 samples, validate on 4895 samples
Epoch 1/1


In [18]:
score, acc = model.evaluate(valid_x, valid_y.values,batch_size=batch_size,)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.477756488433
Test accuracy: 0.827783452381


In [19]:
cols = list(train_y.columns)
print(cols)
test = df[df['part']=='test']#
test_x = test['tokens'].apply(list).tolist()
pred = model.predict_proba(test_x,verbose=False)
for i,e in enumerate(cols):
    test[e] = pred[:,i]
test.head()

['EAP', 'HPL', 'MWS']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0_level_0,author,text,part,tokens,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id02310,,Still as I urged our leaving Ireland with such...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.211296,0.019615,0.76909
id24541,,If a fire wanted fanning it could readily be f...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.996583,0.001558,0.001859
id00134,,And when they had broken down the frail door t...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.00414,0.995284,0.000576
id27757,,While I was thinking how I should possibly man...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.964054,0.033287,0.00266
id04081,,I am not sure to what limit his knowledge may ...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.584219,0.271716,0.144065


In [20]:
test[train_y.columns].to_csv('rnn_adagrad_121117_01.csv')

In [16]:
model.save('./models/rnn_stas_adagrad_031117.dat')

In [21]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)
Build model...
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2

KeyboardInterrupt: 

## ConvNN

In [3]:
df = pd.concat([
    pd.read_csv('train.csv',index_col='id'),
    pd.read_csv('test.csv',index_col='id')
    ],axis=0)
df['part'] = df['author'].isnull().apply(lambda x: 'test' if x else 'train')

In [4]:
tkn = Tokenizer(lower=True)
tkn.fit_on_texts(df.text)
df['tokens'] = tkn.texts_to_sequences(df.text)
max_text_len = max(df.tokens.apply(len))
fix_text_len = 100
print('Максимальная длина текста:',max_text_len,' слов')
df['tokens'] = list(sequence.pad_sequences(df['tokens'].values, maxlen=fix_text_len))
df.sample(10)

Максимальная длина текста: 861  слов


Unnamed: 0_level_0,author,text,part,tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id02290,MWS,Chapter Clerval then put the following letter ...,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id10551,,And Kuranes reigned thereafter over Ooth Narga...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id27961,,Different spectators of the game would advise ...,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id00293,HPL,"Whither he has gone, I do not know; but I have...",train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id06263,HPL,"Meanwhile poor Rhoby Harris, in her madness, g...",train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id25131,MWS,"""You talk of the future,"" she said, ""while the...",train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id08563,MWS,I was already well acquainted with what I may ...,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id15471,,"I threw myself upon my face, and clung to the ...",test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id12199,EAP,You never saw a more brilliant metallic lustre...,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
id09636,MWS,"A short month has destroyed a village, and whe...",train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
test_x = df[df['part']=='test']['tokens'].apply(list).tolist()
train = df[df['part']=='train']

In [6]:
train_x,valid_x,train_y,valid_y = train_test_split(train['tokens'].apply(list).tolist(),pd.get_dummies(train['author']))

In [18]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

# set parameters:
max_features = 100000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

print('Loading data...')
#(train_x, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(train_x), 'train sequences')
print(len(valid_x), 'test sequences')

print('Pad sequences (samples x time)')
train_x = sequence.pad_sequences(train_x, maxlen=maxlen)
valid_x = sequence.pad_sequences(valid_x, maxlen=maxlen)
print('train_x shape:', train_x.shape)
print('valid_x shape:', valid_x.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

print('Train...')
model.fit(
    train_x, 
    train_y.values,
    batch_size=batch_size,
    validation_data=(valid_x, valid_y.values),
    verbose=1,
    epochs=epochs,
)

Loading data...
14684 train sequences
4895 test sequences
Pad sequences (samples x time)
train_x shape: (14684, 400)
valid_x shape: (4895, 400)
Build model...
Train...
Train on 14684 samples, validate on 4895 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2460a9801d0>

In [19]:
cols = list(train_y.columns)
print(cols)
test = df[df['part']=='test']#
test_x = test['tokens'].apply(list).tolist()
pred = model.predict_proba(test_x,verbose=False)
for i,e in enumerate(cols):
    test[e] = pred[:,i]
test.head()

['EAP', 'HPL', 'MWS']


ValueError: Error when checking : expected embedding_4_input to have shape (None, 400) but got array with shape (8392, 100)

In [None]:
pred

AttributeError: 'list' object has no attribute 'shape'

In [None]:
test[train_y.columns].to_csv('cnn_adam_081117_02.csv')

In [11]:
import nltk
from nltk.util import ngrams

def word_grams(words, min=1, max=3):
    s = []
    for n in range(min, max+1):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s

print(word_grams('one two three four'.split(' ')))

['one', 'two', 'three', 'four', 'one two', 'two three', 'three four', 'one two three', 'two three four']


In [9]:
import re
re.sub("[^\w+]","",df.text[0])

'ThisprocesshoweveraffordedmenomeansofascertainingthedimensionsofmydungeonasImightmakeitscircuitandreturntothepointwhenceIsetoutwithoutbeingawareofthefactsoperfectlyuniformseemedthewall'

In [26]:
input_str = df.text[0]

words = [re.sub("[^\w+]","",str(input_str).split('","')[ind]) for ind in range(len(str(input_str).split('","')))]

In [37]:
re.split('^[a-zA-Z]',df.text[0])

['',
 'his process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.']

In [6]:


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

def CleanData(train, test):
    c = Counter()
    for df in [train, test]:
        for ind, row in tqdm_notebook(df.iterrows(), total = df.shape[0], desc = 'Build punctuation dict'):
            c += Counter(re.sub('\w+','',row.text))

    dct = dict(c)
    dct.pop(' ')

    for i in dct.keys():
        dct[i]=''

    for df in [train, test]:
        for index, row in tqdm_notebook(df.iterrows(), total = df.shape[0], desc = 'Clean texts'):
            df.text[index] = reduce(lambda x, y: x.replace(y, dct[y]), dct, df.text[index])

    return train, test

In [7]:
train, test = CleanData(train, test)

Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





In [8]:
train.text[0]

'This process however afforded me no means of ascertaining the dimensions of my dungeon as I might make its circuit and return to the point whence I set out without being aware of the fact so perfectly uniform seemed the wall'

In [12]:
word_grams(test.text[0].split(' '))

['Still',
 'as',
 'I',
 'urged',
 'our',
 'leaving',
 'Ireland',
 'with',
 'such',
 'inquietude',
 'and',
 'impatience',
 'my',
 'father',
 'thought',
 'it',
 'best',
 'to',
 'yield',
 'Still as',
 'as I',
 'I urged',
 'urged our',
 'our leaving',
 'leaving Ireland',
 'Ireland with',
 'with such',
 'such inquietude',
 'inquietude and',
 'and impatience',
 'impatience my',
 'my father',
 'father thought',
 'thought it',
 'it best',
 'best to',
 'to yield',
 'Still as I',
 'as I urged',
 'I urged our',
 'urged our leaving',
 'our leaving Ireland',
 'leaving Ireland with',
 'Ireland with such',
 'with such inquietude',
 'such inquietude and',
 'inquietude and impatience',
 'and impatience my',
 'impatience my father',
 'my father thought',
 'father thought it',
 'thought it best',
 'it best to',
 'best to yield']

In [None]:
tkn = Tokenizer(lower=True)
tkn.fit_on_texts(df.text)
df['tokens'] = tkn.texts_to_sequences(df.text)
max_text_len = max(df.tokens.apply(len))
fix_text_len = 100
print('Максимальная длина текста:',max_text_len,' слов')
df['tokens'] = list(sequence.pad_sequences(df['tokens'].values, maxlen=fix_text_len))
df.sample(10)