In [1]:
import numpy as np
import pandas as pd
from itertools import product
from collections import Counter
import re
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
import scipy.stats as stats
import matplotlib.pylab as plt
from scipy.sparse import csr_matrix, vstack, hstack
from keras.layers import GRU, LSTM, Dense, Flatten, Dropout, Merge, InputLayer, Input
from keras.models import Sequential
from keras import regularizers
%matplotlib inline

Using Theano backend.


In [2]:
# Parse the training file into a list of (word, [(lemma, pos), ...]) entries.
# The header row is skipped and the first comma-separated field of each row is
# ignored (presumably a row id -- confirm against the data file).
data = []
with open('task2_lemmas_train', 'r') as infile:
    next(infile, None)  # discard the header line
    for row in infile:
        columns = row.strip().split(',')
        # Every field after the word looks like "<lemma>+<pos>".
        analyses = [tuple(item.split('+')) for item in columns[2:]]
        data.append((columns[1], analyses))
        
        

In [91]:
# Align every (word, lemma) pair on its longest common prefix and record how the
# remainder of the word maps to the remainder of the lemma.
#   bases   -- the shared prefixes
#   e       -- every observed lemma ending (used later to pick frequent suffixes)
#   endings -- endings[(word_ending, pos)][context] -> list of observed lemma endings,
#              where context is the last two / last one character(s) of the shared
#              prefix, or '$' as a context-free fallback
bases = []
endings = defaultdict(lambda: defaultdict(list))
e = []
for word, analyses in data:
    for lemma, pos in analyses:
        # Length of the common prefix of word and lemma.
        i = 0
        for word_char, lemma_char in zip(word, lemma):
            if word_char != lemma_char:
                break
            i += 1

        word_ending, lemma_ending = word[i:], lemma[i:]
        bases.append(word[:i])
        e.append(lemma_ending)

        by_context = endings[(word_ending, pos)]
        # Clamp the slice start at 0: with i == 1 the expression word[i - 2:i]
        # has a negative start, which wraps around to the end of the word and
        # yields '' instead of the single available prefix character.
        by_context[word[max(i - 2, 0):i]].append(lemma_ending)
        by_context[word[max(i - 1, 0):i]].append(lemma_ending)
        by_context['$'].append(lemma_ending)

In [100]:
# The 13 most frequent lemma endings become the suffix classes the network predicts.
suffixes = [
    lemma_ending
    for lemma_ending, _count in Counter(e).most_common(13)
]

In [116]:
# Collapse each list of observed lemma endings to its single most frequent value,
# so that endings[(word_ending, pos)][context] is one lemma ending.
# Not idempotent: running this again on the collapsed dict would apply Counter to
# a string and keep its most frequent *character*.
endings = {
    ending_key: {
        context: Counter(observed).most_common(1)[0][0]
        for context, observed in by_context.items()
    }
    for ending_key, by_context in endings.items()
}

In [None]:
# Show the whole (word_ending, pos) -> {context: lemma_ending} table.
# NOTE(review): this displays every entry; the table may be large.
endings

In [102]:
# Character vocabulary: every distinct symbol occurring in the training words,
# in sorted order, followed by '$' (the symbol used to pad words when encoding).
letters = list(np.unique(list(''.join(entry[0] for entry in data)))) + ['$']
# Position of each symbol in the one-hot encoding.
letter_index = {symbol: position for position, symbol in enumerate(letters)}
n = len(letters)

In [110]:
# Character-level LSTM over a 30-step one-hot sequence, followed by sigmoid
# units: three part-of-speech flags plus one flag per frequent lemma suffix.
model = Sequential([
    LSTM(100, input_shape=(30, n)),
    Dense(3 + len(suffixes), activation='sigmoid'),
])
model.compile(optimizer="rmsprop", loss="mse")

In [107]:
# Encode the training set.
#   X[row] -- one-hot matrix of shape (max_len, n): the word padded with '$'
#   y[row] -- multi-label target: [is_V, is_A, is_N] followed by one flag per
#             entry of `suffixes` that is the lemma ending of some analysis
max_len = 30  # must agree with the LSTM input_shape
n_outputs = 3 + len(suffixes)

# Preallocate instead of stacking per-letter arrays collected in Python lists.
X = np.zeros((len(data), max_len, n))
y = np.zeros((len(data), n_outputs))

for row, (word, lemmas) in enumerate(data):
    # Pad short words and truncate long ones so every sample has exactly
    # max_len steps; without truncation a longer word yields a ragged array.
    padded = (word + '$' * (max_len - len(word)))[:max_len]
    for position, letter in enumerate(padded):
        X[row, position, letter_index[letter]] = 1

    for lemma, pos in lemmas:
        if pos == 'V':
            y[row, 0] = 1
        elif pos == 'A':
            y[row, 1] = 1
        elif pos == 'N':
            y[row, 2] = 1

        # Length of the common prefix of word and lemma.
        i = 0
        while i < min(len(word), len(lemma)) and lemma[i] == word[i]:
            i += 1

        lemma_ending = lemma[i:]
        if lemma_ending in suffixes:
            y[row, 3 + suffixes.index(lemma_ending)] = 1

In [108]:
# Hold out 25% of the samples for validation. The fixed random_state makes the
# split (and therefore the reported scores) repeatable across kernel restarts.
words_train, words_test, X_train, X_test, y_train, y_test = train_test_split(
    [word for word, _ in data], X, y, train_size=0.75, random_state=0)

In [109]:
# Train one epoch at a time and, after each epoch, print the fraction of
# individual output flags predicted correctly (threshold 0.5) -- first on the
# training split, then on the held-out split.
for epoch in range(5):
    model.fit(X_train, y_train, batch_size=128, nb_epoch=1)

    for features, targets in ((X_train, y_train), (X_test, y_test)):
        y_pred = model.predict(features) > 0.5
        print(np.mean(y_pred == targets))

Epoch 1/1
0.945743425489
0.946719066082
Epoch 1/1
 2432/88980 [..............................] - ETA: 124s - loss: 0.0418

KeyboardInterrupt: 

In [111]:
# Fit on all of X and y (no hold-out) for 10 epochs with a smaller batch size.
# NOTE(review): this appears to continue from whatever weights `model` currently
# holds rather than starting from scratch -- confirm that is intended.
model.fit(X, y, batch_size=32, nb_epoch=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
   864/118640 [..............................] - ETA: 245s - loss: 0.0245

KeyboardInterrupt: 

In [112]:
# Read the test words: skip the header row and keep the second comma-separated
# field of every remaining line.
with open('task2_lemmas_test', 'r') as infile:
    next(infile, None)  # discard the header line
    test_data = [row.strip().split(',')[1] for row in infile]

In [113]:
# One-hot encode the test words with the same scheme as the training words.
# Note: this rebinds X (and resets y) -- the training arrays are no longer
# available under these names after this cell runs.
max_len = 30  # must agree with the LSTM input_shape
X = np.zeros((len(test_data), max_len, n))
y = []  # the test set has no labels
for row, word in enumerate(test_data):
    # Pad short words and truncate long ones to exactly max_len steps.
    padded = (word + '$' * (max_len - len(word)))[:max_len]
    for position, letter in enumerate(padded):
        # A character never seen in training has no vocabulary slot; leave its
        # step all-zero instead of failing with KeyError.
        column = letter_index.get(letter)
        if column is not None:
            X[row, position, column] = 1

In [114]:
# Score the encoded words in X; each output row has 3 part-of-speech units
# followed by one unit per entry of `suffixes`.
y_pred = model.predict(X)

In [117]:
# Turn the network outputs into "<lemma>+<POS>" answers, one per test word.
pos_tags = ('V', 'A', 'N')  # meaning of output units 0, 1, 2
ans = []
for word, a in zip(test_data, y_pred):
    # Only the first three outputs encode the part of speech. Taking the argmax
    # over the whole vector would let a strong suffix unit (index >= 3) win and
    # silently force the tag to 'N'.
    pos = pos_tags[int(np.argmax(a[:3]))]

    # Smallest split point whose word ending is known for this POS. The bound
    # on i prevents an endless loop when no such ending (not even '') is known.
    i = 0
    while i < len(word) and (word[i:], pos) not in endings:
        i += 1

    suffix_scores = a[3:]
    if max(suffix_scores) > 0.5:
        # The network is confident about one of the frequent lemma suffixes.
        lemma_ending = suffixes[np.argmax(suffix_scores)]
    else:
        # Fall back to the ending table: prefer the 2-character context, then
        # the 1-character context, then the context-free '$' entry. Slice starts
        # are clamped at 0 so a negative start cannot wrap around the word.
        by_context = endings.get((word[i:], pos), {})
        lemma_ending = ''  # unknown ending: keep the word unchanged
        for context in (word[max(i - 2, 0):i], word[max(i - 1, 0):i], '$'):
            if context in by_context:
                lemma_ending = by_context[context]
                break

    ans.append(word[:i] + lemma_ending + '+' + pos)

In [120]:
# Submission table: 1-based row ids alongside the predicted "<lemma>+<POS>" strings.
res = pd.DataFrame({
    'Id': np.arange(1, len(ans) + 1),
    'Category': ans,
})

In [121]:
# Write the submission file with columns Id, Category and no index column.
res.to_csv('ans.csv', columns=['Id', 'Category'], index=False)

In [57]:
# Print (word, predicted answer) pairs for eyeballing.
# NOTE(review): `ans` as built elsewhere in this file is aligned with `test_data`,
# not with `words_test`; this pairing is only meaningful if `ans` was computed
# from the held-out split -- confirm before relying on the output.
print(list(zip(words_test, ans)))

[('spolvereran', 'spolverare'), ('appassionata', 'appassionare'), ('spiumereste', 'spiumare'), ('spertichiate', 'sperticare'), ('vetrioleggereste', 'vetrioleggiare'), ('costai', 'coste'), ('scarruffammo', 'scarruffare'), ('rilegati', 'rilegare'), ('ribattiate', 'ribattare'), ('ergesse', 'ergrre'), ('fendente', 'fendire'), ('diradai', 'dirade'), ('esorbiterebbero', 'esorbitare'), ('condividevamo', 'condividrre'), ('dimissionando', 'dimissionare'), ('appannamento', 'appannamento'), ('handicapparono', 'handicappare'), ('sopravvalutarono', 'sopravvalutare'), ('incuneer\xc3\xa0', 'incuneare'), ('polarizzeremo', 'polarizzare'), ('associn', 'assocare'), ('inconsistente', 'inconsistire'), ('ruminassero', 'ruminare'), ('discerneremo', 'discerrre'), ('piallereste', 'piallare'), ('deploriamo', 'deplorare'), ('discesisti', 'discesire'), ('ruminan', 'ruminire'), ('autocontrollassi', 'autocontrollare'), ('oberereste', 'oberare'), ('derubrichiate', 'derubricare'), ('dissanguarono', 'dissanguare'), ('

In [15]:
# Number of rows whose output at index 2 exceeds the 0.5 threshold.
(y_pred[:, 2] > 0.5).sum()

5

In [42]:
# Number of distinct shared prefixes collected in `bases`.
np.unique(bases).size

26670

In [39]:
# Sorted list of the distinct values in `endings`.
# NOTE(review): the recorded output lists strings such as 'a_N', so `endings`
# was presumably a flat list of strings when this cell last ran -- not the dict
# built elsewhere in this file. Confirm before re-running.
sorted(np.unique(endings))

['_A',
 '_N',
 '_V',
 'a-dati_N',
 'a-estate_N',
 'a-gol_N',
 'a-italia_N',
 'a-lavoro_N',
 'a-ordinanza_N',
 'a-paese_N',
 'a-paga_N',
 'a-partito_N',
 'a_A',
 'a_N',
 'adonna_N',
 'aforte_N',
 'an_N',
 'are_V',
 'arsi_V',
 'asorte_N',
 'buono_A',
 'cere_V',
 'consistere_V',
 'dere_V',
 'desistere_V',
 'e-quadro_N',
 'e-spia_N',
 'e_A',
 'e_N',
 'e_V',
 'edere_A',
 'edere_V',
 'enere_V',
 'enire_V',
 'ere_V',
 'ernere_V',
 'esistere_V',
 'essere_V',
 'ettere_V',
 'gere_V',
 'ggere_V',
 'gliere_V',
 'grande_A',
 'guere_V',
 'ia_N',
 'iare_V',
 'igere_V',
 'imere_V',
 'ingere_V',
 'io_A',
 'io_N',
 'ire_A',
 'ire_V',
 'lere_V',
 'mpere_V',
 'ndere_V',
 'nere_V',
 'nire_V',
 'o-chiave_N',
 'o-leninismo_N',
 'o-radar_N',
 'o-stato_N',
 'o_A',
 'o_N',
 'oclan_N',
 'ogruppo_N',
 'olere_V',
 'olista_N',
 'omorta_N',
 'ompere_V',
 'ondere_V',
 'opattuglia_N',
 'oprassedere_A',
 'oprassedere_V',
 'ore_A',
 'orire_V',
 'otere_V',
 'oviro_N',
 'percuotere_V',
 're_A',
 're_V',
 'rere_V',
 'riper