In [None]:
import pandas as pd
import numpy
from keras.datasets import imdb
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
import numpy as np
numpy.random.seed(10)
import spacy
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, f1_score
from keras.preprocessing.text import Tokenizer, text_to_word_sequence, one_hot
from keras.callbacks import Callback, ModelCheckpoint
import pickle
from keras.models import model_from_json
import itertools

In [None]:
class F1_Score(Callback):
    """Keras callback that reports the weighted F1 score after every epoch.

    After each epoch the model's predictions are rounded to the nearest
    integer and compared against the targets with scikit-learn's
    ``f1_score(average='weighted')``. Scores are accumulated in
    ``train_f1s`` and ``val_f1s`` (one entry per epoch).

    Args:
        training_data: ``(inputs, targets)`` pair scored as the training set.
        validation_data: optional ``(inputs, targets)`` pair used when Keras
            does not populate ``self.validation_data`` itself (newer Keras
            versions leave it unset). Ignored when Keras provides one.
    """

    def __init__(self, training_data, validation_data=None):
        super().__init__()
        self.training_data = training_data
        # Stored under a private name so it never clashes with the attribute
        # that older Keras versions assign during fit().
        self._fallback_validation_data = validation_data
        self.train_f1s = []
        self.val_f1s = []

    def on_train_begin(self, logs=None):
        # Reset the history so the callback can be reused across fit() calls.
        self.train_f1s = []
        self.val_f1s = []

    def _predict_labels(self, inputs):
        """Return the model's predictions rounded to the nearest integer."""
        return np.rint(self.model.predict(inputs))

    def on_epoch_end(self, epoch, logs=None):
        train_predict = self._predict_labels(self.training_data[0])
        train_targ = self.training_data[1]
        _train_f1 = f1_score(train_targ, train_predict, average='weighted')
        self.train_f1s.append(_train_f1)
        print(" — train_f1: %.4f" % _train_f1)

        # Prefer the data Keras attached to the callback; fall back to the
        # pair given to the constructor when Keras did not provide any.
        validation_data = getattr(self, 'validation_data', None)
        if validation_data is None:
            validation_data = self._fallback_validation_data
        if validation_data is None:
            # Nothing to score against; only the training F1 is reported.
            return

        val_predict = self._predict_labels(validation_data[0])
        val_targ = validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='weighted')
        print(classification_report(val_targ, val_predict))
        self.val_f1s.append(_val_f1)
        print(" — val_f1: %.4f" % _val_f1)
        return

In [None]:

def train_test(df, proportion=(.7, .9)):
    """Split ``df`` into train and dev frames, stratified by exercise.

    Rows are grouped by the ``'exercise.name'`` column; each group is
    shuffled with ``sample(frac=1)`` and its first ``proportion[0]`` share
    goes to the train frame, the rest to the dev frame. The shape of each
    per-exercise split is printed.

    Args:
        df: DataFrame with an ``'exercise.name'`` column.
        proportion: sequence whose first entry is the train fraction. Further
            entries are accepted for backward compatibility but unused, since
            only two frames are produced.

    Returns:
        ``(train_df, dev_df)``, both with a fresh 0..n-1 index.
    """
    train_fraction = proportion[0]
    train_parts, dev_parts = [], []
    for exercise in df['exercise.name'].unique():
        df_exer = df[df['exercise.name'] == exercise].sample(frac=1)
        cut = int(train_fraction * len(df_exer))
        train, dev = df_exer.iloc[:cut], df_exer.iloc[cut:]
        print(train.shape, dev.shape, exercise)
        train_parts.append(train)
        dev_parts.append(dev)
    if not train_parts:
        # pd.concat rejects an empty list; mirror the empty-frame result.
        return pd.DataFrame(), pd.DataFrame()
    # Concatenate once instead of growing the frames inside the loop.
    train_df = pd.concat(train_parts, ignore_index=True)
    dev_df = pd.concat(dev_parts, ignore_index=True)
    return train_df, dev_df

In [None]:
import os
os.listdir('.')

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
#submissions = pd.read_pickle('mumuki_io_FINAL_anotado.pkl')
submissions = pd.read_pickle('IntroAlgo_final.pkl')

In [None]:
# Binary label: 1 when 'dropout_tray' is truthy, 0 otherwise (vectorised instead of a row-wise apply).
submissions['target'] = submissions['dropout_tray'].astype(bool).astype(int)

In [None]:
train_df, test_df = train_test(submissions)

In [None]:
X_train = train_df['content'].values
Y_train = train_df['target'].values
X_test = test_df['content'].values
Y_test = test_df['target'].values

In [None]:
#tokenizer = Tokenizer()
#tokenizer = Tokenizer(filters='', lower=False)
tokenizer = Tokenizer(num_words=35000, oov_token="<UNK>", lower=False, filters='')
tokenizer.fit_on_texts(X_train)

In [None]:
#Save tokenizer
with open('tokenizer-intro.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

#load tokenizer
#with open('tokenizerIntro.pickle', 'rb') as handle:
#    tokenizer = pickle.load(handle)

In [None]:
# NOTE(review): no cell in this notebook writes 'test_df_intro_30.pkl'; presumably test_df
# should be pickled under this name before downloading — confirm.
files.download('test_df_intro_30.pkl')

In [None]:
# Number of tokens kept per submission when padding/truncating.
max_length = 100
# The Embedding layer is sized with top_words, so it must match the tokenizer's
# num_words; otherwise token ids up to num_words-1 fall outside the table.
top_words = tokenizer.num_words
# Encode from the data frames rather than from X_train/X_test so that
# re-running this cell does not feed already-encoded sequences back in.
X_train = tokenizer.texts_to_sequences(train_df['content'].values)
X_test = tokenizer.texts_to_sequences(test_df['content'].values)


### prepare data for Network

In [None]:
X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

In [None]:
!pip install tensorboardcolab

In [None]:
# Explicit names instead of `import *`, so it stays clear where each name comes from.
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
tbc = TensorBoardColab()

In [None]:
import tensorflow as tf
from keras import backend as K


# Limit this TensorFlow session to 70% of the GPU memory.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.7
sess = tf.Session(config=config)
K.set_session(sess)

# Report F1 after each epoch and save a checkpoint for every epoch.
callbacks = [
    F1_Score((X_train, Y_train)),
    ModelCheckpoint("introAlgo.{epoch:02d}-{val_loss:.2f}.h5", monitor="val_loss", save_best_only=False, period=1),
]

# create the model
embedding_vecor_length = 256
# The embedding table must cover every index the tokenizer can emit
# (ids run up to num_words - 1), not just the first top_words of them.
tokenizer_vocab = tokenizer.num_words or len(tokenizer.word_index) + 1
embedding_input_dim = max(top_words, tokenizer_vocab)

model = Sequential()
model.add(Embedding(embedding_input_dim, embedding_vecor_length, input_length=max_length))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()
res = model.fit(np.asarray(X_train), Y_train, validation_data=(np.asarray(X_test), Y_test), epochs=20, batch_size=32, callbacks=callbacks)

# Zona de prueba
No prestar mucha atención

In [None]:
import os
os.listdir('.')
files.download('introAlgo.10-0.44.h5')

In [None]:
# Bind the checkpoint to `loaded_model` as well: the following cells compile
# `loaded_model` and then assign it to `model`, and no other cell defines it.
loaded_model = load_model('introAlgo.29-0.76.h5')
model = loaded_model

In [None]:
#compile loaded model
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model = loaded_model

In [None]:
# Encode and pad the test submissions once and run a single forward pass.
test_sequences = sequence.pad_sequences(tokenizer.texts_to_sequences(test_df['content']), maxlen=max_length)
# predict() returns shape (n, 1); flatten so each column holds scalars.
test_probabilities = model.predict(test_sequences).ravel()
# Same rule predict_classes applies to a single sigmoid output: class 1 when p > 0.5.
test_df['predicted_classes'] = (test_probabilities > 0.5).astype('int32')
test_df['predicted'] = test_probabilities

In [None]:
files.download('introAlgo.06-0.46.h5')

In [None]:
test_df.to_pickle('test-df-IO-256-predicted2.pkl')

In [None]:
files.download('test-df-IO-256-predicted2.pkl')

In [None]:
# NOTE(review): the pickle written earlier in this section is named
# 'test-df-IO-256-predicted2.pkl'; this reads a differently named file — confirm which one is intended.
submissions = pd.read_pickle('test-df-intro-256-predicted2.pkl')

In [None]:
def get_prediction(model, sub_content, max_length=100):
    """Return the model's predictions for already-tokenised submissions.

    Args:
        model: Keras model exposing ``predict``.
        sub_content: list of token-id sequences (one list per submission).
        max_length: length every sequence is padded/truncated to; defaults to
            the value previously hard-coded here.

    Returns:
        The array produced by ``model.predict`` on the padded sequences.
    """
    # Imported locally because the name `sequence` is rebound to a plain list
    # by a later cell of this notebook, which would break `sequence.pad_sequences`.
    from keras.preprocessing.sequence import pad_sequences

    padded = pad_sequences(sub_content, maxlen=max_length)
    # predict() yields the same probabilities predict_proba() did, and still
    # exists in Keras versions that dropped predict_proba.
    return model.predict(padded)

In [None]:
# Print how many test rows each exercise has, in order of first appearance.
exercise_names = test_df['exercise.name']
for exercise in exercise_names.unique():
    rows_for_exercise = test_df[exercise_names == exercise]
    print(exercise, len(rows_for_exercise))

In [None]:
# Fraction of 'esBisiesto' rows whose predicted class equals the target.
df = test_df.loc[test_df['exercise.name'] == 'esBisiesto']
is_correct = df['target'] == df['predicted_classes']
int(is_correct.sum()) / len(df)

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Histogram of the model outputs stored in df['predicted'].
fig, ax = plt.subplots()
ax.hist(df['predicted'], facecolor='g', alpha=0.75)
ax.set(title='Distribution of predicted scores', xlabel='predicted', ylabel='Number of rows')
ax.grid(True)
plt.show()

In [None]:
submissions[(submissions['target'] == 1) & (submissions['predicted'] >= 0.8)][['content', 'exercise.name','target','predicted_classes', 'predicted']]
# print(submissions[(submissions['predicted'] > 0.3) & (submissions['predicted'] < 0.5)][['content', 'submissions_status','target', 'predicted']].iloc[10,:]['content'])
# print(submissions[(submissions['predicted'] > 0.3) & (submissions['predicted'] < 0.5)][['content', 'submissions_status','target', 'predicted']].iloc[10,:]['target'])
# print(submissions[(submissions['predicted'] > 0.3) & (submissions['predicted'] < 0.5)][['content', 'submissions_status','target', 'predicted']].iloc[10,:]['predicted'])
# submissions.sort_values('predicted', ascending=True)

In [None]:
# Show the prediction and source text of one hand-picked submission (index label 8725).
row = submissions.loc[8725]
for column in ('predicted_classes', 'predicted', 'content'):
    print(row[column])

# Token ids the tokenizer produces for that submission.
for token_ids in tokenizer.texts_to_sequences([row['content']]):
    print(token_ids)

In [None]:
#get_prediction(model, [submissions.loc[8725]['content']])
#sequence.pad_sequences(tokenizer.texts_to_sequences(submissions['content']),maxlen=max_length)

# Tokenize the solution
picked_content = submissions.loc[8725]['content']
tokenized = tokenizer.texts_to_sequences([picked_content])[0]

# Window of 1: drop one token at a time, keeping (removed token, remaining tokens)
to_train = [
    (token, tokenized[:position] + tokenized[position + 1:])
    for position, token in enumerate(tokenized)
]

In [None]:
to_train

In [None]:
def print_tokenized(tokenizer, tokens):
    """Print the words for ``tokens``, each followed by a single space.

    Args:
        tokenizer: object whose ``index_word`` maps token ids to words.
        tokens: iterable of token ids.
    """
    words = (tokenizer.index_word[token] for token in tokens)
    print(''.join(word + ' ' for word in words))

# Baseline: the full submission and its prediction.
print_tokenized(tokenizer, tokenized)
baseline_prediction = get_prediction(model, [tokenized])
print(baseline_prediction[0][0])
print('\n')


# Then the prediction after removing each token in turn.
for removed_token, remaining_tokens in to_train:
    print("Remove token {}".format(tokenizer.index_word[removed_token]))
    print_tokenized(tokenizer, remaining_tokens)
    ablated_prediction = get_prediction(model, [remaining_tokens])
    print(ablated_prediction[0][0])
    print('\n')

# Character Level LSTM

In [None]:

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, BatchNormalization, Dropout
from keras import optimizers
from keras.utils import to_categorical

In [None]:
# some parameters
BATCH_SIZE = 1024  # batch size for the network
EPOCH_NUMBER = 10  # number of epochs to train
THRESHOLD = 5  # symbols appearing fewer times will be replaced by a placeholder

In [None]:
x_train = train_df['content']
y_train = train_df['target']

# Summary statistics of the program lengths (len() of each entry):
x_train.apply(len).describe()

In [None]:
# Count how often each symbol occurs across the training programs.
unique_symbols = Counter()

# Iterate the values directly; Series.iteritems() no longer exists in pandas 2.
for program in x_train:
    unique_symbols.update(program)

print("Unique symbols:", len(unique_symbols))
print(unique_symbols)

In [None]:
# Collect every symbol whose count is below THRESHOLD:
uncommon_symbols = [
    symbol for symbol, count in unique_symbols.items() if count < THRESHOLD
]

print("Uncommon symbols:", len(uncommon_symbols))
print(uncommon_symbols)

In [None]:
# Replace every uncommon symbol with a single placeholder (the first uncommon symbol).
# With no uncommon symbols the table is empty and translate() is a no-op,
# instead of failing on uncommon_symbols[0].
DUMMY = uncommon_symbols[0] if uncommon_symbols else ""
tr_table = str.maketrans("".join(uncommon_symbols), DUMMY * len(uncommon_symbols))

x_train = x_train.apply(lambda x: x.translate(tr_table))

In [None]:
# Size of the one-hot dimension used further down. After the replacement above
# there are (common symbols + 1 placeholder) distinct symbols, and the Keras
# Tokenizer numbers them from 1, reserving 0 (used for padding). The tokenizer
# only keeps indices strictly below num_words, so one extra slot is needed or
# the least frequent symbol is silently dropped.
num_unique_symbols = len(unique_symbols) - len(uncommon_symbols) + 2

tokenizer = Tokenizer(
    char_level=True,
    filters=None,
    lower=False,
    num_words=num_unique_symbols
)

tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

# Persist the fitted tokenizer so the same symbol-to-index mapping can be reused.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
# Pad/truncate every program to 100 symbols.
padded_sequences = pad_sequences(sequences, maxlen=100)

# Hold out 5% of the data as a validation set.
x_train, x_val, y_train, y_val = train_test_split(padded_sequences, y_train, test_size=0.05)

# Two stacked LSTMs followed by two dense layers; every stage is followed by
# batch normalisation and dropout, and a sigmoid unit produces the output.
model = Sequential([
    LSTM(150, input_shape=(100, num_unique_symbols), activation="tanh", return_sequences=True),
    BatchNormalization(),
    Dropout(0.4),
    LSTM(100, input_shape=(100, num_unique_symbols), activation="tanh"),
    BatchNormalization(),
    Dropout(0.4),
    Dense(100, activation="tanh"),
    BatchNormalization(),
    Dropout(0.4),
    Dense(50, activation="tanh"),
    BatchNormalization(),
    Dropout(0.4),
    Dense(1, activation="sigmoid"),
])

# One-hot encode the validation inputs once, up front.
x_val = to_categorical(np.array(x_val), num_classes=num_unique_symbols)

# SGD optimizer for the alternative compile call, which is currently disabled;
# the model is compiled with Adam.
sgd = optimizers.SGD(lr=0.01, momentum=0.9)
#model.compile(optimizer=sgd, loss="binary_crossentropy")
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
def kaggle_loss(y_true, y_pred):
    """Mean of the per-column log loss.

    Args:
        y_true: 2-D object supporting ``.shape`` and ``.iloc[:, i]``.
        y_pred: 2-D array indexed as ``y_pred[:, i]``.

    Returns:
        Sum of ``log_loss`` over the columns divided by the column count.
    """
    n_columns = y_true.shape[1]
    column_losses = (
        log_loss(y_true.iloc[:, column], y_pred[:, column])
        for column in range(n_columns)
    )
    return sum(column_losses) / n_columns


# Train epoch by epoch, logging the validation log loss to res_2.txt and
# saving the model after every epoch.
with open("res_2.txt", "w") as f:
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    for epoch in range(EPOCH_NUMBER):
        print("Epoch", epoch)
        # One-hot encode one chunk at a time rather than the whole training set at once.
        for i in range(0, len(x_train), BATCH_SIZE):
            batch = to_categorical(x_train[i:i+BATCH_SIZE], num_classes=num_unique_symbols)
            y_batch = y_train.iloc[i:i+BATCH_SIZE]
            model.fit(batch, y_batch, batch_size=256)
        # Validation loss for this epoch. kaggle_loss() indexes y_true.shape[1],
        # which a single target column does not have, so log_loss is used directly.
        # labels is given so the call also works if y_val holds only one class.
        res = log_loss(y_val, model.predict(x_val).ravel(), labels=[0, 1])
        print("Loss:", res)
        f.write("{}: {}\n".format(epoch, res))
        model.save("lstm_2_50_ep{}.h5".format(epoch + 1))

# Language Models
Build and train two different language models: one for non-dropout submissions and another for dropout submissions.

In [None]:
import os
os.listdir('.')

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
submissions = pd.read_pickle('IntroAlgo128-predicted.pkl')

In [None]:
# Keep only the rows whose 'predicted' value equals 1.
# NOTE(review): earlier in this notebook 'predicted' holds the raw model output and
# 'predicted_classes' the 0/1 label; presumably this pickle stores labels in 'predicted' — confirm.
dropout_submissions = submissions[submissions['predicted'] == 1]
dropout_submissions.shape

In [None]:
train_dropout_df, test_dropout_df = train_test(dropout_submissions)

In [None]:
X_train_dropout = train_dropout_df['content'].values
Y_train_dropout = train_dropout_df['target'].values
X_test_dropout = test_dropout_df['content'].values
Y_test_dropout = test_dropout_df['target'].values

In [None]:
# Concatenate all training submissions into one string, each followed by an
# end-of-submission marker. join() avoids the quadratic cost of repeated +=.
data = ''.join(elem.strip() + ' </s></s> ' for elem in X_train_dropout)

In [None]:
# Preview only the start of the corpus; printing all of it floods the cell output.
print(data[:1000])

In [None]:
# integer encode text
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(X_train_dropout)
encoded = tokenizer.texts_to_sequences(X_train_dropout)[0]

In [None]:
tokenizer.word_counts

In [None]:
sorted(tokenizer.word_counts.items(), key=lambda kv: -kv[1])

In [None]:
# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

In [None]:
#tokenizer.texts_to_sequences([data.split('</s></s>')[0]])[0]

In [None]:
# create line-based sequences: for every submission, each prefix of length >= 2
# becomes one training sample (all but the last token predict the last token).
# The loop no longer uses the name `sequence`, which would overwrite the imported
# keras.preprocessing.sequence module used by other cells.
sequences = list()
for line in data.split('</s></s>'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print('Total Sequences: %d' % len(sequences))

In [None]:
# pad input sequences to a fixed length of 20 tokens (the longest-sequence value
# computed here before was immediately overwritten, so it is dropped).
max_length = 20
# `seq` was never defined in this notebook; pad_sequences is imported directly
# from keras.preprocessing.sequence in the character-level section.
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

In [None]:
from keras.utils import to_categorical
# split into input and output elements
sequences = np.array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [None]:
sequences

In [None]:
X

In [None]:
y

In [None]:
y.shape

In [None]:
# define model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, epochs=50, verbose=2, batch_size=256)
#model.fit(X[:10000,:], y[:10000,:], epochs=500, verbose=2, batch_size=256)

In [None]:
# serialize model to JSON
# NOTE(review): the model above is fitted on sequences built from X_train_dropout,
# yet it is saved under a "nondropout" name — confirm the file name matches the data used.
model_json = model.to_json()
with open("language-model-intro-nondropout-128.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("language-model-intro-nondropout-128.h5")
print("Saved model to disk")

In [None]:
os.listdir('.')
files.download('language-model-intro-nondropout-128.h5')
files.download('language-model-intro-nondropout-128.json')

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
os.listdir('.')

In [None]:
# load json and create model; the context manager closes the file even on error
with open('language-model-intro-128.json', 'r') as json_file:
    loaded_model_json = json_file.read()
dropout_model = model_from_json(loaded_model_json)
# load weights into new model
dropout_model.load_weights("language-model-intro-128.h5")
print("Loaded model from disk")

In [None]:
# load json and create model; the context manager closes the file even on error
with open('language-model-intro-nondropout-128.json', 'r') as json_file:
    loaded_model_json = json_file.read()
no_dropout_model = model_from_json(loaded_model_json)
# load weights into new model
no_dropout_model.load_weights("language-model-intro-nondropout-128.h5")
print("Loaded model from disk")

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` by ``n_words`` words using a language model.

    At each step the current text is tokenised, pre-padded to ``max_length``
    and fed to the model; the most probable next index is mapped back to a
    word and appended.

    Args:
        model: Keras model whose ``predict`` returns one score per vocabulary index.
        tokenizer: fitted Keras ``Tokenizer`` (provides ``texts_to_sequences``
            and ``index_word``).
        max_length: input length the model expects.
        seed_text: text to start from.
        n_words: number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
    """
    in_text = seed_text
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad to a fixed length (`seq` was undefined here; pad_sequences is
        # imported from keras.preprocessing.sequence elsewhere in this notebook)
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # most probable next index; argmax over predict() is what predict_classes
        # did for a softmax output, and works on Keras versions that removed it
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # direct reverse lookup; indices without a word map to '' as before
        out_word = tokenizer.index_word.get(predicted_index, '')
        in_text += ' ' + out_word
    return in_text

In [None]:
print(generate_seq(dropout_model, tokenizer, max_length-1, 'calcular', 10))

In [None]:
print(generate_seq(no_dropout_model, tokenizer, max_length-1, 'calcular', 10))