# This notebook performs all data preprocessing needed

## 0 - Defining parameters

Here you define all parameter of the pipeline

In [1]:
# CONTEXT_SIZE is the number of tokens that you will considerate to predict the next
CONTEXT_SIZE = 5

# VOCAB_LEN is the max unique tokens to be considered in the vocab dict
VOCAB_LEN = 70000

## 1 - Importing libs

Here you will import all necessary libs

In [15]:
from src.data import *

import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
import pickle

## 2 - Reading raw dataset

Here the raw dataset is loaded

In [3]:
# Read dataset
data = pd.read_csv("../deliver/train/train.csv")
data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


## 3 - Preprocessing raw dataset

In [4]:
# Printing some examples
unique_tokens = ' '.join(data['text'].values).lower()
train_tokens = unique_tokens.split()
print(len(train_tokens))

523356


In [5]:
# Printing how many unique tokens is there
print('Length of unique tokens: ', len(set(train_tokens)))

Length of unique tokens:  44895


In [6]:
print('First 10 tokens:',train_tokens[:10])

First 10 tokens: ['this', 'process,', 'however,', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the']


In [7]:
# Effectively building our vocabulary
vocab = build_vocab(train_tokens, vocab_size=VOCAB_LEN)

# Adding the 3 author in vocab
vocab['EAP'] = len(vocab) 
vocab['MWS'] = len(vocab)
vocab['HPL'] = len(vocab)

print(f"Encode for EAP: {vocab['EAP']} MWS: {vocab['MWS']} HPL: {vocab['HPL']}")

print(f'Vocab has {len(vocab)} tokens')
print(f'10 sample tokens: {list(itertools.islice(vocab.keys(), 10))}')

Encode for EAP: 44897 MWS: 44898 HPL: 44899
Vocab has 44900 tokens
10 sample tokens: ['the', 'of', 'and', 'to', 'a', 'i', 'in', 'was', 'that', 'my']


In [8]:
# Just an auxiliary "transposed" vocabulary to generate words

vocab_t = defaultdict(list)
for k, v in vocab.items():
    vocab_t[v].append(k)

In [9]:
# Exporting vocabulary to further usage
with open('obj/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
    
with open('obj/vocab_t.pkl', 'wb') as f:
    pickle.dump(vocab_t, f)

In [10]:
# Example of usage of get_ngrams function
example = 'hey my name is lucas and i am pleasure to meet you! i really like how you are'
example = example.split()
encoded_tokens = get_ngrams(example, vocab ,n = 6)

print("ENCODED TOKENS:\n\n", encoded_tokens)

print("\n\nDECODED TOKENS:", 35*' ', "TARGET TOKEN","\n\n")
for i,f in enumerate(encoded_tokens[0]):
    print(tokens2word(list(f), vocab_t), 20*' ',tokens2word([encoded_tokens[1][i]], vocab_t))

ENCODED TOKENS:

 ([[44896, 44896, 44896, 44896, 44896, 33093], [44896, 44896, 44896, 44896, 33093, 9], [44896, 44896, 44896, 33093, 9, 304], [44896, 44896, 33093, 9, 304, 23], [44896, 33093, 9, 304, 23, 44895], [33093, 9, 304, 23, 44895, 2], [9, 304, 23, 44895, 2, 5], [304, 23, 44895, 2, 5, 118], [23, 44895, 2, 5, 118, 803], [44895, 2, 5, 118, 803, 3], [2, 5, 118, 803, 3, 833], [5, 118, 803, 3, 833, 44895], [118, 803, 3, 833, 44895, 5], [803, 3, 833, 44895, 5, 423], [3, 833, 44895, 5, 423, 76], [833, 44895, 5, 423, 76, 113], [44895, 5, 423, 76, 113, 35]], [9, 304, 23, 44895, 2, 5, 118, 803, 3, 833, 44895, 5, 423, 76, 113, 35, 56])


DECODED TOKENS:                                     TARGET TOKEN 


<pad> <pad> <pad> <pad> <pad> hey                      my
<pad> <pad> <pad> <pad> hey my                      name
<pad> <pad> <pad> hey my name                      is
<pad> <pad> hey my name is                      <unk>
<pad> hey my name is <unk>                      and
hey my name is 

## 4 - Generating and exporting dataset 

In [11]:
l, t, a = generate_df(data, vocab, vocab_t, len_context = CONTEXT_SIZE)

In [12]:
len(l)

523356

In [13]:
df = pd.DataFrame({'context': l, 'target': t, 'author': a})
df.head()

Unnamed: 0,context,target,author
0,"[44896, 44896, 44896, 44896, 44897]",44895,44897
1,"[44896, 44896, 44896, 44897, 44895]",8206,44897
2,"[44896, 44896, 44897, 44895, 8206]",141,44897
3,"[44896, 44897, 44895, 8206, 141]",1330,44897
4,"[44897, 44895, 8206, 141, 1330]",30,44897


In [16]:
# Spliting in train and validation dataset
df_train, df_val = train_test_split(df, test_size = 0.1, random_state = 42, shuffle= True, stratify=df.author)

df_val_authors = df_val.author

In [18]:
# Exporting as .csv to further usage

df_train.to_csv("../data/df_train.csv", index = False)
df_val.to_csv("../data/df_val.csv", index = False)

Now you must have "df_train.csv" and "df_val.csv" files in your "data" folder.