<img src="unicamp.png" width="150" height="150">

# Modelling

In [1]:
# Default
import os

# Numerical and IO
import numpy as np
import pandas as pd

# NLP and model
import pickle
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

## Read Data

In [2]:
folder = 'augmented/'
files_path = os.listdir(folder)
files = {}

# Get in order: keep only entries named '<number>_...' and sort them by that
# number. Looking each index up with `[...][0]` raised IndexError as soon as
# the folder held a stray entry (e.g. '.ipynb_checkpoints') or the numbering
# had a gap; filtering + sorting gives the same order without that failure.
numbered = sorted(
    (int(name.split('_')[0]), name)
    for name in files_path
    if name.split('_')[0].isdecimal()
)

for prefix, file in numbered:

    frame = pd.read_csv(os.path.join(folder, file))
    # Every CSV is expected to hold exactly two columns: question, answer
    frame.columns = ['question', 'answer']
    # Store a cleaned copy instead of mutating in place, so re-running the
    # cell always starts from the raw file contents
    files[file] = frame.dropna()

### Create corpus

In [None]:
# Flatten every (question, answer) pair of every loaded file into one list of
# sentences, keeping the question immediately before its answer.
corpus = []

for frame in files.values():
    for row_index, pair in frame.iterrows():
        corpus.extend((pair['question'], pair['answer']))

### IO objects

In [3]:
def save(file, data):
    """Pickle ``data`` to ``pickles/<file>.pickle``.

    Parameters
    ----------
    file : str
        Base name of the pickle, without folder or extension.
    data : object
        Any picklable object.
    """

    folder = 'pickles/'
    # Create the output folder on first use; without this, open() raises
    # FileNotFoundError when 'pickles/' does not exist yet.
    os.makedirs(folder, exist_ok=True)
    with open(f'{folder}{file}.pickle', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load(file):
    """Read back the object stored in ``pickles/<file>.pickle``.

    ``file`` is the base name only (no folder, no extension). The unpickled
    object is returned as-is.
    """
    pickle_dir = 'pickles/'
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    with open(f'{pickle_dir}{file}.pickle', 'rb') as stream:
        return pickle.load(stream)

## Train Sentence Embedding

Source for the Doc2Vec sentence-embedding approach: https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/

In [None]:
# Lowercase every corpus sentence and split it into word tokens
tokenized = [word_tokenize(sentence.lower()) for sentence in corpus]

# Tag each token list with its position in the corpus
tagged = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(tokenized)
]

# Fit the Doc2Vec model on the tagged sentences
model = Doc2Vec(
    documents=tagged,
    vector_size=50,
    window=2,
    min_count=1,
    epochs=100,
)

In [None]:
# Persist the trained model as pickles/doc2vec.pickle so it can be reloaded
# later without retraining
save('doc2vec', model)

In [None]:
# Disabled example: tokenize an unseen sentence and infer its embedding with
# the trained model. Uncomment to try it out.
# test_doc = word_tokenize("I had pizza and pasta".lower())
# test_doc_vector = model.infer_vector(test_doc)

In [4]:
# If the notebook was restarted: reload the saved model from
# pickles/doc2vec.pickle instead of re-running the training cell
model = load('doc2vec')

## Prepare Data

Prepare data for the hierarchical classifier.
- Which subject?
- Which question?

In [5]:
# Which subject?
# Each file is one subject: its position in `files` (as a string) is the
# class label, and class_sub maps that label back to the file name.
class_sub = {str(position): name for position, name in enumerate(files)}
X_sub, y_sub = [], []

for label, name in class_sub.items():

    questions = files[name]['question'].values
    X_sub.extend(questions)
    # One copy of the subject label per question of that file
    y_sub.extend([label] * len(questions))

In [6]:
# Which question?
# Consecutive rows that share the same answer form one class. `count` is the
# running class id (first class is '1') and `last` the answer seen on the
# previous row; class_que maps each class id back to its answer text.
class_que = {}
X_que, y_que = [], []
count = 0
last = ''

for frame in files.values():

    for row_index, record in frame.iterrows():

        answer = record['answer']

        # A different answer than on the previous row opens a new class
        if last != answer:
            count += 1
            last = answer

        label = str(count)
        class_que[label] = answer
        X_que.append(record['question'])
        y_que.append(label)

### Clean data

In [7]:
# Remove embedded line breaks from every question text of both datasets
X_sub, X_que = [
    [text.replace('\n', '') for text in questions]
    for questions in (X_sub, X_que)
]

In [8]:
# Sanity check: inputs and labels should have the same length for each task
len(X_sub), len(y_sub), len(X_que), len(y_que)

(800, 800, 800, 800)

## Modelling

### Subject

In [9]:
def split_data(X, y, test_size=0.1, random_state=42, stratify=None):
    """Split samples and labels into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``. The
    defaults reproduce the previously hard-coded behaviour (10% test split,
    seed 42, no stratification), so existing ``split_data(X, y)`` calls are
    unaffected.

    Parameters
    ----------
    X : sequence
        Input samples.
    y : sequence
        Labels, aligned with ``X``.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` for a repeatable shuffle.
    stratify : array-like or None, default None
        Forwarded to ``train_test_split``; pass the labels to keep class
        proportions equal in both partitions.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )

    return X_train, X_test, y_train, y_test

In [12]:
# Split the subject-level questions and labels into train and test sets
X_train, X_test, y_train, y_test = split_data(X_sub, y_sub)