<img src="unicamp.png" width="150" height="150">

# Modelling

In [None]:
# Default
import os

# Numerical and IO
import numpy as np
import pandas as pd

# NLP
import pickle
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Models
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings("ignore")

## Read Data

In [None]:
# Load every numbered CSV of the augmented dataset into `files`, keyed by file
# name and ordered by the number before the first underscore of the name.
folder = 'augmented/'
files_path = os.listdir(folder)
files = {}

# Get in order.
# Only entries whose prefix is a number are considered, so stray directory
# entries (hidden folders, checkpoints, ...) no longer shift the expected count
# and trigger an opaque IndexError.
numbered = [f for f in files_path if f.split('_')[0].isdecimal()]
numbered.sort(key=lambda f: int(f.split('_')[0]))

for file in numbered:

    frame = pd.read_csv(folder + file)
    # The two CSV columns are renamed to the names used by the rest of the notebook
    frame.columns = ['question', 'answer']
    # Rows with a missing question or answer are discarded
    files[file] = frame.dropna()

### Create corpus

In [None]:
# Flat list of every text in the dataset, interleaved per row as
# question, answer, question, answer, ...
corpus = []

for frame in files.values():

    for _, record in frame.iterrows():

        corpus.extend((record['question'], record['answer']))

### IO objects

In [None]:
def save(file: str, data):
    """Pickle ``data`` to ``pickles/<file>.pickle``.

    Args:
        file: Base name of the target file, without folder or extension.
        data: Any picklable object.
    """

    folder = 'pickles/'
    # Create the output folder on first use; without it ``open`` raises
    # FileNotFoundError on a fresh checkout.
    os.makedirs(folder, exist_ok=True)
    with open(f'{folder}{file}.pickle', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load(file: str):
    """Return the object stored in ``pickles/<file>.pickle``.

    Args:
        file: Base name of the pickle file, without folder or extension.

    Returns:
        The unpickled object.
    """

    path = f'pickles/{file}.pickle'
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    with open(path, 'rb') as source:
        return pickle.load(source)

## Train Sentence Embedding

Reference for the Doc2Vec sentence-embedding approach: https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/

In [None]:
# Lower-case every corpus entry and split it into word tokens
tokenized = []
for text in corpus:
    tokenized.append(word_tokenize(text.lower()))

# Tag each token list with its position in the corpus
tagged = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(tokenized)
]

# Train the Doc2Vec model (50-dimensional vectors)
model = Doc2Vec(
    tagged,
    vector_size=50,
    window=2,
    min_count=1,
    epochs=100,
)

In [None]:
# Persist the trained Doc2Vec model as pickles/doc2vec.pickle
save('doc2vec', model)

In [None]:
# If the notebook was restarted, reload the pickled Doc2Vec model
# (pickles/doc2vec.pickle) instead of training it again
model = load('doc2vec')

## Prepare Data

Prepare data for the hierarchical classifier.
- Which subject?
- Which question?

In [None]:
# Which subject?
# Every question is labelled with the 1-based position of the file it came
# from; class_sub maps that label (as a string) back to the file name.
class_sub = {}
X_sub, y_sub = [], []

for number, name in enumerate(files, start=1):

    label = str(number)
    questions = files[name]['question'].values

    class_sub[label] = name
    X_sub.extend(questions)
    y_sub.extend([label] * len(questions))

save('class_sub', class_sub)

In [None]:
# Which question?
# Consecutive rows that share the same answer get the same label; the label
# counter advances whenever the answer differs from the previous row's.
# class_que maps each label (as a string) to its answer text.
class_que = {}
X_que, y_que = [], []
count = 0
last = ''

for frame in files.values():

    for _, record in frame.iterrows():

        answer = record['answer']

        # A new answer starts a new class
        if last != answer:
            last, count = answer, count + 1

        label = str(count)
        class_que[label] = answer
        X_que.append(record['question'])
        y_que.append(label)

save('class_que', class_que)

### Clean data

In [None]:
# Remove newline characters from every question text in both datasets
X_sub, X_que = (
    [text.replace('\n', '') for text in texts]
    for texts in (X_sub, X_que)
)

In [None]:
# Sanity check: number of samples and labels in each dataset
tuple(len(seq) for seq in (X_sub, y_sub, X_que, y_que))

## Modelling

In [None]:
def split_data(X: list, y: list, *, test_size: float = 0.1, random_state: int = 42):
    """Split samples and labels into train and test parts.

    Args:
        X: Samples.
        y: Labels, aligned with ``X``.
        test_size: Passed to ``train_test_split``; defaults to the 0.1
            previously hard-coded here.
        random_state: Seed passed to ``train_test_split``; defaults to the 42
            previously hard-coded here.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
def get_embedding(text: str):
    """Infer a vector for ``text`` with the global Doc2Vec ``model``.

    The text is lower-cased and word-tokenized before inference, mirroring
    the preprocessing applied to the training corpus.
    """

    return model.infer_vector(word_tokenize(text.lower()))

In [None]:
def make_it_vector(X: list):
    """Return the list of embeddings of the texts in ``X``, in input order."""

    return [get_embedding(text) for text in X]

In [None]:
def evaluate(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` with 5 folds and macro F1.

    Returns whatever ``cross_val_score`` returns for those settings.
    """

    fold_scores = cross_val_score(model, X, y, scoring='f1_macro', cv=5)
    return fold_scores

In [None]:
def get_network(classes_len, input_dim=50):
    """Build and compile a dense softmax classifier.

    Args:
        classes_len: Number of output classes (units of the softmax layer).
        input_dim: Size of each input vector. Defaults to 50, the value that
            was previously hard-coded here.

    Returns:
        A compiled ``keras.Sequential`` model with four 256-unit ReLU hidden
        layers, trained with Adam on categorical cross-entropy.
    """

    # Only the first layer declares the input size
    layers = [keras.layers.Dense(256, activation='relu', input_dim=input_dim)]
    layers.extend(keras.layers.Dense(256, activation='relu') for _ in range(3))
    layers.append(keras.layers.Dense(classes_len, activation='softmax'))

    # Named `network` so the local does not shadow the notebook's global `model`
    network = keras.Sequential(layers)

    network.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

    return network

In [None]:
def one_hot_label(ys, num_classes=None):
    """One-hot encode 1-based class labels.

    Label ``k`` is encoded as a row with a 1 in column ``k - 1``. The previous
    implementation compared labels against ``range(max_label)``, which shifted
    every row by one column and encoded the highest label as all zeros.

    Args:
        ys: Iterable of labels convertible with ``int`` (e.g. ``'1'``, ``'2'``).
        num_classes: Number of columns of the result. Defaults to the largest
            label in ``ys``; pass it explicitly so that different splits of the
            same data are encoded with the same width.

    Returns:
        Integer array of shape ``(len(ys), num_classes)``.

    Raises:
        ValueError: If a label is outside ``1..num_classes``.
    """

    labels = np.asarray([int(y) for y in ys], dtype=int)

    if num_classes is None:
        # Empty input has no largest label; fall back to zero columns
        num_classes = int(labels.max()) if labels.size else 0

    if labels.size and (labels.min() < 1 or labels.max() > num_classes):
        raise ValueError(f'labels must be in the range 1..{num_classes}')

    y_ohe = np.zeros((labels.size, num_classes), dtype=int)
    y_ohe[np.arange(labels.size), labels - 1] = 1

    return y_ohe

### Subject

In [None]:
# Hold out a test split of the subject dataset
X_train, X_test, y_train, y_test = split_data(X_sub, y_sub)

# Embed both splits (train first, then test)
X_train_vec, X_test_vec = make_it_vector(X_train), make_it_vector(X_test)

In [None]:
# Create Model
rf_model = RandomForestClassifier(random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
rf_cv_scores = evaluate(rf_model, X_train_vec, y_train)
print(f'CV macro-F1: {rf_cv_scores.mean():.3f} +/- {rf_cv_scores.std():.3f}')

# Fit and test
rf_model.fit(X_train_vec, y_train)
pred = rf_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

In [None]:
# Save the model: the fitted random forest is stored as
# pickles/sub_model.pickle
save('sub_model', rf_model)

In [None]:
# Create Model
mlp_model = MLPClassifier(max_iter=300, random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
mlp_cv_scores = evaluate(mlp_model, X_train_vec, y_train)
print(f'CV macro-F1: {mlp_cv_scores.mean():.3f} +/- {mlp_cv_scores.std():.3f}')

# Fit and test
mlp_model.fit(X_train_vec, y_train)
pred = mlp_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

In [None]:
# Create Model
gb_model = GradientBoostingClassifier(random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
gb_cv_scores = evaluate(gb_model, X_train_vec, y_train)
print(f'CV macro-F1: {gb_cv_scores.mean():.3f} +/- {gb_cv_scores.std():.3f}')

# Fit and test
gb_model.fit(X_train_vec, y_train)
pred = gb_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

In [None]:
# Create Model
lr_model = LogisticRegression(random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
lr_cv_scores = evaluate(lr_model, X_train_vec, y_train)
print(f'CV macro-F1: {lr_cv_scores.mean():.3f} +/- {lr_cv_scores.std():.3f}')

# Fit and test
lr_model.fit(X_train_vec, y_train)
pred = lr_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

In [None]:
# NOTE: Keras experiment, left disabled. Enabling it as written would rebind
# the global `model` that get_embedding() uses for Doc2Vec inference, so any
# later make_it_vector() call would break; use a different name if revived.
# model = get_network(16)

# X_train_nn = np.array(X_train_vec)
# X_test_nn = np.array(X_test_vec)
# y_train_ohe = one_hot_label(y_train)
# y_test_ohe = one_hot_label(y_test)

# history = model.fit(X_train_nn, y_train_ohe, epochs=100, validation_data=(X_test_nn, y_test_ohe))

### Question

In [None]:
# 1-based number of the subject whose question classifier is trained below
subject = 16

# X_que / y_que hold the rows of every file back to back, in the iteration
# order of `files`. The slice bounds are derived from the actual row counts
# instead of assuming exactly 50 rows per subject, which stops being true as
# soon as dropna() removed a row from any file.
rows_per_subject = [len(files[f]) for f in files]

start = sum(rows_per_subject[:subject - 1])
end = start + rows_per_subject[subject - 1]

X_que_filt = np.array(X_que)[start:end]
y_que_filt = np.array(y_que)[start:end]

In [None]:
# Hold out a test split of the selected subject's questions
X_train, X_test, y_train, y_test = split_data(X_que_filt, y_que_filt)

# Embed both splits (train first, then test)
X_train_vec, X_test_vec = make_it_vector(X_train), make_it_vector(X_test)

In [None]:
# Create Model
rf_model = RandomForestClassifier(random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
rf_cv_scores = evaluate(rf_model, X_train_vec, y_train)
print(f'CV macro-F1: {rf_cv_scores.mean():.3f} +/- {rf_cv_scores.std():.3f}')

# Fit and test
rf_model.fit(X_train_vec, y_train)
pred = rf_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

In [None]:
# Save the model: the fitted random forest for the selected subject is stored
# as pickles/<subject>_que_model.pickle
save(f'{subject}_que_model', rf_model)

In [None]:
# Create Model
mlp_model = MLPClassifier(max_iter=300, random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
mlp_cv_scores = evaluate(mlp_model, X_train_vec, y_train)
print(f'CV macro-F1: {mlp_cv_scores.mean():.3f} +/- {mlp_cv_scores.std():.3f}')

# Fit and test
mlp_model.fit(X_train_vec, y_train)
pred = mlp_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

In [None]:
# Create Model
gb_model = GradientBoostingClassifier(random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
gb_cv_scores = evaluate(gb_model, X_train_vec, y_train)
print(f'CV macro-F1: {gb_cv_scores.mean():.3f} +/- {gb_cv_scores.std():.3f}')

# Fit and test
gb_model.fit(X_train_vec, y_train)
pred = gb_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

In [None]:
# Create Model
lr_model = LogisticRegression(random_state=42)

# Evaluate
# The scores are stored and printed: a bare call that is not the last
# expression of a cell is neither displayed nor kept.
lr_cv_scores = evaluate(lr_model, X_train_vec, y_train)
print(f'CV macro-F1: {lr_cv_scores.mean():.3f} +/- {lr_cv_scores.std():.3f}')

# Fit and test
lr_model.fit(X_train_vec, y_train)
pred = lr_model.predict(X_test_vec)
f1_score(y_test, pred, average='macro')

## Compare and Select