# Load and preprocess training data

## Imports

In [1]:
import sys
import numpy as np
import pickle
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sys.path.append('../')
from utils import *


In [2]:
# Fixed length that every tweet sequence is padded or truncated to when the
# input matrix is built with pad_sequences (passed as `maxlen`).
# NOTE(review): the inline note says the longest tweet is 31 (presumably
# tokens), so a cap of 30 would cut the last token of at least one tweet --
# confirm this is intended.
MAX_TWEET_LENGTH = 30 # the longest tweet in the set is 31


## Load the training data

In [3]:
# Read the training CSV and split it by column position: the second-to-last
# column is taken as the tweet text, the last column as the label.
data = pd.read_csv('../data/train.csv')
raw_table = data.to_numpy()

tweets = raw_table[:, -2]
# Labels are cast to float up front so they are stored as a numeric array.
labels = raw_table[:, -1].astype(float)


In [16]:
# Load the three vocabulary lookups; only `words_to_index` is used further
# down in this notebook (as the tokenizer's word index).
# NOTE(review): load_dictionary is not defined in this notebook -- presumably
# it is provided by the `from utils import *` above; confirm, and confirm what
# each of the three returned mappings contains.
word_to_vector, words_to_index, index_to_words = load_dictionary()


## Create an input matrix

In [17]:
# Reuse the pre-built vocabulary instead of fitting the tokenizer on the
# tweets, so the indices match the ones in `words_to_index`.
tokenizer = Tokenizer()
tokenizer.word_index = words_to_index

# Convert every tweet to a list of word indices, then pad (or cut) each list
# at its end so that all rows have exactly MAX_TWEET_LENGTH entries.
# NOTE(review): padding uses pad_sequences' default fill value -- confirm that
# value is not also a real word index in `words_to_index`.
tweets = pad_sequences(
    tokenizer.texts_to_sequences(tweets),
    maxlen=MAX_TWEET_LENGTH,
    padding='post',
    truncating='post',
)


## Save the training input

In [8]:
# Bundle the padded tweet matrix and the labels under the keys 'X' and 'y',
# which is the layout load_training_input() reads back.
input_data = {
    'X': tweets,
    'y': labels
}

# Write through a context manager so the file handle is flushed and closed as
# soon as the dump finishes (the bare open() call left it open).
with open('../data/training_input.pkl', 'wb') as output_file:
    pickle.dump(input_data, output_file)


## Function to load the training input

In [9]:
def load_training_input(path: str = '../data/training_input.pkl') -> tuple:
    """Load the pickled training input and return it as ``(X, y)``.

    Args:
        path: Location of the pickle file. Defaults to the file written by
            the "Save the training input" cell.

    Returns:
        A tuple ``(X, y)`` holding the values stored under the ``'X'`` and
        ``'y'`` keys of the pickled dictionary.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the pickled dictionary lacks an ``'X'`` or ``'y'`` key.
    """
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at files produced by this project.
    # The context manager closes the file handle even if unpickling fails.
    with open(path, 'rb') as input_file:
        training_input = pickle.load(input_file)
    return training_input['X'], training_input['y']
    