In [1]:
# Import the libraries
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Import the data
labeled_data = pd.read_csv('../../labeled_data.csv')[0:1000]
unlabeled_data_text = pd.read_csv('../../unlabeled_data.csv')[0:1000]
test_data_text = pd.read_csv('../../test_data.csv')[0:1000]

# Get just the text data from the labeled data
labeled_data_text = labeled_data[['text']]
labeled_data_text.head(n=5)

# Concatenate all the text data
text_df = pd.concat([labeled_data_text, unlabeled_data_text, test_data_text])

# Convert from df to list
text = text_df['text'].tolist()

# Confirm the length
len(text)

3000

In [14]:
# Tokenize the text
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(text)

# Create a squence from the tokens
sequences = tokenizer.texts_to_sequences(text)

In [15]:
# Have a look at the first sequence
print(sequences[0])

[1, 161, 2057, 12, 6, 37, 22, 31, 284, 10, 5, 155, 65, 22, 330, 124, 31, 22, 2058, 136, 354, 6, 16, 40, 2059, 5, 752, 5298, 14, 9, 8838, 72, 3235, 10, 1, 437, 1563, 6, 118, 673, 12, 14, 6, 6, 3, 36, 14, 16, 971, 1, 1662, 779, 17, 394, 156, 11, 1, 361, 6, 6, 2162, 1819, 22, 78, 114, 1204, 11, 1081, 4, 2944, 5, 155, 15, 1732, 71, 37, 1, 284, 682, 12, 589, 972, 22, 23, 4, 377, 54, 4, 2944, 8839, 6, 6, 6, 19, 33, 112, 79, 59, 320, 11, 13, 1450, 6526, 250, 22, 4521, 4522, 8840, 456, 10, 5, 207, 2534, 6, 6, 457, 879, 3, 164, 5299, 17, 2724, 8841, 6, 6, 8842, 1413]


In [13]:
# Have a look at the word to index mapping
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'a': 5,
 '\r': 6,
 'was': 7,
 'of': 8,
 'it': 9,
 'for': 10,
 'in': 11,
 'is': 12,
 'my': 13,
 'that': 14,
 'but': 15,
 'they': 16,
 'with': 17,
 'we': 18,
 'this': 19,
 'not': 20,
 'on': 21,
 'you': 22,
 'have': 23,
 'had': 24,
 'were': 25,
 'at': 26,
 'so': 27,
 'food': 28,
 'be': 29,
 'good': 30,
 'are': 31,
 'me': 32,
 'place': 33,
 'as': 34,
 'there': 35,
 'like': 36,
 'if': 37,
 'out': 38,
 'all': 39,
 'just': 40,
 'very': 41,
 'here': 42,
 'service': 43,
 'would': 44,
 'when': 45,
 'one': 46,
 'get': 47,
 'time': 48,
 'great': 49,
 'their': 50,
 'our': 51,
 'from': 52,
 'back': 53,
 'up': 54,
 'or': 55,
 'an': 56,
 "it's": 57,
 'he': 58,
 'no': 59,
 'go': 60,
 'what': 61,
 'she': 62,
 'about': 63,
 'really': 64,
 'which': 65,
 'will': 66,
 'been': 67,
 'because': 68,
 "don't": 69,
 'more': 70,
 'only': 71,
 'some': 72,
 'them': 73,
 'us': 74,
 'by': 75,
 'your': 76,
 'got': 77,
 'can': 78,
 'do': 79,
 'also': 80,
 'even': 81,
 'after': 8

In [28]:
# Add padding to the sequences
MAX_SEQUENCE_LENGTH = 100
data = pad_sequences(sequences, 
                     maxlen=MAX_SEQUENCE_LENGTH,
                     padding='pre') # Add padding to the start if needs padding