In [1]:
import pickle
import random

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


In [2]:
# Reference for this approach: https://medium.com/analytics-vidhya/text-classification-using-word-embeddings-and-deep-learning-in-python-classifying-tweets-from-6fe644fcfc81


class Embeddings():
    """
    Read a word-embedding text file and build an embedding matrix from it.

    The file is expected to hold one entry per line: the word followed by its
    vector components, separated by single spaces (the layout of the GloVe
    ``.txt`` files this class was written for).
    """

    def __init__(self, path, vector_dimension):
        """
        Parameters
        ----------
        path : str
            Location of the embedding text file.
        vector_dimension : int
            Number of components per word vector. It is the column count of
            the matrix built by ``create_embedding_matrix`` and must match the
            vectors stored in the file.
        """
        self.path = path
        self.vector_dimension = vector_dimension

    @staticmethod
    def get_coefs(word, *arr):
        """Return ``(word, vector)`` with the components converted to float32."""
        return word, np.asarray(arr, dtype='float32')

    def get_embedding_index(self):
        """
        Parse the embedding file into a ``{word: float32 vector}`` dictionary.

        Undecodable bytes are dropped (``errors='ignore'``) and lines that
        contain only whitespace are skipped. When a word occurs more than
        once, the last occurrence wins.
        """
        embeddings_index = {}
        # The context manager guarantees the (potentially very large) file is
        # closed even if a line fails to parse.
        with open(self.path, encoding='utf-8', errors='ignore') as embedding_file:
            for line in embedding_file:
                if not line.strip():
                    continue
                # Split on a single space (not arbitrary whitespace) so that
                # tokens which are themselves exotic whitespace characters
                # stay intact.
                word, vector = self.get_coefs(*line.rstrip('\n').split(" "))
                embeddings_index[word] = vector
        return embeddings_index

    def create_embedding_matrix(self, tokenizer, max_features):
        """
        Build the embedding matrix for the vocabulary of ``tokenizer``.

        Parameters
        ----------
        tokenizer : object
            Any object exposing a ``word_index`` mapping of word -> integer
            index (e.g. a fitted Keras ``Tokenizer``).
        max_features : int
            Largest word index to include; words with a higher index are
            ignored.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(max_features + 1, vector_dimension)``. Row ``i``
            holds the vector of the word whose index is ``i``; rows of words
            missing from the embedding file (and any index not present in
            ``word_index``) stay all-zero.

        Raises
        ------
        ValueError
            If a vector read from the file does not have ``vector_dimension``
            components. Previously this was silently swallowed and produced
            an all-zero matrix.
        """
        model_embed = self.get_embedding_index()

        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        expected_shape = (self.vector_dimension,)
        for word, index in tokenizer.word_index.items():
            # `continue` rather than `break`: correct for any iteration order
            # of word_index, not only a frequency-sorted one.
            if index > max_features:
                continue
            vector = model_embed.get(word)
            if vector is None:
                # Out-of-vocabulary word: keep the zero row.
                continue
            if vector.shape != expected_shape:
                raise ValueError(
                    f"Embedding vector for {word!r} has shape {vector.shape}, "
                    f"expected {expected_shape}; check that vector_dimension "
                    f"matches the embedding file {self.path!r}"
                )
            embedding_matrix[index] = vector
        return embedding_matrix
    
class TextToTensor():
    """
    Turn raw strings into fixed-length integer sequences with a fitted tokenizer.
    """

    def __init__(self, tokenizer, max_len):
        """
        Parameters
        ----------
        tokenizer : object
            Fitted tokenizer exposing ``texts_to_sequences`` (e.g. a Keras
            ``Tokenizer``).
        max_len : int
            Length every sequence is padded or truncated to.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len

    def string_to_tensor(self, string_list: list) -> np.ndarray:
        """
        Convert a list of strings to a padded array for a deep learning model.

        Each string is mapped to its sequence of word indices by the
        tokenizer, then the sequences are passed to ``pad_sequences`` with
        ``maxlen=self.max_len`` (Keras defaults apply for the padding and
        truncation side).

        Returns
        -------
        numpy.ndarray
            The array produced by ``pad_sequences``, one row per input string.
        """
        sequences = self.tokenizer.texts_to_sequences(string_list)
        # The former debug print of the whole array was removed: it flooded
        # the notebook output on every call.
        return pad_sequences(sequences, maxlen=self.max_len)

In [3]:
def compute_tokenization_embedding(X_train,Y_train,embed_path,embed_dim=200,max_len=20,num_words=200000,save=False):
    """
    Fit a tokenizer on the training texts and build the matching embedding matrix.

    Parameters
    ----------
    X_train : iterable of str
        Training texts the tokenizer is fitted on.
    Y_train : sequence
        Training labels. Not used by this function; kept so existing callers
        keep working.
    embed_path : str
        Path of the embedding text file handed to ``Embeddings``.
    embed_dim : int
        Dimension of the vectors stored in ``embed_path``.
    max_len : int
        Not used by this function; kept so existing callers keep working.
    num_words : int
        Upper bound on the vocabulary size. It is lowered to the number of
        distinct words seen in ``X_train`` when that is smaller.
    save : bool
        When true, pickle the tokenizer to ``tokenizer.pickle`` and save the
        matrix with ``np.save('embeddings_matrix', ...)`` in the current
        working directory.

    Returns
    -------
    tuple
        ``(tokenizer, embedding_matrix)``. The function used to return
        nothing, which discarded both results unless they were saved to disk.
    """
    # Preprocessing the text (message kept for output compatibility; the
    # labels need no processing here).
    print('Preprocecing the text')

    # Tokenizing the text
    print('Tokenizing the text')
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    # Never ask for more words than the corpus actually contains.
    num_words = min(num_words, len(tokenizer.word_counts))
    # NOTE(review): Keras' texts_to_sequences appears to keep only indices
    # strictly below num_words, while the matrix below has a row for index
    # num_words as well - confirm whether num_words + 1 was intended here.
    tokenizer.num_words = num_words

    # Creating the embedding matrix
    print('Creating the embedding matrix')
    embedding = Embeddings(embed_path, embed_dim)
    embedding_matrix = embedding.create_embedding_matrix(tokenizer, num_words)

    if save:
        with open('tokenizer.pickle', 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        np.save('embeddings_matrix', embedding_matrix)

    return tokenizer, embedding_matrix


In [15]:
# Get the data: one example per line, positives labelled 1 and negatives 0.
print('get data')
# Context managers close the input files as soon as they have been read.
with open('train_pos_full_lem.txt',encoding ="latin-1") as f_pos:
    X_train_pos=[line.strip() for line in f_pos]
Y_train_pos=[1]*len(X_train_pos)

with open('train_neg_full_lem.txt',encoding ="latin-1") as f_neg:
    X_train_neg=[line.strip() for line in f_neg]
Y_train_neg=[0]*len(X_train_neg)

X_train=X_train_pos+X_train_neg
Y_train=Y_train_pos+Y_train_neg

# Shuffle texts and labels together, with a fixed seed for reproducibility.
random.seed(10)

data = list(zip(X_train, Y_train))

random.shuffle(data)

X_train, Y_train = zip(*data)

max_len = 25
embed_path='../data/glove.twitter.27B.200d.txt'

compute_tokenization_embedding(
X_train,
Y_train,
embed_path,
# Must equal the vector length stored in embed_path (the 200d file). The
# former value of 25 could not be assigned into the matrix rows, and the
# resulting errors were swallowed, leaving the matrix all zeros.
embed_dim=200,
max_len=max_len,
num_words=200000
)

get data
I AM HERE 
Preprocecing the text
Y train after [1 1 1 ... 0 1 0]
Tokenizing the text
Creating the embedding matrix
