In [2]:
import numpy as np
import pandas as pd

In [48]:
data = np.array(pd.read_csv("Dataset/Text_Emotion_Data.csv"))
stopwords = np.array(pd.read_csv("Dataset/stopwords.txt", sep=" ", header=None)).flatten()

In [20]:
X = data[:, 0]
y = data[:, 1]

In [32]:
import re

def remove_none_alpha(x):
    regex = re.compile('[^a-zA-Z ]')
    return regex.sub('', x)

In [33]:
stopwords = [remove_none_alpha(w) for w in stopwords]

In [69]:
X_seq = []
for sentence in X:
    words_seq = remove_none_alpha(sentence).lower().split(" ")
    words_seq = [w for w in words_seq if len(w) > 2]
    words_seq = [w for w in words_seq if w not in stopwords]
    X_seq.append(words_seq)

In [122]:
max_len = len(max(X_seq, key=len))

# create a new list of lists with the desired size, padding with empty strings
sequences = [['' for _ in range(max_len)] for _ in range(len(X_seq))]

# copy the elements from the original lists into the new list, padding with empty strings as needed
for i, sequence in enumerate(X_seq):
    sequences[i][:len(sequence)] = sequence

In [173]:
my_list = [1, 2, 3]
threshold = 6

my_list = [0] * (threshold - len(my_list)) + my_list

print(my_list)

[0, 0, 0, 1, 2, 3]


In [175]:
import numpy as np

def one_hot_encode(sequence, word_to_idx):
    num_words = len(word_to_idx)
    encoding = np.zeros((len(sequence), num_words))
    for i, word in enumerate(sequence):
        if word in word_to_idx:
            encoding[i, word_to_idx[word]] = 1
    return encoding

def one_hot_encode2(sequence, word_to_idx, max_bit_num):
    encoding = np.zeros((len(sequence), max_bit_num))
    for i, word in enumerate(sequence):
        if word in word_to_idx:            
            binary_string = bin(word_to_idx[word])[2:]  # exclude the "0b" prefix
            binary_array = np.array([int(digit) for digit in binary_string])
            binary_array = np.pad(binary_array, (0, max_bit_num - len(binary_array)), 'constant')

            encoding[i, :] = binary_array
    return encoding

unique_words = list(set([word for sequence in sequences for word in sequence]))
word_to_idx = {word: idx for idx, word in enumerate(unique_words)}

max_bit_num = 1
while(True):
    if(2**max_bit_num > len(unique_words)): break
    max_bit_num += 1

encoded_sequences = []
for sequence in sequences:
    encoded_sequences.append(one_hot_encode2(sequence, word_to_idx, max_bit_num))


In [121]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Flatten the list of sequences and create a pandas DataFrame
words = pd.DataFrame([word for sequence in sequences for word in sequence], columns=['word'])

# Create a OneHotEncoder object
encoder = OneHotEncoder()

# Fit the encoder to the DataFrame
encoder.fit(words)

# Transform the DataFrame using the fitted encoder
transformed = encoder.transform(words).toarray()

# Split the transformed array back into the original sequences
offsets = [0] + [len(sequence) for sequence in sequences]
offsets = [sum(offsets[:i]) for i in range(len(offsets))]
transformed_sequences = [transformed[offsets[i]:offsets[i+1]] for i in range(len(offsets)-1)]

# Print the transformed sequences
print(transformed_sequences)


[array([], shape=(0, 8), dtype=float64), array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.]])]


In [109]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Create a set of all unique words in the sequences
unique_words = set()
for sequence in sequences:
    unique_words.update(sequence)

# Create a mapping from words to indices
word_to_index = {word: index for index, word in enumerate(unique_words)}

# Create a new list to hold the one-hot encoded vectors
encoded_sequences = []

# Encode each sequence in the list using one-hot encoding
for sequence in sequences:
    encoded_sequence = np.zeros((len(sequence), len(unique_words)))
    for i, word in enumerate(sequence):
        index = word_to_index[word]
        encoded_sequence[i][index] = 1
    encoded_sequences.append(encoded_sequence)

# Create a OneHotEncoder object for future use
encoder = OneHotEncoder()

# Fit the encoder to the encoded sequences to obtain the categories
encoder.fit(np.concatenate(encoded_sequences))

# Transform the encoded sequences using the fitted encoder
transformed_sequences = [encoder.transform(encoded_sequence).toarray() for encoded_sequence in encoded_sequences]

# Print the transformed sequences
print(transformed_sequences)


KeyboardInterrupt: 

In [None]:
X_train = X[:-150]
X_test = X[-150:]
y_train = y[:-150]
y_test = y[-150:]