In [1]:
import ast
import zipfile

import pandas as pd
import numpy as np
from numpy import zeros

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# pip install gensim
import gensim
import gensim.models
from gensim.models import Word2Vec

# Reading data

In [2]:
# Load the training split from the cleaned CSV file.
train_data = pd.read_csv(filepath_or_buffer="../data/train_data_cleaning.csv")

In [3]:
# The CSV stores `content_ready` as text, so each cell is parsed back into a
# Python object with ast.literal_eval. Values that are already lists are kept
# as they are, which makes this cell safe to re-run: calling literal_eval on
# an already-parsed list would raise ValueError.
train_data['content_ready'] = train_data['content_ready'].apply(
    lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
)

In [4]:
# Load the test split from the cleaned CSV file.
test_data = pd.read_csv(filepath_or_buffer="../data/test_data_cleaning.csv")

In [5]:
# The CSV stores `content_ready` as text, so each cell is parsed back into a
# Python object with ast.literal_eval. Values that are already lists are kept
# as they are, which makes this cell safe to re-run: calling literal_eval on
# an already-parsed list would raise ValueError.
test_data['content_ready'] = test_data['content_ready'].apply(
    lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
)

# TF-IDF

In [6]:
# The identity analyzer hands every `content_ready` entry to the vectorizer
# unchanged, i.e. each entry is used directly as the document's feature sequence.
tfidf_vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)

# Learn the vocabulary and IDF weights from the training documents only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=train_data['content_ready'])

# ... keep the learned vocabulary terms for later inspection ...
feature_names = tfidf_vectorizer.get_feature_names_out()

# ... and encode the test documents with the already-fitted vectorizer.
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=test_data['content_ready'])

In [7]:
# Persist both TF-IDF matrices in SciPy's sparse .npz format.
sparse.save_npz(file='../data/X_train_tfidf.npz', matrix=X_train_tfidf)
sparse.save_npz(file='../data/X_test_tfidf.npz', matrix=X_test_tfidf)

# word2vec

In [8]:
def get_doc_vec(words, w2vec_model):
    """
    Average the word vectors of a document into a single document vector.

    Arg:
        - words: a list of words (the document)
        - w2vec_model: model exposing `wv.key_to_index` for vocabulary
          membership and `wv[word]` for vector lookup

    Return:
        The element-wise mean of the vectors of all words found in the
        vocabulary (repeated words count once per occurrence), or None
        when none of the words is in the vocabulary.
    """
    keyed_vectors = w2vec_model.wv
    vocabulary = keyed_vectors.key_to_index

    # Words missing from the vocabulary are skipped.
    known_vectors = [keyed_vectors[token] for token in words if token in vocabulary]

    if not known_vectors:
        return None
    return np.mean(known_vectors, axis=0)

In [9]:
# Word2Vec is trained on the training documents only.
corpus = train_data['content_ready']

# Hyperparameters
size_vect = 100     # dimensionality of the word vectors
size_window = 15    # context window size
ch_sg = 1           # 1 selects skip-gram
min_word_cnt = 10   # passed as min_count: minimum word frequency threshold

# Build and train the model on the entire corpus.
# NOTE(review): training uses several worker threads; per the gensim docs,
# exact run-to-run reproducibility requires a single worker — confirm whether
# that matters here.
model = Word2Vec(
    sentences=corpus,
    vector_size=size_vect,
    window=size_window,
    min_count=min_word_cnt,
    sg=ch_sg,
    workers=5,
)


In [10]:
# Attach one averaged Word2Vec vector per document; `model` is forwarded to
# get_doc_vec as its second argument.
train_data['w2vec'] = train_data['content_ready'].apply(get_doc_vec, args=(model,))
test_data['w2vec'] = test_data['content_ready'].apply(get_doc_vec, args=(model,))

In [11]:
# Inspect the document vectors computed for the training split.
train_data.loc[:, 'w2vec']

0       [-0.021934662, -0.045313407, -0.20713057, -0.0...
1       [-0.01478289, 0.21770811, -0.21053442, 0.06472...
2       [-0.010699491, -0.011163153, -0.246315, 0.0106...
3       [-0.023718074, 0.046120156, -0.12290713, -0.04...
4       [-0.16364671, 0.061136227, -0.16094913, -0.043...
                              ...                        
9131    [-0.10141687, -0.30549508, -0.54992586, 0.0652...
9132    [0.0094024455, -0.055430166, -0.007694498, 0.0...
9133    [0.117975764, 0.11803543, 0.029490937, -0.0583...
9134    [0.017669259, -0.12720896, -0.18396588, -0.150...
9135    [-0.05806205, 0.023231583, -0.34358263, 0.1442...
Name: w2vec, Length: 9136, dtype: object

In [12]:
def w2v_normalization(x_data, w2vec_column, size_vector):
    '''
    Stack per-document vectors into a DataFrame and drop incomplete rows.

    x_data: object whose length and `.index` define the rows of the result
    w2vec_column: iterable of per-document vectors, in the same order as x_data
    size_vector: number of columns (vector dimensionality) of the result

    Returns a DataFrame indexed like x_data, minus every row containing NaN.
    '''
    # Start from an all-zero matrix and fill it one document at a time.
    # NOTE(review): entries that are None presumably become NaN rows on
    # assignment and are removed by dropna() below — confirm.
    matrix = np.zeros((len(x_data), size_vector))
    row = 0
    for doc_vec in w2vec_column:
        matrix[row, :] = doc_vec
        row += 1

    # Reuse the index of x_data so surviving rows can be traced back to it.
    frame = pd.DataFrame(matrix, index=x_data.index)

    # Discard rows that contain NaN values.
    return frame.dropna()

In [13]:
# Turn the per-document vectors of each split into a dense feature table.
x_train_w2v = w2v_normalization(
    x_data=train_data['content_ready'],
    w2vec_column=train_data['w2vec'],
    size_vector=size_vect,
)
x_test_w2v = w2v_normalization(
    x_data=test_data['content_ready'],
    w2vec_column=test_data['w2vec'],
    size_vector=size_vect,
)

In [14]:
# Write the Word2Vec feature tables to CSV.
# NOTE(review): index=False discards the row labels, and w2v_normalization()
# removes rows containing NaN, so the saved rows may no longer line up
# one-to-one with the source data — confirm downstream consumers handle this.
x_train_w2v.to_csv(path_or_buf='../data/x_train_w2v.csv', index=False)
x_test_w2v.to_csv(path_or_buf='../data/x_test_w2v.csv', index=False)

# GloVe

In [15]:
# Maps each word to its embedding, stored as a list of floats.
glove_embeddings = {}

# Read the embeddings text file straight from the zip archive.
with zipfile.ZipFile('../data/glove.6B.100d.txt.zip', 'r') as zip_ref, \
        zip_ref.open('glove.6B.100d.txt') as f:
    for raw_line in f:
        # Lines arrive as bytes; after decoding, the first whitespace-separated
        # field is the word and the remaining fields are its coefficients.
        fields = raw_line.decode().split()
        glove_embeddings[fields[0]] = list(map(float, fields[1:]))

In [16]:
# Report how many words were loaded into the embedding table.
print('Loaded %s word vectors.' % (len(glove_embeddings),))

Loaded 400000 word vectors.


In [17]:
def map_words_to_embeddings(text, embeddings_index, embedding_dim):
    """
    Map a document to the average of its words' embeddings.

    Arg:
        - text: iterable of words
        - embeddings_index: mapping from word to embedding vector, queried with `.get`
        - embedding_dim: length of the zero vector returned as a fallback

    Return:
        The element-wise mean of the embeddings of all words found in
        `embeddings_index`, or a zero vector of length `embedding_dim`
        when no word is found.
    """
    # Look every word up once; words without an embedding yield None and are dropped.
    hits = [vec for vec in map(embeddings_index.get, text) if vec is not None]

    if not hits:
        return np.zeros(embedding_dim)  # no known word: fall back to zeros
    return np.mean(hits, axis=0)


In [18]:
# One averaged GloVe vector (dimension 100) per training document, stacked
# into a single array.
vectorized_train_data = np.array([
    map_words_to_embeddings(tokens, glove_embeddings, 100)
    for tokens in train_data['content_ready']
])

In [None]:
# One averaged GloVe vector (dimension 100) per test document, stacked
# into a single array.
vectorized_test_data = np.array([
    map_words_to_embeddings(tokens, glove_embeddings, 100)
    for tokens in test_data['content_ready']
])

In [None]:
# Store the GloVe document vectors as compressed .npz archives; each archive
# holds its array under the key "data".
np.savez_compressed(file='../data/vectorized_train_data.npz', data=vectorized_train_data)
np.savez_compressed(file='../data/vectorized_test_data.npz', data=vectorized_test_data)