In [None]:
def load_glove_embeddings(glove_file_path):
    """
    Loads GloVe embeddings into a dictionary with robust error handling.
    """
    embeddings = {}
    with open(glove_file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.strip().split()
            word = parts[0]

            try:
                vector = list(map(float, parts[1:]))
                embeddings[word] = vector
            except ValueError as e:
                print(f"Error parsing line {line_number}: {line.strip()} - {e}")

    return embeddings


glove_file_path = "glove.840B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)

In [2]:
import numpy as np
import pandas as pd
import ast

In [3]:
df = pd.read_csv("Word_Tokenized_TRAINING_DATASET.csv")
df["tokenized_text"] = df["tokenized_text"].apply(ast.literal_eval)

In [4]:
def get_article_embedding(tokenized_text, embedding_dict):
    word_embeddings = []
    for sentence in tokenized_text:
        for word in sentence:
            if word in embedding_dict:
                word_embeddings.append(embedding_dict[word])

    return np.mean(word_embeddings, axis=0)

In [5]:
df["article_embedding"] = df["tokenized_text"].apply(
    get_article_embedding, embedding_dict=glove_embeddings
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [6]:
df = df.dropna()

In [7]:
df = df[["label", "article_embedding"]]
df

Unnamed: 0,label,article_embedding
0,0,"[-0.022733239806414653, 0.1039745327147765, -0..."
1,1,"[-0.00011748223086896195, 0.12993715434500663,..."
2,0,"[-0.08255255649572685, 0.10741406329772099, -0..."
3,0,"[-0.03371140306748469, 0.12805397914110428, -0..."
4,1,"[-0.03214481517857146, 0.16008699241071428, -0..."
...,...,...
69293,1,"[-0.19786133333333333, 0.17139316666666668, -0..."
69294,1,"[0.08409574117647059, 0.008679176470588237, 0...."
69295,1,"[-0.12152158064516128, 0.17065751612903227, 0...."
69296,0,"[0.13795325, 0.011615249999999994, 0.056750666..."


In [8]:
df["article_embedding"] = df["article_embedding"].apply(
    lambda x: x.tolist() if isinstance(x, np.ndarray) else x
)

In [9]:
df.to_csv(
    "embedded_datasets/GloVe/GloVe_Embedded_Dataset.csv",
    encoding="utf-8",
    index=False,
)

# Generalization dataset


In [11]:
df = pd.read_csv("Word_Tokenized_GENERALIZATION_DATASET.csv")
df["tokenized_text"] = df["tokenized_text"].apply(ast.literal_eval)

In [12]:
df["article_embedding"] = df["tokenized_text"].apply(
    get_article_embedding, embedding_dict=glove_embeddings
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [13]:
df = df.dropna()

In [15]:
df = df[["label", "article_embedding"]]
df

Unnamed: 0,label,article_embedding
0,0,"[-0.053752294114244516, 0.07634543051337686, -..."
1,0,"[-0.02040734840579709, 0.13278928281573504, -0..."
2,1,"[0.008755360566448779, 0.16197980239651436, -0..."
3,0,"[-0.04732918329670325, 0.14267663296703312, 0...."
4,1,"[-0.010228732398921843, 0.12861340997304568, -..."
...,...,...
5985,1,"[-0.03298530886052995, 0.09652802845188284, -0..."
5986,0,"[-0.05746439514249245, 0.06366235355593386, -0..."
5987,0,"[-0.04954178220742176, 0.06829223741769763, -0..."
5988,1,"[-0.03130578169406023, 0.11930146348250588, 0...."


In [16]:
df["article_embedding"] = df["article_embedding"].apply(
    lambda x: x.tolist() if isinstance(x, np.ndarray) else x
)

In [17]:
df.to_csv(
    "embedded_datasets/GloVe/GloVe_Embedded_Generalization_Dataset.csv",
    encoding="utf-8",
    index=False,
)