In [1]:
import numpy as np
import pandas as pd
import ast

In [2]:
from gensim.models import KeyedVectors

fasttext_vectors = KeyedVectors.load_word2vec_format(
    "fasttext_crawl-300d-2M-subword.vec", binary=False
)

In [3]:
df = pd.read_csv("Word_Tokenized_TRAINING_DATASET_SPACY.csv")
df["tokenized_text"] = df["tokenized_text"].apply(ast.literal_eval)

In [4]:
def get_article_embedding(tokenized_text, model):
    word_embeddings = []
    for sentence in tokenized_text:
        for word in sentence:
            if word in model.key_to_index:
                word_embeddings.append(model[word])
    return np.mean(word_embeddings, axis=0)

In [5]:
df["article_embedding"] = df["tokenized_text"].apply(
    get_article_embedding, model=fasttext_vectors
)

In [6]:
df = df.dropna()

In [7]:
df = df[["label", "article_embedding"]]
df

Unnamed: 0,label,article_embedding
0,0,"[0.10095935, 0.076495625, 0.022189142, 0.08946..."
1,1,"[0.10799982, 0.07051642, 0.024416761, 0.090431..."
2,0,"[0.10385486, 0.072066605, 0.021343006, 0.08585..."
3,0,"[0.10086881, 0.07678322, 0.023888903, 0.088769..."
4,1,"[0.09223259, 0.0710651, 0.022807341, 0.0932719..."
...,...,...
69293,1,"[0.088141896, 0.07901621, 0.013374326, 0.09493..."
69294,1,"[0.0893523, 0.07542462, 0.02782615, 0.10500157..."
69295,1,"[0.087977156, 0.08304228, 0.027856376, 0.08149..."
69296,0,"[0.059253834, 0.08475193, 0.0045307684, 0.0752..."


In [8]:
df["article_embedding"] = df["article_embedding"].apply(
    lambda x: x.tolist() if isinstance(x, np.ndarray) else x
)

In [9]:
df.to_csv(
    "embedded_datasets/FastText/FastText_Embedded_Dataset_Spacy.csv",
    encoding="utf-8",
    index=False,
)

# Generalization dataset


In [30]:
df = pd.read_csv("Word_Tokenized_GENERALIZATION_DATASET.csv")
df["tokenized_text"] = df["tokenized_text"].apply(ast.literal_eval)

In [31]:
df["article_embedding"] = df["tokenized_text"].apply(
    get_article_embedding, model=fasttext_vectors
)

df = df.dropna()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [32]:
df = df[["label", "article_embedding"]]
df

Unnamed: 0,label,article_embedding
0,0,"[0.0009952491, -0.009249775, 0.088978045, 0.01..."
1,0,"[0.006638067, -0.027202059, 0.07487161, 0.0171..."
2,1,"[0.00058326195, -0.04074411, 0.07755698, 0.006..."
3,0,"[0.0059480257, -0.036587894, 0.037072897, 0.02..."
4,1,"[-0.00839588, -0.04971924, 0.08441708, 0.01357..."
...,...,...
5985,1,"[-0.00972143, -0.033943724, 0.061194275, 0.017..."
5986,0,"[-0.003399276, -0.019133002, 0.07433559, 0.015..."
5987,0,"[-0.0019355846, -0.022927374, 0.08191271, 0.01..."
5988,1,"[-0.0008060219, -0.024534158, 0.06894726, 0.01..."


In [33]:
df["article_embedding"] = df["article_embedding"].apply(
    lambda x: x.tolist() if isinstance(x, np.ndarray) else x
)

In [34]:
df.to_csv(
    "embedded_datasets/FastText/FastText_Embedded_Generalization_Dataset.csv",
    encoding="utf-8",
    index=False,
)