# **Data Load**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset (comma-separated, UTF-8, double-quoted fields) into a DataFrame
df = pd.read_csv(
    '/content/preprocess_text_Spacy_Nltk.csv',
    sep=',',
    encoding='utf-8',
    quotechar='"',
)


In [None]:
# Count the missing values in each column
df.isna().sum()

text              0
type              0
processed_Text    0
dtype: int64

# **Vector model**

# **1. Bag-of-Words (BoW) with scikit-learn (batch processing)**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn ONE vocabulary from the whole corpus. Calling fit_transform() on every
# batch would re-learn the vocabulary each time, so column j would stand for a
# different word (and the vectors would have different lengths) from batch to batch.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df['processed_Text'])

# Number of rows densified at a time; this only bounds peak memory of the
# intermediate dense matrix, it does not change the result
batch_size = 500

# Get the number of batches (ceiling division)
num_batches = (len(df) + batch_size - 1) // batch_size

# One dense count vector (a list with one entry per vocabulary word) per document
bow_vectors = []

# Transform the data in batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    batch_texts = df['processed_Text'].iloc[start_idx:end_idx]

    # transform() (not fit_transform()) reuses the corpus-wide vocabulary
    bow_matrix = count_vectorizer.transform(batch_texts)

    # Convert the sparse BoW matrix to dense rows and append them to the list
    bow_vectors.extend(bow_matrix.toarray().tolist())

# Store the BoW vectors in the 'vector' column of the DataFrame.
# NOTE: every vector now has len(count_vectorizer.vocabulary_) entries; if that is
# too large for memory, cap the vocabulary via CountVectorizer(max_features=...)
# or min_df=... rather than going back to per-batch fitting.
df['vector'] = bow_vectors

# Display the DataFrame after adding the vector column
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuter head conservative republic...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allow ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuter special counsel investigat...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser ge...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Count the missing values in each column, now including 'vector'
df.isna().sum()

text              0
type              0
processed_Text    0
vector            0
dtype: int64

# **2. Bag-of-Words (BoW) using Gensim:**

In [None]:
from gensim import corpora, matutils
import numpy as np

# Tokenise the whole corpus once by splitting on whitespace
tokenized_documents = [doc.split() for doc in df['processed_Text']]

# Build ONE dictionary (word -> integer id) for the whole corpus. A dictionary
# rebuilt per batch would assign different ids to the same word in different
# batches, so the resulting vectors would not be comparable across batches.
dictionary = corpora.Dictionary(tokenized_documents)
num_terms = len(dictionary)

# Number of documents densified at a time (bounds the size of the intermediate matrix)
batch_size = 100

# Get the number of batches (ceiling division)
num_batches = (len(tokenized_documents) + batch_size - 1) // batch_size

# One 1D count vector of length `num_terms` per document
one_d_vectors = []

# Process data in batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(tokenized_documents))
    batch_documents = tokenized_documents[start_idx:end_idx]

    # Sparse Bag-of-Words representation: a list of (word_id, count) pairs per document
    bow_corpus = [dictionary.doc2bow(doc) for doc in batch_documents]

    # corpus2dense returns a (num_terms, num_docs) matrix; transpose it so that
    # each ROW is one document
    dense_matrix = matutils.corpus2dense(
        bow_corpus, num_terms=num_terms, num_docs=len(bow_corpus)
    ).T

    # Keep one full row per document. Summing each row (axis=1) would collapse the
    # document to a single number (its token count) instead of a vector.
    one_d_vectors.extend(row.copy() for row in dense_matrix)

# Store the 1D vectors in the 'vector' column of the DataFrame.
# NOTE: each vector has `num_terms` entries; use dictionary.filter_extremes(...)
# before building the vectors if the vocabulary is too large for memory.
df['vector'] = one_d_vectors

# Display the DataFrame with the 1D vector representations
df.head()

# **3. Gensim's Word2Vec:**

In [None]:
!pip install gensim
import gensim.downloader as api

# Load the pre-trained Word2Vec model
word2vec_model = api.load("word2vec-google-news-300")



^C


KeyboardInterrupt: 

In [None]:
# Function to convert text to vector representation
def text_to_vector(text):
    """Look up the embedding of every known word in `text`.

    `text` is split on whitespace; each word that is present in the global
    `word2vec_model` contributes `word2vec_model[word]`, in order of appearance.
    Unknown words are skipped, so the returned list may be empty.
    """
    return [word2vec_model[token] for token in text.split() if token in word2vec_model]

# Convert every processed_Text entry and keep the result in the 'vector' column
processed_texts = df['processed_Text']
df['vector'] = processed_texts.map(text_to_vector)

# Show the first rows, including the new vector column
df.head()

NameError: name 'word2vec_model' is not defined

# **4. spaCy's Word2Vec model**

In [None]:
# Download spaCy's en_core_web_md package so that spacy.load("en_core_web_md") can find it.
# The installer's own output advises restarting the runtime if the package cannot be loaded afterwards.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


**Batch processing** (takes about 47 minutes)

In [None]:
import spacy

# Load spaCy model with pre-trained word vectors
# (the medium-sized English model, en_core_web_md). The resulting `nlp` object is
# used as a module-level global by the embedding function defined below.
nlp = spacy.load("en_core_web_md")

# Function to obtain Word2Vec embeddings for text
def get_word2vec_embeddings_batch(texts):
    """Return one averaged word-vector per text.

    Each text is tokenized with the global spaCy pipeline `nlp`; the vectors of
    all tokens that are neither punctuation nor whitespace are averaged.

    Only the tokenizer is run (`nlp.make_doc`) instead of the full pipeline
    (`nlp(text)`): `token.vector`, `token.is_punct` and `token.is_space` are
    available on a freshly tokenized document, so the tagger, parser and NER
    passes add run time without contributing to the result.

    Parameters
    ----------
    texts : iterable of str
        The documents to embed.

    Returns
    -------
    list
        One entry per input text, in order: the mean token vector, or `None`
        when the text has no usable tokens (e.g. an empty string).
    """
    text_vectors = []
    for text in texts:
        tokens = nlp.make_doc(text)  # tokenizer only; see docstring
        word_vectors = [
            token.vector
            for token in tokens
            if not token.is_punct and not token.is_space
        ]
        if word_vectors:
            text_vector = sum(word_vectors) / len(word_vectors)
        else:
            # Nothing to average; callers must be prepared for None
            text_vector = None
        text_vectors.append(text_vector)
    return text_vectors

# Embed the processed_Text column one slice of rows at a time
batch_size = 100  # Adjust the batch size as needed
num_batches = (len(df) + batch_size - 1) // batch_size
vectors = []
for start_idx in range(0, len(df), batch_size):
    # Positional slice; the final slice is simply shorter when len(df) is not a multiple
    batch_texts = df['processed_Text'].iloc[start_idx:start_idx + batch_size]
    vectors += get_word2vec_embeddings_batch(batch_texts)

# Attach the collected vectors to the DataFrame as the 'vector' column
df['vector'] = vectors

# Show the first rows, including the new vector column
df.head()

KeyboardInterrupt: 

# **5. BERT Embeddings with Hugging Face Transformers**

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
# Both come from the same 'bert-base-uncased' checkpoint name; they are used as
# module-level globals by get_bert_embeddings below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to obtain BERT embeddings for text
def get_bert_embeddings(text):
    """Embed `text` as the mean of BERT's final-layer hidden states.

    The text is encoded with the global `tokenizer` (PyTorch tensors, padding and
    truncation enabled), passed through the global `model` without tracking
    gradients, and the last hidden state is averaged over dimension 1. The
    result is squeezed and returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Inference only: no autograd bookkeeping needed
    with torch.no_grad():
        outputs = model(**encoded)

    hidden_states = outputs.last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy()

# Apply the function to the processed_Text column (one forward pass per row)
df['vector'] = df['processed_Text'].apply(get_bert_embeddings)

# Display the DataFrame after adding the vector column. A bare last expression
# gives the notebook's rich table rendering, as in the other cells, whereas
# print() would dump a truncated plain-text repr.
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# **6. Topic Models with Gensim's LdaModel**

In [None]:
from gensim.models import LdaModel
from gensim import corpora

# Tokenise the corpus once (whitespace split) and reuse the tokens for both the
# dictionary and the bag-of-words corpus instead of splitting every document twice
tokenized_docs = [doc.split() for doc in df['processed_Text']]

# Create a Gensim dictionary mapping each word to a unique integer ID
dictionary = corpora.Dictionary(tokenized_docs)

# Create a bag-of-words corpus: one list of (word_id, count) pairs per document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

# Train an LDA model on the corpus with the desired number of topics.
# LDA training is randomised; fixing random_state makes the learned topics (and
# therefore every downstream topic vector) reproducible between runs.
num_topics = 10  # Adjust the number of topics as needed
lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, random_state=42)

# Function to infer topic distribution for each document and return as vector representation
def infer_lda_vector(text):
    """Return whatever the global `lda_model` yields for the bag-of-words of `text`.

    The text is split on whitespace, converted to bag-of-words form with the
    global `dictionary`, and used to index `lda_model`.
    """
    return lda_model[dictionary.doc2bow(text.split())]

# Infer a topic representation for every processed_Text entry
processed_texts = df['processed_Text']
df['vector'] = processed_texts.map(infer_lda_vector)

# Show the first rows, including the new vector column
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuter head conservative republic...,"[(3, 0.10724926), (6, 0.033457294), (7, 0.0333..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allow ...,"[(2, 0.079763174), (4, 0.6116225), (6, 0.23486..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuter special counsel investigat...,"[(0, 0.830361), (2, 0.02960292), (7, 0.0119955..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser ge...,"[(0, 0.8336708), (2, 0.16247383)]"
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald t...,"[(3, 0.09799248), (5, 0.5902593), (6, 0.133344..."


In [None]:
# Report the pandas dtype under which the 'vector' column is stored
print(df.dtypes['vector'])

object


In [None]:
# Collect the distinct Python types of the objects stored in the 'vector' column
vector_column = df['vector']
unique_types = vector_column.map(type).unique()
print(unique_types)

[<class 'list'>]


In [None]:
def _dense_topic_vector(topic_probs, size):
    """Expand sparse (topic_id, probability) pairs into a fixed-length 1D array.

    Position k of the result holds the probability reported for topic k, and 0.0
    for topics that are absent from `topic_probs`. A 1D array is returned
    unchanged so that re-running the cell is harmless.
    """
    if isinstance(topic_probs, np.ndarray) and topic_probs.ndim == 1:
        return topic_probs
    dense = np.zeros(size, dtype=np.float32)
    for topic_id, probability in topic_probs:
        dense[int(topic_id)] = probability
    return dense

# Convert the 'vector' column from a list of (topic_id, probability) tuples to a
# 1D numpy array of length num_topics. Flattening the tuples directly would
# interleave topic ids with probabilities ([3.0, 0.107, 6.0, 0.033, ...]) and
# produce arrays of different lengths per document, which is not a usable
# feature vector.
df['vector'] = df['vector'].apply(lambda pairs: _dense_topic_vector(pairs, num_topics))
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuter head conservative republic...,"[3.0, 0.10724925994873047, 6.0, 0.033457294106..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allow ...,"[2.0, 0.07976317405700684, 4.0, 0.611622512340..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuter special counsel investigat...,"[0.0, 0.830361008644104, 2.0, 0.02960292063653..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser ge...,"[0.0, 0.8336707949638367, 2.0, 0.1624738276004..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald t...,"[3.0, 0.09799247980117798, 5.0, 0.590259313583..."


# **7. TF-IDF with scikit-learn's TfidfVectorizer (batch processing)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary AND the IDF weights from the whole corpus once. Calling
# fit_transform() on every batch would give each batch its own vocabulary and
# its own document frequencies, so vectors from different batches would have
# different lengths, column meanings and weights.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df['processed_Text'])

# Number of rows densified at a time; this only bounds peak memory of the
# intermediate dense matrix, it does not change the result
batch_size = 500

# Get the number of batches (ceiling division)
num_batches = (len(df) + batch_size - 1) // batch_size

# One dense TF-IDF vector (a list with one entry per vocabulary word) per document
tfidf_vectors = []

# Transform the data in batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    batch_texts = df['processed_Text'].iloc[start_idx:end_idx]

    # transform() (not fit_transform()) reuses the corpus-wide vocabulary and IDF weights
    tfidf_matrix = tfidf_vectorizer.transform(batch_texts)

    # Convert the sparse TF-IDF matrix to dense rows and append them to the list
    tfidf_vectors.extend(tfidf_matrix.toarray().tolist())

# Store the TF-IDF vectors in the 'vector' column of the DataFrame.
# NOTE: every vector now has len(tfidf_vectorizer.vocabulary_) entries; if that
# is too large for memory, cap the vocabulary via TfidfVectorizer(max_features=...)
# or min_df=... rather than going back to per-batch fitting.
df['vector'] = tfidf_vectors

# Display the DataFrame after adding the vector column (rich table rendering,
# as in the other cells, instead of print()'s plain-text dump)
df.head()

# **8. Doc2Vec with Gensim (batch processing)**

In [None]:
!pip install nltk



In [None]:
!pip install --upgrade nltk



In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenizer data needed by word_tokenize. Recent NLTK releases (which the
# preceding upgrade cell may install) look for the 'punkt_tab' resource instead
# of the older 'punkt' one, so fetch both to work with either version.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk

# Define a function to tag documents
def tag_docs(texts):
    """Wrap each text in a TaggedDocument tagged with its position in `texts`.

    Every text is tokenized with `word_tokenize`; the i-th text receives the
    single tag `i` (counting from 0). Returns the tagged documents as a list.
    """
    tagged_docs = []
    for position, doc in enumerate(texts):
        words = word_tokenize(doc)
        tagged_docs.append(TaggedDocument(words=words, tags=[position]))
    return tagged_docs

# Define a function for batch inference
def infer_vector_batch(texts, model):
    """Infer one vector per text with `model.infer_vector`.

    Each text is tokenized with `word_tokenize` and the tokens are handed to
    `model.infer_vector`; the results are returned as a list in input order.
    """
    return [model.infer_vector(word_tokenize(text)) for text in texts]

# Tag the corpus and train a Doc2Vec model on it
tagged_docs = tag_docs(df['processed_Text'])
doc2vec_model = Doc2Vec(vector_size=300, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(tagged_docs)
doc2vec_model.train(
    tagged_docs,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

# Number of rows handed to infer_vector_batch per call
batch_size = 100
num_batches = (len(df) + batch_size - 1) // batch_size

# Infer a vector for every document, one slice of rows at a time
vectors = []
for start_idx in range(0, len(df), batch_size):
    # Positional slice; the final slice is simply shorter when len(df) is not a multiple
    batch_texts = df['processed_Text'].iloc[start_idx:start_idx + batch_size]
    vectors += infer_vector_batch(batch_texts, doc2vec_model)

# Attach the inferred vectors to the DataFrame as the 'vector' column
df['vector'] = vectors

# Show the first rows, including the new vector column
df.head()


Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuter head conservative republic...,"[0.34631696, 0.9650738, -0.31735873, -0.636014..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allow ...,"[0.15196763, -0.9139972, -1.8225806, 0.957241,..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuter special counsel investigat...,"[0.24473184, 0.21103281, -0.5109348, 0.4812038..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser ge...,"[1.051458, 0.91770995, 0.68120617, 0.5949095, ..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald t...,"[-0.7704251, -0.035391483, 0.48849884, 0.76404..."


# **Create .csv**

In [None]:
# Save DataFrame to a CSV file
# Writes every column of df to 'Doc2Vec.csv' in the working directory, without the index.
# NOTE(review): the 'vector' column holds array-like objects rather than scalars
# (presumably the arrays returned by infer_vector); to_csv writes their string
# form — confirm the downstream reader can parse that back into numbers.
df.to_csv('Doc2Vec.csv', index=False)

NameError: name 'df' is not defined

In [None]:
# Create a download link for your file
# FileLink is the last expression of the cell, so the link to 'Doc2Vec.csv' (the
# file name used by the export cell) is rendered as the cell's output.
# NOTE(review): the '/content/...' input path suggests a Colab runtime — confirm
# that the rendered link is actually clickable there.
from IPython.display import FileLink
FileLink('Doc2Vec.csv')