### This was run on colab

In [1]:
from google.colab import drive
import pandas as pd
import torch

In [2]:
from transformers import BertTokenizer, BertModel #kills kernel if run with others

In [3]:
# Mount Google Drive
drive.mount('/content/gdrive')

# Define the file path to the just cleaned df
# (I also put them here: https://drive.google.com/drive/folders/1lXVVxWaR-fJ4htHx8J5C2HtoPSiTM_BX if you want to bind there)
path = "/content/gdrive/MyDrive/AdvancedML/final_project_data/metadata_w_2020articles_cleaned.csv"

df = pd.read_csv(path,  index_col=0)

# TODO: also load the lemmatized data and see how that

Mounted at /content/gdrive


In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
test_article = df.loc[df.loc[:, 'uuid'] == 'bcbc6bb2-406e-11ee-a96e-33dec8f414a2', :]
test_text = test_article['title_text'][0]
test_text

'trump shifts tone, says he’s ‘proud’ of fed amid coronavirus turmoil president trump on monday said he was “proud” of federal reserve chairman jerome powell’s handling of the economic turmoil brought on by the coronavirus pandemic, backing down from more than a year of criticizing his hand-picked fed chief and threatening his job security.trump praised powell during a monday evening briefing at the white house after the fed that morning drastically expanded its efforts to protect the u.s. economy and financ'

In [None]:
# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
encoding = tokenizer.encode_plus(test_text, add_special_tokens = True,
                                 truncation = True, return_tensors = "pt")

# Print the tokens
print("Original Text:", test_text)
print("Tokenized Text:", encoding)

Original Text: trump shifts tone, says he’s ‘proud’ of fed amid coronavirus turmoil president trump on monday said he was “proud” of federal reserve chairman jerome powell’s handling of the economic turmoil brought on by the coronavirus pandemic, backing down from more than a year of criticizing his hand-picked fed chief and threatening his job security.trump praised powell during a monday evening briefing at the white house after the fed that morning drastically expanded its efforts to protect the u.s. economy and financ
Tokenized Text: {'input_ids': tensor([[  101,  8398, 12363,  4309,  1010,  2758,  2002,  1521,  1055,  1520,
          7098,  1521,  1997,  7349, 13463, 21887, 23350, 17930,  2343,  8398,
          2006,  6928,  2056,  2002,  2001,  1523,  7098,  1524,  1997,  2976,
          3914,  3472, 11120,  8997,  1521,  1055,  8304,  1997,  1996,  3171,
         17930,  2716,  2006,  2011,  1996, 21887, 23350,  6090,  3207,  7712,
          1010,  5150,  2091,  2013,  2062,  20

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)

def get_cls_sentence(sentence):
    # Tokenize input sentence and convert to tensor
    input_ids = torch.tensor([tokenizer.encode(sentence, add_special_tokens=True, max_length=512)]).to(device)

    # Pass input through BERT model and extract embeddings for [CLS] token
    with torch.no_grad():
        outputs = model(input_ids)
        cls_embedding = outputs[0][:, 0, :]

    return cls_embedding.flatten()

In [None]:
test_article['bert_sent_embedding'] = test_article['title_text'].apply(lambda sentence: get_cls_sentence(sentence))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_article['bert_sent_embedding'] = test_article['title_text'].apply(lambda sentence: get_cls_sentence(sentence))


In [None]:
# starting with title only embeddings
df['bert_sentence_embedding_title_only'] = df['title'].apply(lambda sentence: get_cls_sentence(sentence))

In [None]:
df.head(2)

Unnamed: 0,uuid,source,year,article_text,title,title_text,bert_sentence_embedding_title_only
0,bcbc6bb2-406e-11ee-a96e-33dec8f414a2,wp,2020,president trump on monday said he was “proud” ...,"trump shifts tone, says he’s ‘proud’ of fed am...","trump shifts tone, says he’s ‘proud’ of fed am...","[tensor(-0.0145, device='cuda:0'), tensor(0.29..."
1,cfb4ce29-406e-11ee-a96e-33dec8f414a2,wp,2020,u.s. intelligence reportedly indicates iran’s ...,"iran paid bounties for targeting us troops, in...","iran paid bounties for targeting us troops, in...","[tensor(-0.8398, device='cuda:0'), tensor(-0.0..."


In [None]:
# pickling so that we don't have to rerun the embedding process every time
df.to_pickle("/content/gdrive/MyDrive/AdvancedML/final_project_data/clean_2020articles_w_title_embeddings.pkl")

## KMeans Clustering

In [4]:
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score # TODO read about these scoring mechanisms
from sklearn.decomposition import PCA


In [5]:
# if no rerunning the embedding phase, read in df from pickel
df = pd.read_pickle("/content/gdrive/MyDrive/AdvancedML/final_project_data/clean_2020articles_w_title_embeddings.pkl")

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
# prep embeddings for modeling:
# cpu_df = df['bert_sentence_embedding_title_only'].cpu()
# X = np.vstack(df)

# cuda_tensor = torch.tensor(df['bert_sentence_embedding_title_only'])

# # Move the tensor column to CPU memory
# cpu_tensor = cuda_tensor.cpu()

# # Convert the CPU tensor column to NumPy array using np.vstack()
# X = np.vstack(cpu_tensor.detach().numpy())

In [None]:
def eval_cluster(embedding):
    y_pred = kmeans.fit_predict(embedding)

    # Evaluate the performance using ARI, NMI, and FMI
    ari = adjusted_rand_score(df["target"], y_pred)
    nmi = normalized_mutual_info_score(df["target"], y_pred)
    fmi = fowlkes_mallows_score(df["target"], y_pred)

    # Print Metrics scores
    print("Adjusted Rand Index (ARI): {:.3f}".format(ari))
    print("Normalized Mutual Information (NMI): {:.3f}".format(nmi))
    print("Fowlkes-Mallows Index (FMI): {:.3f}".format(fmi))

In [None]:
def dimension_reduction(embedding, method):

    pca = PCA(n_components=2, random_state=42)

    pca_vecs = pca.fit_transform(embedding)

    # save our two dimensions into x0 and x1
    x0 = pca_vecs[:, 0]
    x1 = pca_vecs[:, 1]

    df[f'x0_{method}'] = x0
    df[f'x1_{method}'] = x1

In [None]:
def plot_pca(x0_name, x1_name, cluster_name, method):

    plt.figure(figsize=(12, 7))

    plt.title(f"TF-IDF + KMeans 20newsgroup clustering with {method}", fontdict={"fontsize": 18})
    plt.xlabel("X0", fontdict={"fontsize": 16})
    plt.ylabel("X1", fontdict={"fontsize": 16})

    sns.scatterplot(data=df, x=x0_name, y=x1_name, hue=cluster_name, palette="viridis")
    plt.show()

In [None]:
cpu_df = df['bert_sentence_embedding_title_only'].cpu()
# embedding = np.vstack(cpu_df['bert_sentence_embedding_title_only'])

AttributeError: 'DataFrame' object has no attribute 'detach'

In [None]:
method = 'bert_sentence_title_embedding'
# initialize kmeans with 3 centroids
kmeans = KMeans(n_clusters=3, random_state=42)


# fit the model
kmeans.fit(embedding)

# store cluster labels in a variable
clusters = kmeans.labels_

# Assign clusters to our dataframe
clusters_result_name = f'cluster_bert_embeddings'
df['bert_clusters'] = clusters

eval_cluster(embedding)

dimension_reduction(embedding, method)

plot_pca(f'x0_{method}', f'x1_{method}', cluster_name=clusters_result_name, method=method)

ValueError: only one element tensors can be converted to Python scalars