# Clustering and keywords

Her bruker jeg vektorrepresentasjonene (embeddings) av tekstene til å lage grupperinger (clusters) basert på innhold. I tillegg finner jeg nøkkelord fra hver gruppe slik at man enklere kan se hva tekstene i hver gruppe handler om, og se om det vi har funnet kan stemme. 

In [1]:
%pip install keybert yake -quiet

# Basic imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

# Clustering and keyword extraction imports
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from keybert import KeyBERT
from sklearn.decomposition import PCA 
import yake

# Special imports for plots
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
import plotly.express as px
import plotly.graph_objects as go



Usage:   
  /opt/anaconda3/envs/embed/bin/python -m pip install [options] <requirement specifier> [package-index-options] ...
  /opt/anaconda3/envs/embed/bin/python -m pip install [options] -r <requirements file> [package-index-options] ...
  /opt/anaconda3/envs/embed/bin/python -m pip install [options] [-e] <vcs project url> ...
  /opt/anaconda3/envs/embed/bin/python -m pip install [options] [-e] <local project path> ...
  /opt/anaconda3/envs/embed/bin/python -m pip install [options] <archive url/path> ...

no such option: -u
Note: you may need to restart the kernel to use updated packages.


  from tqdm.autonotebook import tqdm, trange


In [2]:
## Equinor news (2024-1998)

# Load the news texts. The rest of the notebook indexes this object first by a
# year string and then by position, e.g. equinor_split_news["2024"][i].
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# pickles that come from a trusted source.
with open("data/Equinor_split_news_content_by_year.pkl","rb") as file:
    equinor_split_news = pickle.load(file)

# Load the embeddings for the same texts. equinor_news_embeddings[year] is what
# gets passed to the clustering and PCA steps below.
# presumably entry i of a year's embeddings belongs to text i of that year - TODO confirm
with open("data/equinor_news_embeddings.pkl","rb") as file:
    equinor_news_embeddings = pickle.load(file)

In [3]:
# Clustering
def cluster(year, num_clusters, random_state=42):
    """Group one year's news texts with K-means on their embeddings.

    Parameters
    ----------
    year : key into `equinor_news_embeddings` / `equinor_split_news`
        (the notebook uses year strings such as "2024").
    num_clusters : int
        Number of K-means clusters to fit.
    random_state : int or None, default 42
        Seed forwarded to `KMeans`. A fixed seed makes repeated calls return
        the same partition and the same label numbering; pass None to get
        scikit-learn's unseeded behaviour.

    Returns
    -------
    clusters : list[list]
        `clusters[k]` holds the texts assigned to label k.
    cluster_assignment : numpy.ndarray
        The fitted model's `labels_`, one label per embedding, in input order.
    """
    clustering_model = KMeans(n_clusters=num_clusters, random_state=random_state)
    clustering_model.fit(equinor_news_embeddings[year])
    cluster_assignment = clustering_model.labels_

    # Bucket the texts by label; position `index` in the embeddings is used
    # to look up the text at the same position for that year.
    clusters = [[] for _ in range(num_clusters)]
    for index, label in enumerate(cluster_assignment):
        clusters[label].append(equinor_split_news[year][index])
    print("Finished clustering")
    return clusters, cluster_assignment

In [4]:
# Find keywords
def find_keywords(num_words, clusters, model_name='paraphrase-MiniLM-L6-v2'):
    """Print and collect the top KeyBERT keyphrase for each cluster.

    Parameters
    ----------
    num_words : int
        Upper bound of the keyphrase n-gram range; phrases of 1..num_words
        words are considered.
    clusters : iterable of lists of str
        The texts of each cluster; a cluster's texts are joined with spaces
        into a single document before extraction.
    model_name : str, default 'paraphrase-MiniLM-L6-v2'
        Embedding model handed to `KeyBERT`. Exposed as a parameter so a
        different model can be tried without redefining the function.

    Returns
    -------
    list[list[str]]
        One single-element list `[keyphrase]` per keyphrase returned by
        KeyBERT (at most one per cluster because `top_n=1`). A cluster for
        which KeyBERT returns nothing contributes no entry.
    """
    kw_model = KeyBERT(model=model_name)
    keywords_list = []
    for i, cluster_texts in enumerate(clusters):
        print(f"Cluster {i}:")
        combined_text = " ".join(cluster_texts)
        keywords = kw_model.extract_keywords(
            combined_text,
            keyphrase_ngram_range=(1, num_words),
            stop_words='english',
            top_n=1,
        )
        for kw in keywords:
            print(f"Keyword: {kw[0]}, Score: {kw[1]}")
            keywords_list.append([kw[0]])
        print("\n")
    return keywords_list

In [5]:
def plot_clusters(year, clusters, cluster_assignment, keywords):
    """Show an interactive 2-D PCA scatter plot of one year's clustered texts.

    Parameters
    ----------
    year : key into `equinor_news_embeddings` / `equinor_split_news`.
    clusters : list
        Only its length is used: one trace is drawn for each index in
        `range(len(clusters))`.
    cluster_assignment : sequence of int
        Cluster label per embedding, in input order. Points labelled -1 are
        left out of the plot.
    keywords : list[list[str]] or None
        `keywords[k][0]` is used in the legend entry of cluster k. When it is
        missing or empty, the legend falls back to "Cluster k".

    Returns
    -------
    None. The figure is displayed with `fig.show()`.
    """
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(equinor_news_embeddings[year])

    # Keep only points that carry a real cluster label (-1 is skipped).
    valid_indices = [i for i, label in enumerate(cluster_assignment) if label != -1]
    filtered_embeddings = reduced_embeddings[valid_indices]
    filtered_texts = [equinor_split_news[year][i] for i in valid_indices]
    filtered_assignments = [cluster_assignment[i] for i in valid_indices]

    df = pd.DataFrame(filtered_embeddings, columns=['PCA1', 'PCA2'])
    df['Cluster'] = filtered_assignments
    df['Text'] = filtered_texts

    # Cycle through the palette so more clusters than colours cannot raise
    # an IndexError.
    colors = ['red', 'blue', 'green', 'purple', 'orange', 'brown', 'pink', 'gray', 'cyan', 'magenta']
    color_map = {i: colors[i % len(colors)] for i in range(len(clusters))}

    # One trace per cluster so each gets its own legend entry.
    fig = go.Figure()

    for cluster_num in range(len(clusters)):
        cluster_data = df[df['Cluster'] == cluster_num]
        # The keyword list may be shorter than the cluster list, or hold an
        # empty entry; use a generic label in both cases.
        has_keyword = bool(keywords) and cluster_num < len(keywords) and bool(keywords[cluster_num])
        keyword = keywords[cluster_num][0] if has_keyword else f'Cluster {cluster_num}'
        fig.add_trace(go.Scatter(
            x=cluster_data['PCA1'],
            y=cluster_data['PCA2'],
            mode='markers',
            marker=dict(color=color_map[cluster_num]),
            name=f'Cluster {cluster_num}: {keyword}',
            text=cluster_data['Text'],
            hoverinfo='text'
        ))

    # Update layout
    fig.update_layout(
        title=f'{year}: Text Clusters with Keywords',
        xaxis_title='PCA Component 1',
        yaxis_title='PCA Component 2',
        legend_title='Cluster Keywords',
        showlegend=True
    )

    fig.show()


## Equinor 2024

In [6]:
# Cluster once and unpack both results, so the grouped texts (used for the
# keywords) and the per-point labels (used for the plot) come from the same fit.
clusters_2024, cluster_assignment_2024 = cluster("2024", 6)

keywords_2024 = find_keywords(1, clusters_2024)


Finished clustering
Finished clustering


In [None]:
# Show the keyword lists currently stored in keywords_2024.
print(keywords_2024)

[['existing infrastructure norwegian'], ['equinor renewables business'], ['safety results equinor'], ['equinor norwegian state'], ['offshore wind farm'], ['nyc offshore wind']]


In [103]:
# Interactive PCA scatter plot of the 2024 clusters, with the keywords in the legend.
plot_clusters("2024",clusters_2024, cluster_assignment_2024,keywords_2024)

## Equinor 2020

In [7]:
# Cluster once and unpack both results, so the grouped texts (used for the
# keywords) and the per-point labels (used for the plot) come from the same fit.
clusters_2020, cluster_assignment_2020 = cluster("2020", 6)

keywords_2020 = find_keywords(3, clusters_2020)


In [105]:
# Interactive PCA scatter plot of the 2020 clusters, with the keywords in the legend.
plot_clusters("2020",clusters_2020, cluster_assignment_2020,keywords_2020)

## Equinor 2008

In [106]:
# Cluster once and unpack both results, so the grouped texts (used for the
# keywords) and the per-point labels (used for the plot) come from the same fit.
clusters_2008, cluster_assignment_2008 = cluster("2008", 6)

keywords_2008 = find_keywords(3, clusters_2008)


Cluster 0:
Keyword: statoilhydro net income, Score: 0.6733


Cluster 1:
Keyword: drill statoilhydro gulf, Score: 0.6251


Cluster 2:
Keyword: statoilhydro share saving, Score: 0.6643


Cluster 3:
Keyword: new cable gjøa, Score: 0.6587


Cluster 4:
Keyword: exploration ncs drilling, Score: 0.5467


Cluster 5:
Keyword: restructuring statoilhydro offshore, Score: 0.5842




In [108]:
# Interactive PCA scatter plot of the 2008 clusters, with the keywords in the legend.
plot_clusters("2008",clusters_2008, cluster_assignment_2008,keywords_2008)

## Equinor 1998

In [109]:
# Cluster once and unpack both results, so the grouped texts (used for the
# keywords) and the per-point labels (used for the plot) come from the same fit.
clusters_1998, cluster_assignment_1998 = cluster("1998", 6)

keywords_1998 = find_keywords(3, clusters_1998)


Cluster 0:
Keyword: azerbaijan ll drilling, Score: 0.569


Cluster 1:
Keyword: drilled statoil nigeria, Score: 0.7058


Cluster 2:
Keyword: drilling near norne, Score: 0.6418


Cluster 3:
Keyword: oil sales asia, Score: 0.7257


Cluster 4:
Keyword: estonia statoil subsidiary, Score: 0.7131


Cluster 5:
Keyword: appeal lodged statoil, Score: 0.5608




In [110]:
# Interactive PCA scatter plot of the 1998 clusters, with the keywords in the legend.
plot_clusters("1998",clusters_1998, cluster_assignment_1998,keywords_1998)

## HDBSCAN 2024

In [5]:
import hdbscan
import plotly.graph_objs as go

def HDB_clusters(year,min_size,limit):
    """Cluster one year's news embeddings with HDBSCAN and group the texts.

    Parameters
    ----------
    year : key into `equinor_news_embeddings` / `equinor_split_news`.
    min_size : int
        Passed to HDBSCAN as `min_cluster_size`.
    limit : int
        Maximum number of clusters to keep. When HDBSCAN finds more, the ones
        with the lowest original labels are kept and the rest are treated as
        noise.

    Returns
    -------
    clusters : list[list]
        Always `limit` lists; `clusters[k]` holds the texts of kept cluster k.
        Trailing lists are empty when fewer than `limit` clusters were found.
    cluster_labels : numpy.ndarray
        One label per embedding: 0..k-1 for kept clusters, -1 for noise and
        for points of clusters beyond `limit`.
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size, min_samples=1)
    raw_labels = clusterer.fit_predict(equinor_news_embeddings[year])

    # HDBSCAN labels noise as -1. np.unique sorts its output, so without this
    # filter -1 would come first and be renumbered to cluster 0, turning the
    # noise points into a fake cluster and using up one of the `limit` slots.
    real_labels = [label for label in np.unique(raw_labels) if label != -1]
    cluster_mapping = {old: new for new, old in enumerate(real_labels[:limit])}
    cluster_labels = np.array(
        [cluster_mapping.get(label, -1) for label in raw_labels], dtype=int
    )

    clusters = [[] for _ in range(limit)]
    for index, label in enumerate(cluster_labels):
        if label != -1:  # Exclude noise and clusters beyond the limit
            # Look the text up in the requested year, not a fixed one.
            clusters[label].append(equinor_split_news[year][index])
    return clusters, cluster_labels


In [6]:
# Keyword extraction with KeyBERT
# NOTE: this redefines `find_keywords` from earlier in the notebook; once this
# cell has run, every later call uses this version and its default model.
def find_keywords(num_words, clusters, model_name='all-MiniLM-L6-v2'):
    """Print and collect the top KeyBERT keyphrase for each cluster.

    Parameters
    ----------
    num_words : int
        Upper bound of the keyphrase n-gram range; phrases of 1..num_words
        words are considered.
    clusters : iterable of lists of str
        The texts of each cluster; a cluster's texts are joined with spaces
        into a single document before extraction.
    model_name : str, default 'all-MiniLM-L6-v2'
        Embedding model handed to `KeyBERT`. Exposed as a parameter so a
        different model can be tried without redefining the function.

    Returns
    -------
    list[list[str]]
        One single-element list `[keyphrase]` per keyphrase returned by
        KeyBERT (at most one per cluster because `top_n=1`). A cluster for
        which KeyBERT returns nothing contributes no entry.
    """
    kw_model = KeyBERT(model=model_name)
    keywords_list = []
    for i, cluster_texts in enumerate(clusters):
        print(f"Cluster {i}:")
        combined_text = " ".join(cluster_texts)
        keywords = kw_model.extract_keywords(
            combined_text,
            keyphrase_ngram_range=(1, num_words),
            stop_words='english',
            top_n=1,
        )
        for kw in keywords:
            print(f"Keyword: {kw[0]}, Score: {kw[1]}")
            keywords_list.append([kw[0]])
        print("\n")
    return keywords_list

In [7]:
# Run the clustering once and unpack both results instead of fitting twice.
HDB_cluster, HDB_cluster_assignment = HDB_clusters("2024", 6, 6)

keywords_2024 = find_keywords(3, HDB_cluster)

: 

: 

In [1]:
# Plot the 2024 HDBSCAN result as an interactive 2-D scatter.
# NOTE(review): this cell needs PCA, pd and go from the import cells, plus
# HDB_cluster, HDB_cluster_assignment and keywords_2024 from the cells above;
# running it on a fresh kernel fails with a NameError.

# PCA for dimensionality reduction (project the embeddings onto 2 components)
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(equinor_news_embeddings["2024"])

# Keep only the points whose label is not -1, i.e. drop the points that were
# not assigned to a kept cluster
valid_indices = [i for i in range(len(HDB_cluster_assignment)) if HDB_cluster_assignment[i] != -1]
filtered_embeddings = reduced_embeddings[valid_indices]
filtered_texts = [equinor_split_news["2024"][i] for i in valid_indices]
filtered_assignments = [HDB_cluster_assignment[i] for i in valid_indices]

# One row per kept point: its 2-D coordinates, cluster label and text
df = pd.DataFrame(filtered_embeddings, columns=['PCA1', 'PCA2'])
df['Cluster'] = filtered_assignments
df['Text'] = filtered_texts

# Create a color map for the clusters; the modulo cycles through the palette
# when there are more clusters than colors
colors = ['red', 'blue', 'green', 'purple', 'orange', 'brown', 'pink', 'gray']
num_clusters = len(HDB_cluster)
color_map = {i: colors[i % len(colors)] for i in range(num_clusters)}

# Assign colors to each cluster. This column is not read again in this cell;
# the traces below take their color from color_map directly.
df['Color'] = df['Cluster'].apply(lambda x: color_map.get(x, 'black'))  # 'black' for any label missing from color_map

# Create the scatter plot
fig = go.Figure()

# One trace per cluster so that each cluster gets its own legend entry
for cluster_num in range(num_clusters):
    cluster_data = df[df['Cluster'] == cluster_num]
    # Fall back to a generic label when the keyword list is shorter than the cluster list
    keyword = keywords_2024[cluster_num][0] if cluster_num < len(keywords_2024) else f'Cluster {cluster_num}'
    fig.add_trace(go.Scatter(
        x=cluster_data['PCA1'],
        y=cluster_data['PCA2'],
        mode='markers',
        marker=dict(color=color_map[cluster_num]),
        name=f'Cluster {cluster_num}: {keyword}',
        text=cluster_data['Text'],
        hoverinfo='text'
    ))

# Update layout
fig.update_layout(
    title='2024',
    xaxis_title='PCA Component 1',
    yaxis_title='PCA Component 2',
    legend_title='Cluster Keywords',
    showlegend=True
)

fig.show()

NameError: name 'PCA' is not defined