In [None]:
# Install required packages.
import os
import torch
# Export the installed torch version as an environment variable so the shell
# commands below can interpolate it as ${TORCH} in the wheel index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
%matplotlib inline

# PyG companion packages; -f points pip at the wheel index page that matches
# the installed torch version.
# NOTE(review): none of these installs pin a package version, so a re-run may
# pull different releases — consider pinning.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q pyg-lib -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself, installed from the GitHub repository rather than a
# tagged release.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git


# Imports that rely on the packages installed above (hence placed after them).
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import glob
from torch_geometric.nn import Node2Vec
from google.colab import drive
from sklearn.manifold import TSNE
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Mount Google Drive; the data paths used later in the notebook live under
# /content/drive.
drive.mount('/content/drive')

2.5.1+cu121
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Directory that holds the edge list, stored as several part-*.csv files.
edges_dir = "/content/drive/My Drive/UChicago/Tesis/node2vec/edges_node2vec_tfidf10/"
# Use glob to find all the part files. glob returns matches in arbitrary
# order, so sort them to make the concatenation order identical on every run.
path_edges = sorted(glob.glob(edges_dir + "part-*.csv"))

# Single CSV with one row per node.
nodes_path = '/content/drive/My Drive/UChicago/Tesis/node2vec/nodes_node2vec_tfidf10.csv'

Create the graph from the data, following the PyTorch Geometric documentation: https://pytorch-geometric.readthedocs.io/en/latest/tutorial/load_csv.html

First, load the nodes into PyTorch objects.

In [None]:
# Adapted from the PyTorch Geometric "load_csv" tutorial.
def load_node_parquet(path, index_col, encoders=None, **kwargs):
    """Read a node table and return its feature tensor and index mapping.

    Despite the name, the file is read with ``pd.read_csv``.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        index_col: Column used as the DataFrame index, i.e. the node label.
        encoders: Optional dict mapping a column name to a callable that
            turns that column into a tensor. The per-column tensors are
            concatenated along the last dimension.
        **kwargs: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        A tuple ``(x, mapping)``. ``x`` is the concatenated feature tensor,
        or ``None`` when no encoders are given. ``mapping`` maps each
        distinct index label to a consecutive integer, in order of first
        appearance.
    """
    node_frame = pd.read_csv(path, index_col=index_col, **kwargs)

    # Number the distinct node labels 0..n-1 in the order they appear.
    distinct_labels = node_frame.index.unique()
    mapping = dict(zip(distinct_labels, range(len(distinct_labels))))

    if encoders is None:
        return None, mapping

    encoded_columns = []
    for column_name, encode in encoders.items():
        encoded_columns.append(encode(node_frame[column_name]))
    return torch.cat(encoded_columns, dim=-1), mapping

# Adapted from the PyTorch Geometric "load_csv" tutorial.
class IdentityEncoder:
    """Turn a column into an ``(n, 1)`` tensor without transforming its values.

    The only processing applied is a cast to ``dtype``.
    """

    def __init__(self, dtype=None):
        # Target dtype handed to ``Tensor.to`` when the encoder is called.
        self.dtype = dtype

    def __call__(self, df):
        """Return the values of ``df`` as a single-column tensor of ``self.dtype``."""
        column = torch.from_numpy(df.values)
        column = column.view(-1, 1)
        return column.to(self.dtype)

# Load the node table: each listed column becomes one feature, passed through
# unchanged and cast to int64.
# NOTE(review): the cast to torch.long truncates any fractional values —
# confirm these columns are integer-valued.
words_x, words_mapping = load_node_parquet(
    nodes_path,
    index_col='node',
    encoders={
        column: IdentityEncoder(dtype=torch.long)
        for column in ('emo_pos', 'emo_anx', 'emo_sad', 'emo_anger', 'moral')
    },
)

#Function from documentation, modified to read multiple csv
def load_edge_csv(src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, paths=None, **kwargs):
    """Build an edge index (and optional edge attributes) from CSV part files.

    Args:
        src_index_col: Column holding the source node label of each edge.
        src_mapping: Dict mapping a source node label to its integer index.
        dst_index_col: Column holding the destination node label of each edge.
        dst_mapping: Dict mapping a destination node label to its integer index.
        encoders: Optional dict mapping a column name to a callable that
            turns that column into a tensor. The per-column tensors are
            concatenated along the last dimension.
        paths: Iterable of CSV files (paths or file-like objects) to read and
            concatenate. Defaults to the module-level ``path_edges`` list,
            which is what this function always read before the parameter
            existed.
        **kwargs: Forwarded unchanged to ``pd.read_csv`` for every file.

    Returns:
        A tuple ``(edge_index, edge_attr)``. ``edge_index`` is an int64
        tensor of shape ``(2, num_edges)``. ``edge_attr`` is the concatenated
        attribute tensor, or ``None`` when no encoders are given.

    Raises:
        ValueError: If there are no files to read.
        KeyError: If an edge references a label missing from its mapping.
    """
    if paths is None:
        # Backward-compatible default: the part files found by the config cell.
        paths = path_edges
    paths = list(paths)
    if not paths:
        raise ValueError("load_edge_csv: no edge CSV files to read")

    df = pd.concat((pd.read_csv(f, **kwargs) for f in paths), ignore_index=True)

    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    # Fix the dtype explicitly so an empty edge list still gives an integer
    # index tensor instead of a float one.
    edge_index = torch.tensor([src, dst], dtype=torch.long)

    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr


# Source and destination labels are both resolved through the same node
# mapping; the 'weight' column is carried along as the edge attribute.
# NOTE(review): the cast to torch.long truncates fractional weights — confirm
# that 'weight' is integer-valued.
edge_index, edge_label = load_edge_csv(
    'node1_norm',
    words_mapping,
    'node2_norm',
    words_mapping,
    encoders=dict(weight=IdentityEncoder(dtype=torch.long)),
)

from torch_geometric.data import Data
from torch_geometric.utils import to_undirected

# Make the edge list undirected, then wrap node features and edges in a Data
# object. Only x and edge_index are attached; edge_label is not passed on.
undirected_edge_index = to_undirected(edge_index)
data = Data(x=words_x, edge_index=undirected_edge_index)

## Train Node2Vec model

In [None]:
# Adapted from the PyTorch Geometric Node2Vec example.
def train():
    """Run one pass over ``loader`` and return the mean batch loss.

    Uses the module-level ``model``, ``loader``, ``optimizer`` and ``device``.
    """
    model.train()
    batch_losses = []
    for positive_walks, negative_walks in loader:
        optimizer.zero_grad()
        batch_loss = model.loss(positive_walks.to(device), negative_walks.to(device))
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    # Average over the number of batches the loader yields.
    return sum(batch_losses) / len(loader)

In [None]:
# Seed PyTorch before the embedding is initialised and the walks are sampled,
# so training is repeatable — the t-SNE and KMeans steps later in the
# notebook already use a fixed random_state of 42.
torch.manual_seed(42)

model = Node2Vec(
    data.edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1.0,
    q=1.0,
    sparse=True,  # sparse embedding gradients, paired with SparseAdam below
).to(device)

loader = model.loader(batch_size=128, shuffle=True)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

# Number of training epochs (same count as the previous range(1, 115)).
NUM_EPOCHS = 114
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')


Epoch: 001, Loss: 9.6302
Epoch: 002, Loss: 8.7561
Epoch: 003, Loss: 7.9589
Epoch: 004, Loss: 7.2244
Epoch: 005, Loss: 6.5272
Epoch: 006, Loss: 5.9150
Epoch: 007, Loss: 5.3974
Epoch: 008, Loss: 4.9157
Epoch: 009, Loss: 4.4864
Epoch: 010, Loss: 4.1276
Epoch: 011, Loss: 3.7837
Epoch: 012, Loss: 3.4934
Epoch: 013, Loss: 3.2496
Epoch: 014, Loss: 3.0248
Epoch: 015, Loss: 2.8203
Epoch: 016, Loss: 2.6461
Epoch: 017, Loss: 2.4930
Epoch: 018, Loss: 2.3617
Epoch: 019, Loss: 2.2431
Epoch: 020, Loss: 2.1367
Epoch: 021, Loss: 2.0453
Epoch: 022, Loss: 1.9705
Epoch: 023, Loss: 1.8985
Epoch: 024, Loss: 1.8364
Epoch: 025, Loss: 1.7798
Epoch: 026, Loss: 1.7331
Epoch: 027, Loss: 1.6920
Epoch: 028, Loss: 1.6531
Epoch: 029, Loss: 1.6233
Epoch: 030, Loss: 1.5943
Epoch: 031, Loss: 1.5715
Epoch: 032, Loss: 1.5500
Epoch: 033, Loss: 1.5320
Epoch: 034, Loss: 1.5146
Epoch: 035, Loss: 1.5001
Epoch: 036, Loss: 1.4887
Epoch: 037, Loss: 1.4780
Epoch: 038, Loss: 1.4679
Epoch: 039, Loss: 1.4589
Epoch: 040, Loss: 1.4511


In [None]:
# Pull the learned node embeddings out of the trained model as a NumPy array.
embeddings = (
    model()      # calling the model with no arguments returns the embeddings
    .detach()    # drop the autograd graph
    .cpu()       # move to host memory so NumPy can read it
    .numpy()
)

In [None]:
# Project the embeddings down to two dimensions with t-SNE; the clustering
# that follows runs on this projection. The seed is fixed for repeatability.
tsne = TSNE(random_state=42, n_components=2)
tsne_result = tsne.fit_transform(embeddings)

In [None]:
# Score KMeans on the t-SNE projection for k = 2..19 and record the mean
# silhouette score of each k, so every printed score is labelled.
silhouette_by_k = {}
for k in range(2, 20):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(tsne_result)
    silhouette_avg = silhouette_score(tsne_result, kmeans.labels_)
    silhouette_by_k[k] = silhouette_avg
    print(f"k={k:2d}  Silhouette Score: {silhouette_avg:.4f}")

# Pick the k with the highest silhouette score instead of hard-coding it, so
# the choice stays in sync with the scores when the notebook is re-run.
# Ties resolve to the smallest k.
best_k = max(silhouette_by_k, key=silhouette_by_k.get)
print(f"Best k by silhouette score: {best_k}")

kmeans = KMeans(n_clusters=best_k, random_state=42).fit(tsne_result)
cluster_labels = kmeans.labels_

Silhouette Score: 0.4637
Silhouette Score: 0.3757
Silhouette Score: 0.3459
Silhouette Score: 0.3455
Silhouette Score: 0.3147
Silhouette Score: 0.3316
Silhouette Score: 0.3354
Silhouette Score: 0.3232
Silhouette Score: 0.3482
Silhouette Score: 0.3291
Silhouette Score: 0.3336
Silhouette Score: 0.3235
Silhouette Score: 0.3244
Silhouette Score: 0.3308
Silhouette Score: 0.3320
Silhouette Score: 0.3281
Silhouette Score: 0.3257
Silhouette Score: 0.3306


In [None]:
# Re-read the node table and attach each node's cluster label by row position.
# NOTE(review): assumes the CSV rows are in the same order as the rows of the
# embedding matrix — TODO confirm.
node_df = pd.read_csv(nodes_path).assign(cluster=cluster_labels)

# One list of node labels per cluster id.
clusters = node_df.groupby('cluster')['node'].apply(list)

# Display nodes in each cluster
for cluster_id, nodes in clusters.items():
    print(f"Cluster {cluster_id}:", nodes, "", sep="\n")

Cluster 0:
['travel', 'hope', 'slow', 'ready', 'graduate', 'worried', 'instagram', 'crazy', 'dopamine', 'ahead', 'highly', 'extra', 'affect', 'anyways', 'tend', 'childhood', 'appreciate', 'shame', 'field', 'promise', 'account', 'somehow', 'regardless', 'apps', 'constant', 'hopefully', 'rid', 'ruin', 'luck', 'shape', 'serious', 'insecure', 'wife', 'touch', 'consistently', 'option', 'horrible', 'solve', 'honestly', 'unhealthy', 'kinda', 'ex', 'workout', 'stick', 'suck', 'exact', 'lol', 'gain', 'forget', 'fine', 'none', 'totally', 'bro', 'op', 'suggest', 'respond', 'stay', 'lost', 'skin', 'education', 'ignore', 'recently', 'girlfriend', 'balance', 'boundaries', 'main', 'mention', 'per', 'soon', 'please', 'safe', 'hear', 'program', 'boring', 'sick', 'average', 'friendships', 'mood', 'yeah', 'major', 'tv', 'anymore', 'heres', 'bear', 'count', 'dude', 'personally', 'content', 'mine', 'doubt', 'handle', 'amazing', 'influence', 'setting', 'super', 'relationship', 'sad', 'plus', 'pressure', 'pa

Cluster 0:
['travel', 'hope', 'slow', 'ready', 'graduate', 'worried', 'instagram', 'crazy', 'dopamine', 'ahead', 'highly', 'extra', 'affect', 'anyways', 'tend', 'childhood', 'appreciate', 'shame', 'field', 'promise', 'account', 'somehow', 'regardless', 'apps', 'constant', 'hopefully', 'rid', 'ruin', 'luck', 'shape', 'serious', 'insecure', 'wife', 'touch', 'consistently', 'option', 'horrible', 'solve', 'honestly', 'unhealthy', 'kinda', 'ex', 'workout', 'stick', 'suck', 'exact', 'lol', 'gain', 'forget', 'fine', 'none', 'totally', 'bro', 'op', 'suggest', 'respond', 'stay', 'lost', 'skin', 'education', 'ignore', 'recently', 'girlfriend', 'balance', 'boundaries', 'main', 'mention', 'per', 'soon', 'please', 'safe', 'hear', 'program', 'boring', 'sick', 'average', 'friendships', 'mood', 'yeah', 'major', 'tv', 'anymore', 'heres', 'bear', 'count', 'dude', 'personally', 'content', 'mine', 'doubt', 'handle', 'amazing', 'influence', 'setting', 'super', 'relationship', 'sad', 'plus', 'pressure', 'passion', 'uncomfortable', 'responsibility', 'head', 'hair', 'therapy', 'mess', 'whenever', 'upset', 'edit', 'fun', 'contact', 'physically', 'resource', 'youtube', 'somewhere', 'immediately', 'deserve', 'useful', 'general', 'actively', 'journaling', 'definitely', 'otherwise', 'nature', 'active', 'wait', 'forever', 'cycle', 'admit', 'excuse', 'certainly', 'clothes', 'mentally', 'circumstance', 'depend', 'describe', 'relate', 'depressed', 'train', 'remove', 'remind', 'boyfriend', 'proud', 'completely', 'adult', 'ass', 'obviously', 'far', 'catch', 'history', 'comment', 'app', 'waking', 'regret', 'tough', 'wish', 'miserable', 'offer', 'necessary', 'clearly', 'dark', 'basic', 'fulfil', 'wonder', 'public', 'student', 'truth', 'difference', 'bother', 'slowly', 'joke', 'whole', 'absolutely', 'wake', 'quickly', 'overall', 'zone', 'possibly', 'hobby', 'surround', 'actual', 'peace', 'anxious', 'similar', 'entire', 'blame', 'relax', 'moving', 'country', 'switch', 'badly', 'win', 'helpful', 'necessarily', 
'reply', 'return', 'jump', 'sort', 'save', 'young', 'sorry', 'ugly', 'reflect', 'send', 'explore', 'dream', 'caring', 'therapist', 'decent', 'encourage', 'missing', 'impossible', 'broke', 'fill', 'period', 'hang', 'attractive', 'loss', 'cry', 'terrible', 'picture', 'male', 'sound', 'shower', 'fake', 'generally', 'exercising', 'acknowledge', 'naturally', 'genuinely', 'random', 'suggestion', 'shitty', 'position', 'finish', 'walk', 'esteem', 'middle', 'weird', 'continue', 'trouble', 'search', 'calm', 'club', 'several', 'lift', 'feed', 'computer', 'delete', 'unless', 'consistent', 'diet', 'connect', 'professional', 'quick', 'waiting', 'carry', 'sign', 'connection', 'regular', 'walking', 'putting', 'incredibly', 'awesome', 'fully', 'direction', 'idk', 'nobody', 'deep', 'mentioned', 'currently', 'agree', 'prove', 'abuse', 'marry', 'fitness', 'standard', 'attitude', 'text', 'beat', 'smile', 'discover', 'regularly', 'brother', 'especially', 'ton', 'define', 'laugh', 'fit', 'repeat', 'suppose', 'busy', 'although', 'addicted', 'struggled', 'half', 'barely', 'recommend', 'meaningful', 'hobbies', 'anyway', 'line', 'open', 'emotionally', 'ultimately', 'lonely', 'alcohol', 'kill', 'funny', 'meditate', 'hold', 'doctor', 'awkward', 'multiple', 'pretty', 'unfortunately', 'experienced', 'assume', 'socially', 'means', 'cool', 'version', 'google', 'aspect', 'guess', 'pull', 'seriously', 'straight', 'damn', 'weed', 'related', 'glad', 'lately', 'plenty', 'bunch']

Cluster 1:
['online', 'recognize', 'inner', 'often', 'conversation', 'productive', 'include', 'growth', 'explain', 'achieve', 'watch', 'perspective', 'character', 'trauma', 'leave', 'grow', 'sense', 'basically', 'happen', 'space', 'whether', 'two', 'lazy', 'present', 'lack', 'anyone', 'hit', 'purpose', 'college', 'act', 'understanding', 'grateful', 'university', 'always', 'set', 'progress', 'toxic', 'buy', 'perfect', 'joy', 'healthy', 'fact', 'name', 'low', 'certain', 'show', 'fat', 'accomplish', 'extremely', 'sure', 'process', 'lose', 'partner', 'pattern', 'capable', 'hand', 'phone', 'attract', 'power', 'success', 'everything', 'journal', 'couple', 'exactly', 'girl', 'exercise', 'man', 'afraid', 'choice', 'back', 'sit', 'opportunity', 'community', 'allow', 'limit', 'research', 'add', 'eat', 'deal', 'lesson', 'face', 'else', 'grade', 'apply', 'struggle', 'level', 'stand', 'bring', 'rather', 'stress', 'motivated', 'friend', 'develop', 'system', 'difficult', 'top', 'guide', 'porn', 'body', 'desire', 'money', 'example', 'worth', 'kind', 'reach', 'response', 'fix', 'accept', 'impact', 'maintain', 'actually', 'speak', 'attempt', 'area', 'study', 'physical', 'become', 'normal', 'tell', 'human', 'read', 'task', 'say', 'group', 'food', 'angry', 'energy', 'method', 'saying', 'quite', 'woman', 'future', 'cause', 'already', 'treat', 'story', 'specific', 'light', 'shit', 'course', 'drug', 'great', 'left', 'lifestyle', 'instead', 'fast', 'eye', 'result', 'create', 'answer', 'parent', 'waste', 'throw', 'decide', 'path', 'reason', 'honest', 'water', 'hate', 'interest', 'source', 'job', 'motivation', 'close', 'school', 'project', 'realize', 'miss', 'sport', 'notice', 'natural', 'family', 'free', 'pain', 'however', 'gym', 'mother', 'figure', 'successful', 'routine', 'perhaps', 'wrong', 'engage', 'late', 'aware', 'practice', 'enough', 'suffer', 'check', 'prepare', 'move', 'mistakes', 'end', 'remember', 'turn', 'trust', 'emotion', 'enjoy', 'provide', 'spend', 'confident', 'real', 'expect', 
'party', 'live', 'view', 'front', 'track', 'room', 'usually', 'challenge', 'support', 'approach', 'whatever', 'city', 'able', 'probably', 'full', 'willing', 'personality', 'music', 'media', 'possible', 'social', 'break', 'respect', 'reduce', 'teach', 'drive', 'failure', 'poor', 'long', 'forward', 'thought', 'movie', 'second', 'choose', 'sometimes', 'rest', 'journey', 'receive', 'hurt', 'anger', 'avoid', 'note', 'current', 'expectation', 'addiction', 'activity', 'find', 'due', 'tool', 'finally', 'discipline', 'date', 'together', 'strong', 'consider', 'fear', 'write', 'comfort', 'ask', 'meal', 'see', 'health', 'high', 'internet', 'age', 'language', 'writing', 'confidence', 'order', 'quit', 'depression', 'word', 'simply', 'kid', 'potential', 'cold', 'skill', 'happiness', 'stupid', 'improve', 'attention', 'plan', 'piece', 'fight', 'clear', 'talk', 'degree', 'never', 'video', 'effort', 'type', 'follow', 'scared', 'list', 'succeed', 'die', 'fall', 'single', 'lie', 'fail', 'voice', 'decided', 'schedule', 'class', 'behavior', 'child', 'force', 'build', 'three', 'god', 'gone', 'meditation', 'need', 'important', 'away', 'pursue', 'amount', 'training', 'guy', 'look', 'personal', 'mindset', 'meet', 'part', 'clean', 'benefit', 'eventually', 'fault', 'moment', 'happy', 'case', 'rule', 'made', 'exist', 'constantly', 'call', 'okay', 'solution', 'state', 'coffee', 'literally', 'dad', 'judge', 'ability', 'house', 'bed', 'common', 'key', 'concept', 'mom', 'lead', 'development', 'strength', 'least', 'try', 'heart', 'easy', 'sleep', 'cut', 'comfortable', 'feeling', 'chance', 'worry', 'increase', 'tip', 'understand', 'opinion', 'run', 'come', 'short', 'home', 'interesting', 'career', 'easily', 'company', 'listen', 'begin', 'alone', 'identify', 'value', 'working', 'mental', 'smart', 'nice', 'seek', 'event', 'car', 'early', 'drink', 'either', 'pick', 'play', 'emotional', 'base', 'business', 'game', 'thinking', 'learn', 'overcome', 'knowledgeable', 'drop', 'push', 'muscle', 'information', 
'meaning', 'another', 'though', 'simple', 'sex', 'message', 'side', 'environment', 'wear', 'tired', 'large', 'give', 'imagine', 'number', 'first', 'truly', 'mind', 'interested', 'society', 'belief', 'believe', 'require', 'test', 'reality', 'pay', 'term', 'true', 'seem', 'memory', 'weight', 'butt', 'last', 'place', 'individual', 'little', 'complete', 'pass', 'book', 'quality', 'brain', 'decision', 'effect', 'share', 'risk', 'ever', 'keep', 'manage', 'eating', 'anxiety', 'less', 'stop', 'making', 'almost', 'likely', 'big', 'smoke', 'beautiful', 'huge', 'fuck', 'compare', 'living']