# Embedding space

This document will explore the training and analysis of our embedding space.

In [None]:
%pip install scipy==1.10.0
%pip install gensim
%pip install numpy
%pip install pandas
%pip install scikit-learn
%pip install matplotlib
%pip install pacmap

In [None]:
import csv
import pandas as pd
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import numpy as np
from matplotlib import pyplot as plt



In [None]:
# Load the journeys: every CSV row becomes one list of station names.
# newline='' is what the csv module expects so that it can handle line breaks
# inside quoted fields itself; the explicit encoding keeps non-ASCII station
# names (e.g. 'Vanløse St.') independent of the platform's default codec.
# NOTE(review): assumes the file was written as UTF-8 — confirm with the exporter.
with open('../Data/sequences.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    sequences = list(reader)

# Model

In [None]:
# Baseline model trained for a single epoch.
# The corpus is deliberately NOT passed to the constructor: when `sentences` is
# supplied there, gensim builds the vocabulary and trains right away using its
# default epoch count, so the explicit build_vocab()/train() calls below would
# run a second time on an already-trained model and the result would not be
# the 1-epoch model this cell is meant to produce.
model = Word2Vec(
    vector_size=300,  # Number of dimensions of each station embedding.
    min_count=10,     # Stations occurring fewer than 10 times are left out of the embedding. TODO: revisit this threshold.
    workers=4,        # Number of worker threads used for training.
)

# Scan the journeys once to collect the vocabulary (the unique stations).
model.build_vocab(sequences)
# corpus_count is set by build_vocab() above, so it matches `sequences`.
model.train(sequences, total_examples=model.corpus_count, epochs=1)


In [None]:
# Show the raw embedding array stored on the baseline model's keyed vectors.
model.wv.vectors

# Vanløse test case

In [None]:

from collections import Counter

# Tally how often every other station shows up in journeys that contain
# 'Vanløse St.'; the target station itself is kept out of the tally.
station_counter = Counter()
for journey in sequences:
    if 'Vanløse St.' not in journey:
        continue
    station_counter.update(stop for stop in journey if stop != 'Vanløse St.')

# (station, count) pairs ordered from the highest count to the lowest.
sorted_stations = station_counter.most_common()

# Only stations counted more than 7 times are kept for labelling the plot.
list_of_stations_with_vanløse = [name for name, hits in sorted_stations if hits > 7]

# Print every station with its count, most frequent first.
for station, count in sorted_stations:
    print(f"{station}: {count}")


In [None]:
# Stations counted more than 7 times in journeys containing 'Vanløse St.'.
list_of_stations_with_vanløse

In [None]:
from matplotlib import pyplot as plt


def tsne_plot(model):
    """Project the model's station embeddings to 2-D with t-SNE and plot them.

    Every key in ``model.wv.index_to_key`` is drawn as one point. Only the
    stations contained in the notebook-level ``list_of_stations_with_vanløse``
    are annotated with their name.

    Args:
        model: Trained gensim ``Word2Vec`` model; its ``wv`` keyed vectors
            supply both the vocabulary and the embedding vectors.
    """
    stations = list(model.wv.index_to_key)
    tokens = np.array([model.wv[station] for station in stations])

    # Set lookup instead of scanning the whole list once per vocabulary entry.
    highlighted = set(list_of_stations_with_vanløse)

    # NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
    # `max_iter`; update the keyword if the installed version rejects it.
    tsne_model = TSNE(perplexity=5, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(18, 18))
    for station, (x, y) in zip(stations, new_values):
        # One scatter call per point keeps the per-point colour cycling.
        plt.scatter(x, y)

        # Unlabelled points used to receive an annotation with empty text;
        # skipping them draws the same figure without the extra artists.
        if station in highlighted:
            plt.annotate(station,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
    plt.show()

tsne_plot(model)

# København H (Metro) Test

In [None]:

from collections import Counter

# Count, across all journeys containing 'København H (Metro)', how often each
# of the other stations appears (the target station itself is not counted).
station_counter_kbh_metro = Counter(
    stop
    for journey in sequences
    if 'København H (Metro)' in journey
    for stop in journey
    if stop != 'København H (Metro)'
)

# (station, count) pairs, highest count first.
sorted_stations_kbh = station_counter_kbh_metro.most_common()

# Stations counted more than 7 times are the ones labelled in the plot.
list_of_stations_with_kbh_metro = [name for name, hits in sorted_stations_kbh if hits > 7]

# Print every station with its count, most frequent first.
for station, count in sorted_stations_kbh:
    print(f"{station}: {count}")


In [None]:
# Stations counted more than 7 times in journeys containing 'København H (Metro)'.
list_of_stations_with_kbh_metro

In [None]:
def tsne_plot(model):
    """Project the model's station embeddings to 2-D with t-SNE and plot them.

    Every key in ``model.wv.index_to_key`` is drawn as one point. Only the
    stations contained in the notebook-level ``list_of_stations_with_kbh_metro``
    are annotated with their name.

    Args:
        model: Trained gensim ``Word2Vec`` model; its ``wv`` keyed vectors
            supply both the vocabulary and the embedding vectors.
    """
    stations = list(model.wv.index_to_key)
    tokens = np.array([model.wv[station] for station in stations])

    # Set lookup instead of scanning the whole list once per vocabulary entry.
    highlighted = set(list_of_stations_with_kbh_metro)

    # NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
    # `max_iter`; update the keyword if the installed version rejects it.
    tsne_model = TSNE(perplexity=5, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(18, 18))
    for station, (x, y) in zip(stations, new_values):
        # One scatter call per point keeps the per-point colour cycling.
        plt.scatter(x, y)

        # Unlabelled points used to receive an annotation with empty text;
        # skipping them draws the same figure without the extra artists.
        if station in highlighted:
            plt.annotate(station,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
    plt.show()

tsne_plot(model)

# General

Reference: ```https://towardsdatascience.com/visualizing-your-embeddings-4c79332581a9```
#### Embedding space

***Mathematical intuition***: Given two points $x_i$ and $x_j$, the farther apart they are, the larger their distance $d_{j|i}$, the higher their dissimilarity, and the lower the probability that they will consider each other neighbors.

***Key concept***: the further away two embeddings are in the space, the more dissimilar they are.

#### Perplexity
***Mathematical intuition***: The higher the perplexity, the more likely it is to consider points that are far away as neighbors.

***Advice***: The authors of SNE and t-SNE (yes, t-SNE has perplexity as well) use perplexity values between 5 and 50.

# More trained model

In [None]:
# Same configuration as the baseline model, but trained for 1000 epochs.
# The corpus is deliberately NOT passed to the constructor: when `sentences` is
# supplied there, gensim builds the vocabulary and trains right away using its
# default epoch count, making the explicit build_vocab()/train() calls below a
# second pass over an already-trained model.
model_new = Word2Vec(
    vector_size=300,  # Number of dimensions of each station embedding.
    min_count=10,     # Stations occurring fewer than 10 times are left out of the embedding. TODO: revisit this threshold.
    workers=4,        # Number of worker threads used for training.
)

# Scan the journeys once to collect the vocabulary (the unique stations).
model_new.build_vocab(sequences)
# Use this model's own corpus_count (set by build_vocab above) rather than the
# attribute of the separately built `model`, so the cell does not depend on
# another cell having been run first.
model_new.train(sequences, total_examples=model_new.corpus_count, epochs=1000)

In [None]:
# Show the raw embedding array stored on the 1000-epoch model's keyed vectors.
model_new.wv.vectors

In [None]:
def tsne_plot(model):
    """Project the model's station embeddings to 2-D with t-SNE and plot them.

    Every key in ``model.wv.index_to_key`` is drawn as one point. Only the
    stations contained in the notebook-level ``list_of_stations_with_kbh_metro``
    are annotated with their name.

    Args:
        model: Trained gensim ``Word2Vec`` model; its ``wv`` keyed vectors
            supply both the vocabulary and the embedding vectors.
    """
    # Read from the `model` argument; the previous body ignored it and always
    # used the global `model_new`, regardless of what the caller passed in.
    stations = list(model.wv.index_to_key)
    tokens = np.array([model.wv[station] for station in stations])

    # Set lookup instead of scanning the whole list once per vocabulary entry.
    highlighted = set(list_of_stations_with_kbh_metro)

    # NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
    # `max_iter`; update the keyword if the installed version rejects it.
    tsne_model = TSNE(perplexity=50, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(18, 18))
    for station, (x, y) in zip(stations, new_values):
        # One scatter call per point keeps the per-point colour cycling.
        plt.scatter(x, y)

        # Unlabelled points used to receive an annotation with empty text;
        # skipping them draws the same figure without the extra artists.
        if station in highlighted:
            plt.annotate(station,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
    plt.show()

tsne_plot(model_new)

# Testing using PaCMAP


In [None]:
%pip install pacmap

# Visualizing with the model of all CPH journeys, trained for 5000 epochs

In [None]:
# Load a previously saved Word2Vec model from disk instead of retraining it here.
# NOTE(review): gensim's save/load format is pickle-based — only load model
# files that come from a trusted source.
model_all = Word2Vec.load("../Data/word2vec_epoch_5000.model")

## PaCMAP

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import pacmap

# Station names known to the pre-trained `model_all` Word2Vec model.
station_names = model_all.wv.index_to_key

# Tag every station by whether its name mentions "metro" (case-insensitive).
labels = ["Metro" if "metro" in name.lower() else "Non-Metro" for name in station_names]

# Turn the string tags into integers so they can drive the colour map.
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit(labels).transform(labels)

# Embedding array of the model, used as the PaCMAP input.
X = model_all.wv.vectors

# Reduce the embeddings to two dimensions with PaCMAP, starting from a PCA initialisation.
embedding = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0)
X_transformed = embedding.fit_transform(X, init="pca")

# Scatter plot of the 2-D embedding, coloured by the Metro / Non-Metro tag.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
scatter = ax.scatter(
    X_transformed[:, 0],
    X_transformed[:, 1],
    c=numeric_labels,
    cmap='coolwarm',
    s=0.6,
)
plt.colorbar(scatter, ax=ax)

plt.show()


## T-SNE

In [None]:
def tsne_plot(model):
    """Project the model's station embeddings to 2-D with t-SNE and plot them.

    Every key in ``model.wv.index_to_key`` is drawn as one point; no point is
    annotated in this variant.

    Args:
        model: Trained gensim ``Word2Vec`` model; its ``wv`` keyed vectors
            supply both the vocabulary and the embedding vectors.
    """
    tokens = np.array([model.wv[word] for word in model.wv.index_to_key])

    # NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
    # `max_iter`; update the keyword if the installed version rejects it.
    tsne_model = TSNE(perplexity=50, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(18, 18))
    for x, y in new_values:
        # One scatter call per point keeps the per-point colour cycling.
        plt.scatter(x, y)
    plt.show()

tsne_plot(model_all)