In [15]:
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd
import seaborn as sn
import plotly.express as px

In [3]:
data = pd.read_csv("data/liar_all.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,statementID,label,statement,subjects,speaker,title,state,affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context,subset
0,0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,train
1,1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,train
2,2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,train
3,3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,train
4,4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,train


In [4]:
flat_embeddings = np.load("data/liar_embeddings.npy")
num_cols = int(flat_embeddings.size / len(data))
sentence_embeddings = flat_embeddings.reshape(len(data), int(num_cols))
sentence_embeddings.shape

(11507, 384)

In [18]:
tsne = TSNE(n_components=2, verbose=1, metric='cosine')
data_embedded = tsne.fit_transform(sentence_embeddings)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 11507 samples in 0.016s...
[t-SNE] Computed neighbors for 11507 samples in 9.941s...
[t-SNE] Computed conditional probabilities for sample 1000 / 11507
[t-SNE] Computed conditional probabilities for sample 2000 / 11507
[t-SNE] Computed conditional probabilities for sample 3000 / 11507
[t-SNE] Computed conditional probabilities for sample 4000 / 11507
[t-SNE] Computed conditional probabilities for sample 5000 / 11507
[t-SNE] Computed conditional probabilities for sample 6000 / 11507
[t-SNE] Computed conditional probabilities for sample 7000 / 11507
[t-SNE] Computed conditional probabilities for sample 8000 / 11507
[t-SNE] Computed conditional probabilities for sample 9000 / 11507
[t-SNE] Computed conditional probabilities for sample 10000 / 11507
[t-SNE] Computed conditional probabilities for sample 11000 / 11507
[t-SNE] Computed conditional probabilities for sample 11507 / 11507
[t-SNE] Mean sigma: 0.199375
[t-SNE] KL divergence

In [19]:
df_tsne = pd.DataFrame()
df_tsne["label"] = data.label
df_tsne["dimension1"] = data_embedded[:, 0]
df_tsne["dimension2"] = data_embedded[:, 1]

In [20]:
df_tsne

Unnamed: 0,label,dimension1,dimension2
0,false,-11.004107,-43.589329
1,half-true,47.077896,52.223660
2,mostly-true,55.846539,8.315331
3,false,-21.848574,-37.472401
4,half-true,-14.286948,77.675827
...,...,...,...
11502,half-true,-67.312508,8.926132
11503,barely-true,3.530910,-29.527868
11504,barely-true,65.839058,-2.080461
11505,barely-true,58.326298,45.442997


In [21]:
fig = px.scatter(df_tsne, x="dimension1", y="dimension2", color="label")
fig.show()

In [22]:
pca = PCA(n_components=2)
pca.fit(sentence_embeddings)
data_pca = pca.transform(sentence_embeddings)

In [23]:
df_PCA = pd.DataFrame()
df_PCA["label"] = data.label
df_PCA["dimension1"] = data_pca[:, 0]
df_PCA["dimension2"] = data_pca[:, 1]

In [24]:
fig = px.scatter(df_PCA, x="dimension1", y="dimension2", color="label")
fig.show()