In [6]:
!pip install tensorflow==2.11.0 

Collecting keras<2.12,>=2.11.0 (from tensorflow==2.11.0)
  Using cached keras-2.11.0-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
Installing collected packages: keras
Successfully installed keras-2.11.0


In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_from_disk
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
# Needs Keras (raises ImportError without it); kept last so the imports above still load.
from umap.parametric_umap import ParametricUMAP


  warn("""The umap.parametric_umap package requires Keras >= 3 to be installed.""")


ImportError: umap.parametric_umap requires Keras

### Read the dataset

In [None]:
# Dataset directory on disk, named once so the location is easy to spot and change.
hidden_ds_path = "/data3/mmendieta/Violence_data/geo_corpus.0.0.1_datasets_hidden_small_labse"
violence_hidden = load_from_disk(hidden_ds_path)

In [None]:
# Bare expression: the notebook displays the repr of the object loaded above.
violence_hidden

In [None]:
# Pick a fixed-size subset of the training split: shuffle with a fixed seed, then keep
# the first N_TRAIN_SAMPLES rows.
# NOTE(review): the previous comment said "1M observations", but the code selects
# 1,000 rows; raise N_TRAIN_SAMPLES if the larger subset is the one intended.
N_TRAIN_SAMPLES = 1_000
train_ds = violence_hidden["train"].shuffle(seed=42).select(range(N_TRAIN_SAMPLES))

In [None]:
# Bare expression: the notebook displays the repr of the selected training subset.
train_ds

### Creating a feature matrix

In [None]:
# This code took almost 20 min to run
%time X_train = np.array(train_ds["hidden_state"])

y_train_pre7_10 = np.array(train_ds["pre7geo10"])
y_train_pre7_30 = np.array(train_ds["pre7geo30"])
y_train_pre7_50 = np.array(train_ds["pre7geo50"])
y_train_post7_10 = np.array(train_ds["post7geo10"])
y_train_post7_30 = np.array(train_ds["post7geo30"])
y_train_post7_50 = np.array(train_ds["post7geo50"])

X_train.shape

### Visualize the training set

In [None]:
# Scale features to [0,1] range
X_scaled = MinMaxScaler().fit_transform(X_train)
# Initialize and fit UMAP
%time mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)


In [None]:
# Build the 2D-embedding frame (columns "X" and "Y") and attach every label array
# as its own column in a single chained expression.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"]).assign(
    pre7geo10=y_train_pre7_10,
    pre7geo30=y_train_pre7_30,
    pre7geo50=y_train_pre7_50,
    post7geo10=y_train_post7_10,
    post7geo30=y_train_post7_30,
    post7geo50=y_train_post7_50,
)
# Last expression of the cell: preview the first rows.
df_emb.head()

In [None]:
# Side-by-side hexbin density of the 2D embedding, one panel per value of the label.
label_col = "pre7geo10"  # named once so the figure title and the filter cannot drift apart
cmaps = ["Blues", "Oranges"]
labels = [0, 1]

fig, axes = plt.subplots(1, 2, figsize=(5, 3))
fig.suptitle(label_col)

for ax, label, cmap in zip(axes, labels, cmaps):
    # Filter on the label value itself rather than on the loop position, so each
    # panel's content always matches its title even if `labels` is changed.
    df_emb_sub = df_emb[df_emb[label_col] == label]
    ax.hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
              gridsize=20, linewidths=(0,))
    ax.set_title(label)
    # Hide the tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side hexbin density of the 2D embedding, one panel per value of the label.
label_col = "pre7geo30"  # named once so the figure title and the filter cannot drift apart
cmaps = ["Blues", "Oranges"]
labels = [0, 1]

fig, axes = plt.subplots(1, 2, figsize=(5, 3))
fig.suptitle(label_col)

for ax, label, cmap in zip(axes, labels, cmaps):
    # Filter on the label value itself rather than on the loop position, so each
    # panel's content always matches its title even if `labels` is changed.
    df_emb_sub = df_emb[df_emb[label_col] == label]
    ax.hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
              gridsize=20, linewidths=(0,))
    ax.set_title(label)
    # Hide the tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side hexbin density of the 2D embedding, one panel per value of the label.
label_col = "pre7geo50"  # named once so the figure title and the filter cannot drift apart
cmaps = ["Blues", "Oranges"]
labels = [0, 1]

fig, axes = plt.subplots(1, 2, figsize=(5, 3))
fig.suptitle(label_col)

for ax, label, cmap in zip(axes, labels, cmaps):
    # Filter on the label value itself rather than on the loop position, so each
    # panel's content always matches its title even if `labels` is changed.
    df_emb_sub = df_emb[df_emb[label_col] == label]
    ax.hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
              gridsize=20, linewidths=(0,))
    ax.set_title(label)
    # Hide the tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side hexbin density of the 2D embedding, one panel per value of the label.
label_col = "post7geo10"  # named once so the figure title and the filter cannot drift apart
cmaps = ["Greens", "Reds"]
labels = [0, 1]

fig, axes = plt.subplots(1, 2, figsize=(5, 3))
fig.suptitle(label_col)

for ax, label, cmap in zip(axes, labels, cmaps):
    # Filter on the label value itself rather than on the loop position, so each
    # panel's content always matches its title even if `labels` is changed.
    df_emb_sub = df_emb[df_emb[label_col] == label]
    ax.hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
              gridsize=20, linewidths=(0,))
    ax.set_title(label)
    # Hide the tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side hexbin density of the 2D embedding, one panel per value of the label.
label_col = "post7geo30"  # named once so the figure title and the filter cannot drift apart
cmaps = ["Greens", "Reds"]
labels = [0, 1]

fig, axes = plt.subplots(1, 2, figsize=(5, 3))
fig.suptitle(label_col)

for ax, label, cmap in zip(axes, labels, cmaps):
    # Filter on the label value itself rather than on the loop position, so each
    # panel's content always matches its title even if `labels` is changed.
    df_emb_sub = df_emb[df_emb[label_col] == label]
    ax.hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
              gridsize=20, linewidths=(0,))
    ax.set_title(label)
    # Hide the tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side hexbin density of the 2D embedding, one panel per value of the label.
label_col = "post7geo50"  # named once so the figure title and the filter cannot drift apart
cmaps = ["Greens", "Reds"]
labels = [0, 1]

fig, axes = plt.subplots(1, 2, figsize=(5, 3))
fig.suptitle(label_col)

for ax, label, cmap in zip(axes, labels, cmaps):
    # Filter on the label value itself rather than on the loop position, so each
    # panel's content always matches its title even if `labels` is changed.
    df_emb_sub = df_emb[df_emb[label_col] == label]
    ax.hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
              gridsize=20, linewidths=(0,))
    ax.set_title(label)
    # Hide the tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
def reduced_dimensionality(train_ds, field_name, label_col='key'):
    print(f"Starting dimensionality reduction for: {field_name}")
    
    # Step 0: Convert to Numpy
    X = np.array(train_ds[field_name])
    y = np.