# Okay now with ISEAR???

## 1. Loading ISEAR

In [2]:
import os

import numpy as np
import pandas as pd

# Load ISEAR dataset
# The CSV location can be overridden with the ISEAR_CSV_PATH environment
# variable so the notebook is not tied to a single machine; the original
# absolute path is kept as the fallback, so existing setups keep working.
DATA_PATH = os.environ.get(
    "ISEAR_CSV_PATH",
    '/home/pes1ug22am100/Documents/Research and Experimentation/Learning-Machine-Unlearning/eng_dataset.csv',
)
data = pd.read_csv(DATA_PATH)

# Map emotions to binary labels (positive: 1, negative: 0)
positive_emotions = ["joy"]
negative_emotions = ["fear", "anger", "sadness", "disgust", "shame", "guilt"]

# Work on a copy so `data` keeps exactly the columns that were read from disk.
df = data.copy()
# Vectorised membership test: 1 for a sentiment listed in positive_emotions,
# 0 for everything else (including values that appear in neither list).
df["label"] = df["sentiment"].isin(positive_emotions).astype("int64")

In [3]:
# Preview the first rows of the raw frame exactly as read from the CSV.
data.head()

Unnamed: 0,ID,sentiment,content
0,10941,anger,At the point today where if someone says somet...
1,10942,anger,@CorningFootball IT'S GAME DAY!!!! T MIN...
2,10943,anger,This game has pissed me off more than any othe...
3,10944,anger,@spamvicious I've just found out it's Candice ...
4,10945,anger,@moocowward @mrsajhargreaves @Melly77 @GaryBar...


In [4]:
# Preview the labelled copy: same columns as `data` plus the binary "label".
df.head()

Unnamed: 0,ID,sentiment,content,label
0,10941,anger,At the point today where if someone says somet...,0
1,10942,anger,@CorningFootball IT'S GAME DAY!!!! T MIN...,0
2,10943,anger,This game has pissed me off more than any othe...,0
3,10944,anger,@spamvicious I've just found out it's Candice ...,0
4,10945,anger,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,0


In [5]:
# Show how many rows fall into each binary class, to make any class imbalance
# visible before training.
print("\nLabel distribution:")
print(df["label"].value_counts())


Label distribution:
label
0    5486
1    1616
Name: count, dtype: int64


## 2. Preprocess Data

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize sentences
# The tokenizer vocabulary is fitted on every row of df["content"]; the same
# texts are then converted to lists of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["content"])
sequences = tokenizer.texts_to_sequences(df["content"])

# Pad sequences to a fixed length
# padding='post' appends zeros after the real tokens, so every row of X has
# exactly max_sequence_length entries.
max_sequence_length = 50
X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Labels
# Row i of y is the label of row i of X (both are built from df in row order).
y = np.array(df["label"])

# Print example preprocessed data
print("\nExample preprocessed sequences:")
print(X[:3])

2025-02-02 14:48:18.107130: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-02 14:48:18.118069: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738487898.129738   27815 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738487898.134248   27815 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-02 14:48:18.145283: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr


Example preprocessed sequences:
[[  23    1  438   89  190   33  113  369  140 8045  490    2   15    4
  8046   42  491   41    7   10  679    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [4214   34  211   59  567 1822 1284  650  479    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [  18  211   72 1182   15   99   67  116  146  212  211   18  275   10
   801    6  465   61    2  370   11   99 8047    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]


## 3. RNN time hehehe

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Define RNN model
def build_rnn_model(vocab_size, embedding_dim, max_sequence_length):
    """Build and compile a small recurrent binary classifier.

    The network is an Embedding layer, a 64-unit SimpleRNN with relu
    activation, and a single sigmoid output unit, compiled with the Adam
    optimizer, binary cross-entropy loss and an accuracy metric.

    Args:
        vocab_size: Passed to the Embedding layer as ``input_dim``.
        embedding_dim: Passed to the Embedding layer as ``output_dim``.
        max_sequence_length: Passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    )
    recurrent_layer = SimpleRNN(64, activation='relu')
    output_layer = Dense(1, activation='sigmoid')

    network = Sequential([embedding_layer, recurrent_layer, output_layer])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Build model
# NOTE(review): the `model` created here is never fitted in the cells shown;
# the per-shard training loop rebinds `model` to a fresh network each iteration.
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size: one slot per known word, plus one for id 0 (the padding value seen in X)
embedding_dim = 50
model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)

W0000 00:00:1738487900.021250   27815 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## 4. Apply the SISA from another MISA

### 4.1 Shard the Dataset

In [8]:
num_shards = 5
shard_size = len(X) // num_shards

# Split dataset into shards
# Each shard is a contiguous run of shard_size rows, in the original row order.
# Only the first num_shards * shard_size rows are used; when len(X) is not a
# multiple of num_shards the trailing remainder rows end up in no shard.
# NOTE(review): rows are not shuffled before slicing — confirm the CSV is not
# ordered by sentiment, otherwise individual shards may see a single class.
rows_in_shards = num_shards * shard_size
shards_X = np.split(X[:rows_in_shards], num_shards)
shards_y = np.split(y[:rows_in_shards], num_shards)

### 4.2 Train models on each shard

In [9]:
# One independently initialised model per shard; models[k] is trained only on
# shards_X[k] / shards_y[k], so position in `models` identifies the shard.
models = []
for i, (shard_X, shard_y) in enumerate(zip(shards_X, shards_y)):
    print(f"Training model on shard {i+1}")
    model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)
    model.fit(shard_X, shard_y, epochs=10, batch_size=32, verbose=0)
    models.append(model)

Training model on shard 1
Training model on shard 2
Training model on shard 3
Training model on shard 4
Training model on shard 5


## 5. Quick Eval before unlearning

In [10]:
# Create a test set
# A dedicated, seeded generator makes the sampled rows (and therefore every
# accuracy figure computed from X_test) reproducible across kernel restarts.
# The sample size is capped at len(X) so sampling without replacement cannot
# fail on a small dataset.
# NOTE(review): the rows are drawn from X, the same array the training shards
# are sliced from, so this is not a held-out set and the accuracy measured on
# it is optimistic.
test_rng = np.random.default_rng(42)
test_indices = test_rng.choice(len(X), size=min(100, len(X)), replace=False)
X_test = X[test_indices]
y_test = y[test_indices]

# Aggregate predictions from all shard models
def aggregate_predictions(models, X_test, threshold=0.5):
    """Average the models' scores and threshold the mean into class decisions.

    Args:
        models: Non-empty sequence of objects exposing ``predict(X)``. Each
            call must return an array that can be added in place to an array
            of shape ``(X_test.shape[0], 1)``.
        X_test: Input array. Only ``X_test.shape[0]`` is read here; the array
            itself is handed unchanged to every ``predict`` call.
        threshold: A mean score strictly greater than this value counts as the
            positive class. Defaults to 0.5.

    Returns:
        Boolean array of shape ``(X_test.shape[0], 1)``; ``True`` where the
        mean score exceeds ``threshold``.

    Raises:
        ValueError: If ``models`` is empty. Without this guard the mean would
            be 0/0 (NaN) and every row would silently come back ``False``.
    """
    if len(models) == 0:
        raise ValueError("aggregate_predictions requires at least one model")

    predictions = np.zeros((X_test.shape[0], 1))
    for model in models:
        predictions += model.predict(X_test)
    return (predictions / len(models)) > threshold

# Evaluate accuracy
# The ensemble output has shape (n, 1); flattening it lines it up with the 1-D
# label vector so the comparison is element-wise.
y_pred_before = aggregate_predictions(models, X_test)
matches_before = y_pred_before.flatten() == y_test
accuracy_before = np.mean(matches_before)
print(f"Accuracy before unlearning: {accuracy_before * 100:.2f}%")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Accuracy before unlearning: 77.00%


## 6. Now we unlearn

### 6.1 Identify shard

In [11]:
data_point_index = 42  # Example index of data point to unlearn

# Only rows 0 .. num_shards * shard_size - 1 were placed in a shard. A negative
# index would silently wrap around to the wrong shard and an index in the
# dropped remainder would point past the last shard, so reject both up front.
if not 0 <= data_point_index < num_shards * shard_size:
    raise ValueError(
        f"data_point_index {data_point_index} is not inside any shard "
        f"(valid range: 0 to {num_shards * shard_size - 1})"
    )

# Shards are contiguous slices of shard_size rows, so integer division gives
# the owning shard and the remainder gives the row's position inside it.
shard_index = data_point_index // shard_size
point_offset = data_point_index % shard_size

# Capture the point that is about to be unlearned while its shard is still
# intact; the following cells query the models with unlearned_X / unlearned_y.
unlearned_X = shards_X[shard_index][point_offset]
unlearned_y = shards_y[shard_index][point_offset]

### 6.2 Retrain shard model

In [24]:
# Position of the data point inside its shard, and the shard as it was trained.
removal_offset = data_point_index % shard_size
shard_X_before = shards_X[shard_index]
shard_y_before = shards_y[shard_index]

# Check if the data point was removed
# np.delete returns new arrays, so shards_X / shards_y keep the full shard.
print(f"Shard size before removal: {len(shard_X_before)}")
new_shard_X = np.delete(shard_X_before, removal_offset, axis=0)
new_shard_y = np.delete(shard_y_before, removal_offset, axis=0)
print(f"Shard size after removal: {len(new_shard_X)}")

# Retrain the model on the updated shard
# A fresh network replaces the old entry in `models`, so nothing learned from
# the removed row carries over into the ensemble.
retrained_model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)
models[shard_index] = retrained_model
retrained_model.fit(new_shard_X, new_shard_y, epochs=10, batch_size=32, verbose=0)

Shard size before removal: 1420
Shard size after removal: 1419


<keras.src.callbacks.history.History at 0x79fa08d9df90>

In [25]:
# Get the model's prediction for the unlearned data point
# np.array([unlearned_X]) wraps the single sequence into a batch of one;
# prediction[0][0] is the lone sigmoid score for that row.
prediction = models[shard_index].predict(np.array([unlearned_X]))
print(f"Prediction for unlearned data point: {prediction[0][0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Prediction for unlearned data point: 0.0000


In [26]:
# Train a model on the original shard (including the unlearned data point)
# This stand-in for the "before" model is a new network trained here, not the
# model that originally occupied models[shard_index].
# NOTE(review): no random seed is set in the cells shown, so part of any
# difference between the two scores below may be run-to-run training noise.
original_model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)
original_model.fit(shards_X[shard_index], shards_y[shard_index], epochs=10, batch_size=32, verbose=0)

# Get the model's prediction before unlearning
prediction_before = original_model.predict(np.array([unlearned_X]))

# Get the model's prediction after unlearning
# models[shard_index] is the entry of the ensemble responsible for this shard.
prediction_after = models[shard_index].predict(np.array([unlearned_X]))

print(f"Prediction before unlearning: {prediction_before[0][0]:.4f}")
print(f"Prediction after unlearning: {prediction_after[0][0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Prediction before unlearning: 0.0000
Prediction after unlearning: 0.0000


In [27]:
# Amplified check: compare a model that saw the data point twice with a model
# that never saw it.

# Locate the point inside its shard so this cell does not depend on variables
# assigned by other cells.
point_pos = data_point_index % shard_size
unlearned_X = shards_X[shard_index][point_pos]
unlearned_y = shards_y[shard_index][point_pos]

# Duplicate the unlearned data point in a local copy of the shard.
# shards_X / shards_y themselves are left untouched, so re-running this cell,
# or any cell that removes the point by position, still sees exactly one copy.
dup_shard_X = np.vstack([shards_X[shard_index], unlearned_X])
dup_shard_y = np.append(shards_y[shard_index], unlearned_y)

# Train a fresh model on the shard with the duplicated data point
original_model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)
original_model.fit(dup_shard_X, dup_shard_y, epochs=10, batch_size=32, verbose=0)

# Remove the data point itself (not just the appended duplicate) and train a
# fresh model, so the "without" model really never sees the row at point_pos.
new_shard_X = np.delete(shards_X[shard_index], point_pos, axis=0)
new_shard_y = np.delete(shards_y[shard_index], point_pos, axis=0)
unlearned_model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)
unlearned_model.fit(new_shard_X, new_shard_y, epochs=10, batch_size=32, verbose=0)

# Compare predictions for the unlearned data point
prediction_original = original_model.predict(np.array([unlearned_X]))
prediction_unlearned = unlearned_model.predict(np.array([unlearned_X]))

print(f"Prediction with unlearned data point: {prediction_original[0][0]:.4f}")
print(f"Prediction without unlearned data point: {prediction_unlearned[0][0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Prediction with unlearned data point: 0.0000
Prediction without unlearned data point: 0.0000


## 7. How well did it unlearn

In [13]:
# Evaluate accuracy after unlearning
# Same ensemble vote and same X_test / y_test as the earlier evaluation; the
# only difference is the current contents of `models`.
y_pred_after = aggregate_predictions(models, X_test)
accuracy_after = np.mean(y_pred_after.flatten() == y_test)
print(f"Accuracy after unlearning: {accuracy_after * 100:.2f}%")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Accuracy after unlearning: 77.00%


## Final conclusions

In [14]:
# Side-by-side recap of the two ensemble accuracies, as percentages.
print(f"Accuracy before unlearning: {accuracy_before * 100:.2f}%")
print(f"Accuracy after unlearning: {accuracy_after * 100:.2f}%")

Accuracy before unlearning: 77.00%
Accuracy after unlearning: 77.00%


# But wait, did it actually unlearn?

In [17]:
# Retrieve the unlearned data point
# data_point_index % shard_size is the row's position inside its shard.
unlearned_X = shards_X[shard_index][data_point_index % shard_size]
unlearned_y = shards_y[shard_index][data_point_index % shard_size]

# Print the unlearned data point
# The text is looked up by position in df; this matches the shard row because
# X was built from df's rows in order and the shards are contiguous slices of X.
print("Unlearned Data Point:")
print(f"Text: {df.iloc[data_point_index]['content']}")
print(f"Label: {unlearned_y}")

Unlearned Data Point:
Text: @TheBarmyArmy looking forward to div 2 next year @AndyBarmyArmy?  #leictershireaway #therey
Label: 0


In [18]:
# Get the model's prediction for the unlearned data point
# The sequence is wrapped into a batch of one; prediction[0][0] is its single
# sigmoid score from the model that owns the point's shard.
prediction = models[shard_index].predict(np.array([unlearned_X]))
print(f"Prediction for unlearned data point: {prediction[0][0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Prediction for unlearned data point: 0.0000


In [19]:
# Train a model on the original shard (including the unlearned data point)
# A new network is built and fitted here to play the role of the "before"
# model for this comparison.
original_model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)
original_model.fit(shards_X[shard_index], shards_y[shard_index], epochs=10, batch_size=32, verbose=0)

# Get the model's prediction before unlearning
prediction_before = original_model.predict(np.array([unlearned_X]))

# Get the model's prediction after unlearning
# Uses the ensemble entry for the shard that contained the point.
prediction_after = models[shard_index].predict(np.array([unlearned_X]))

print(f"Prediction before unlearning: {prediction_before[0][0]:.4f}")
print(f"Prediction after unlearning: {prediction_after[0][0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Prediction before unlearning: 0.0000
Prediction after unlearning: 0.0000


In [20]:
# Accuracy before unlearning is deliberately NOT recomputed here. By this point
# `models` already holds the retrained shard model, so evaluating the ensemble
# again would only reproduce the post-unlearning number under the "before"
# label and overwrite the genuine value. accuracy_before from the
# "Quick Eval before unlearning" section is reused instead.

# Evaluate accuracy after unlearning
y_pred_after = aggregate_predictions(models, X_test)
accuracy_after = np.mean(y_pred_after.flatten() == y_test)

print(f"Accuracy before unlearning: {accuracy_before * 100:.2f}%")
print(f"Accuracy after unlearning: {accuracy_after * 100:.2f}%")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Accuracy before unlearning: 77.00%
Accuracy after unlearning: 77.00%


In [21]:
# Retrain the model on the entire shard (including the unlearned data point)
# NOTE(review): original_model is not rebuilt here, so this fit() presumably
# continues from whatever weights it already has, while unlearned_model below
# starts from a fresh network — the two models have not had the same amount
# of training. Confirm whether that asymmetry is intended.
original_model.fit(shards_X[shard_index], shards_y[shard_index], epochs=10, batch_size=32, verbose=0)

# Retrain the model on the shard excluding the unlearned data point
# np.delete drops the row at the point's position and returns new arrays.
new_shard_X = np.delete(shards_X[shard_index], data_point_index % shard_size, axis=0)
new_shard_y = np.delete(shards_y[shard_index], data_point_index % shard_size, axis=0)
unlearned_model = build_rnn_model(vocab_size, embedding_dim, max_sequence_length)
unlearned_model.fit(new_shard_X, new_shard_y, epochs=10, batch_size=32, verbose=0)

# Compare predictions for the unlearned data point
prediction_original = original_model.predict(np.array([unlearned_X]))
prediction_unlearned = unlearned_model.predict(np.array([unlearned_X]))

print(f"Prediction with unlearned data point: {prediction_original[0][0]:.4f}")
print(f"Prediction without unlearned data point: {prediction_unlearned[0][0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Prediction with unlearned data point: 0.0000
Prediction without unlearned data point: 0.0000


## More verifications

Shard size before removal: 1420
Shard size after removal: 1419




<keras.src.callbacks.history.History at 0x79fa0b5be4a0>