# Computational Neuroscience Final Project
## Elizabeth Frey

In [71]:
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Import Data

In [1]:
import pandas as pd
import numpy as np

# Load the multilingual dataset from a CSV file given by a relative path.
# Later cells group by its 'labels' column and rename the text columns
# 'text', 'text_hi', 'text_de' and 'text_fr' to short language keys.
df = pd.read_csv('data-en-hi-de-fr.csv')

In [40]:
from tensorflow.keras.utils import to_categorical

# Short language key -> integer class id used as the classification target.
languages = {'en': 0, 'hi': 1, 'de': 2, 'fr': 3}

# Raw CSV text column -> short language key (the keys of `languages`).
_LANGUAGE_COLUMNS = {
    'text': 'en', 'text_hi': 'hi', 'text_de': 'de', 'text_fr': 'fr'}


def _to_long_format(label_df):
    """Reshape the rows of one label into one row per (language, phrase).

    Parameters
    ----------
    label_df : pandas.DataFrame
        Rows belonging to a single label, still carrying the 'labels'
        column and the raw text columns named in `_LANGUAGE_COLUMNS`.

    Returns
    -------
    pandas.DataFrame
        Columns 'language' (short key), 'phrase' (the text) and
        'language_code' (integer id looked up in `languages`).
    """
    wide = label_df.drop('labels', axis=1).rename(columns=_LANGUAGE_COLUMNS)
    # Melt only the known language columns, in their existing order, so an
    # unexpected extra CSV column cannot become a bogus "language" whose
    # code would map to NaN.
    text_columns = [col for col in wide.columns if col in languages]
    long_df = wide.melt(
        value_vars=text_columns, var_name='language', value_name='phrase')
    # convert the languages to numbers
    long_df['language_code'] = long_df['language'].map(languages)
    return long_df


# remove any rows with null values (rebinding instead of mutating in place)
df = df.dropna()

# group the dataframe by labels column
grouped = df.groupby('labels')

# create two long-format dataframes, one for ham and one for spam
ham_df = _to_long_format(grouped.get_group('ham'))
spam_df = _to_long_format(grouped.get_group('spam'))

# The language codes are one-hot encoded later with `to_categorical`, right
# after the train/test split, so no manual one-hot helper is needed here.

## Embeddings

Defines functions for testing out different embeddings.

In [41]:
# define a function to tokenize the phrases in each dataframe


def tokenize(df):
    """Split every entry of ``df['phrase']`` on whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'phrase' column of strings.

    Returns
    -------
    list of list of str
        One token list per row, in row order.
    """
    token_lists = []
    for text in df['phrase']:
        token_lists.append(text.split())
    return token_lists


# Tokenize each class once; the plain lists are kept as variables because
# the Word2Vec cell trains on them directly.
ham_tokens, spam_tokens = (tokenize(frame) for frame in (ham_df, spam_df))

# Store the token lists next to their phrases.
ham_df['tokens'], spam_df['tokens'] = ham_tokens, spam_tokens

### Word2Vec Embeddings

In [42]:
import gensim
from gensim.models import Word2Vec


def _mean_embedding(tokens, keyed_vectors):
    """Average the word vectors of `tokens` into one float32 phrase vector.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single phrase.
    keyed_vectors : gensim.models.KeyedVectors
        Trained vectors (``Word2Vec.wv``) in which the tokens are looked up.

    Returns
    -------
    numpy.ndarray
        1-D float32 vector of length ``keyed_vectors.vector_size``; all
        zeros when `tokens` is empty.
    """
    if not tokens:
        # NOTE(review): a whitespace-only phrase tokenizes to [], and looking
        # up an empty list leaves nothing to average. Fall back to a zero
        # vector rather than aborting the whole column.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return keyed_vectors[tokens].mean(axis=0).astype(np.float32)


# train the word2vec model on the ham and spam tokens
# (min_count=1 keeps even tokens that occur only once in the vocabulary)
# NOTE: the name `model` is rebound to the Keras network in a later cell, so
# the lookups below are bound to `model.wv` explicitly.
model = Word2Vec(ham_tokens + spam_tokens, min_count=1)

# create the embeddings for each dataframe
ham_df['w2v_embedding'] = ham_df['tokens'].apply(
    _mean_embedding, args=(model.wv,))
spam_df['w2v_embedding'] = spam_df['tokens'].apply(
    _mean_embedding, args=(model.wv,))

## Train the Model

In [89]:
# create training and testing datasets from the ham and spam dataframes
from sklearn.model_selection import train_test_split

# Features are the phrase embeddings, targets the integer language codes,
# with ham rows first and spam rows after them.
X = pd.concat([ham_df['w2v_embedding'], spam_df['w2v_embedding']])
y = pd.concat([ham_df['language_code'], spam_df['language_code']])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# One-hot encode the targets; the class count follows the `languages`
# mapping instead of a hard-coded 4.
num_classes = len(languages)
y_train_one_hot = to_categorical(y_train, num_classes)
y_test_one_hot = to_categorical(y_test, num_classes)

# From here on `y_train` / `y_test` hold the one-hot arrays (copies).
y_train = np.array(y_train_one_hot)
y_test = np.array(y_test_one_hot)

# Stack the per-row embedding vectors into 2-D arrays (samples, embedding_dim).
X_train = np.stack(X_train.tolist())
X_test = np.stack(X_test.tolist())

# Add a length-1 time axis for the recurrent layer: (samples, 1, embedding_dim).
# The width is read from the data rather than hard-coded to 100.
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# Sanity check: every sample must have exactly one label row.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

In [84]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Language classifier: a 64-unit GRU over inputs of shape (1, 100), followed
# by batch normalization and a 4-way softmax output.
model = keras.Sequential([
    layers.GRU(64, input_shape=(1, 100)),
    layers.BatchNormalization(),
    layers.Dense(units=4, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=["accuracy"],
)

# Show the layer-by-layer summary
model.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_19 (GRU)                (None, 64)                31872     
                                                                 
 batch_normalization_17 (Ba  (None, 64)                256       
 tchNormalization)                                               
                                                                 
 dense_17 (Dense)            (None, 4)                 260       
                                                                 
Total params: 32388 (126.52 KB)
Trainable params: 32260 (126.02 KB)
Non-trainable params: 128 (512.00 Byte)
_________________________________________________________________


In [85]:
# Show full, unfiltered TensorFlow tracebacks if anything below fails.
tf.debugging.disable_traceback_filtering()

# Convert the training arrays to float32 tensors.
X_train_tensor, y_train_tensor = (
    tf.convert_to_tensor(array, dtype=tf.float32)
    for array in (X_train, y_train)
)

# Print both shapes, one per line, as a quick check before training.
print(X_train_tensor.shape, y_train_tensor.shape, sep="\n")


# Train for 10 epochs with mini-batches of 64 samples.
model.fit(X_train_tensor, y_train_tensor, batch_size=64, epochs=10)

(17830, 1, 100)
(17830, 4)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2ce05e250>

In [86]:
# Preview the first ten test samples: predicted class index next to the
# one-hot target row. All samples go through a single batched predict call
# instead of one `model.predict` call per sample, and the slice also copes
# with a test set that has fewer than ten rows.
preview_probs = model.predict(X_test[:10])
preview_classes = tf.argmax(preview_probs, axis=1).numpy()
for i in range(len(preview_classes)):
    # The length-1 slice keeps the printed form identical to the
    # per-sample version, e.g. "[1] [0. 1. 0. 0.]".
    print(preview_classes[i:i + 1], y_test[i])

[1] [0. 0. 1. 0.]
[2] [0. 0. 0. 1.]
[2] [0. 0. 1. 0.]
[1] [0. 0. 1. 0.]
[1] [0. 1. 0. 0.]
[0] [0. 1. 0. 0.]
[2] [0. 1. 0. 0.]
[0] [0. 0. 1. 0.]
[1] [0. 1. 0. 0.]
[0] [0. 1. 0. 0.]


In [91]:
# Convert the held-out arrays to float32 tensors.
X_test_tensor, y_test_tensor = (
    tf.convert_to_tensor(array, dtype=tf.float32)
    for array in (X_test, y_test)
)

# Print both shapes, one per line, as a quick check before scoring.
print(X_test_tensor.shape, y_test_tensor.shape, sep="\n")

# Displays the loss followed by the compiled accuracy metric.
model.evaluate(X_test_tensor, y_test_tensor)

(4458, 1, 100)
(4458, 4)
  1/140 [..............................] - ETA: 1s - loss: 0.0642 - accuracy: 0.9688



[0.07282949984073639, 0.975549578666687]