In [23]:
## If I give input as "[1,1,0,1]" it must give the same copy [1,1,0,1]
import numpy as np

## Training dataset
random_ints = np.random.randint(low=1, high=1000, size=10000)
random_ints

array([870, 268, 959, ..., 213, 223,  11], shape=(10000,), dtype=int32)

In [24]:
# Convert each integer to a binary list (e.g., 5 -> [1, 0, 1])
def int_to_bin_list(n, width=10):  # fixed width for uniformity
    return [int(bit) for bit in format(n, f'0{width}b')]

# Apply to dataset
X = [int_to_bin_list(n) for n in random_ints]
y = X.copy()

In [25]:
## If I give input as "[1,1,0,1]" it must give the same copy [1,1,0,1]

class Copy_Lstm:
    def __init__(self):
        # self.X= X
        # self.Y = y

        ## Forget gate parameter
        self.W_f = np.random.randn()
        self.U_f = np.random.randn()
        self.b_f = np.random.randn()

        ## Input gate parameters
        self.W_i = np.random.randn()
        self.U_i = np.random.randn()
        self.b_i = np.random.randn()

        ## Candidate cell state parameters (aka C_tilde)
        self.W_C = np.random.randn()
        self.U_C = np.random.randn()
        self.b_C = np.random.randn()

        # Output gate parameters
        self.W_o = np.random.randn()   # weight for hidden state (h_{t-1})
        self.U_o = np.random.randn()   # weight for input (x_t)
        self.b_o = np.random.randn()   # bias

        ## Final output layer (from hidden state to prediction)
        self.W_y = np.random.randn()
        self.b_y = np.random.randn()

    def tanh(self,x):
        return np.tanh(x)
    

    def tanh_derivative(self, x):
        return 1 - np.tanh(x)**2

    
    def sigmoid(self,x):
        return 1/(1+np.exp(-x))
    
    def sigmoid_derivative(self, x):
        sig = self.sigmoid(x)
        return sig * (1 - sig)

    
    def forward(self):
        ## Initialize memory states
        C_t = 0 ## cell state(long-term memory)
        h_t = 0 ## Hidden state (Short-Term memory)

        self.inputs = []
        self.f_t, self.i_t, self.C_tilde, self.C_t, self.o_t, self.h_t = [], [],[],[],[],[]
        self.y_hat, self.probs, self.y_pred = [],[],[]
        self.z_f , self.z_i, self.z_C, self.z_o = [],[],[],[]

        for x_t in self.X:
            self.inputs.append(x_t)

            ## FOrget gate
            z_f = (h_t * self.W_f) + (x_t * self.U_f) + self.b_f
            f_t = self.sigmoid(z_f)

            ## Input gate
            z_i = (h_t * self.W_i) + (x_t * self.U_i) + self.b_i
            i_t = self.sigmoid(z_i)

            ## Candidate cell state
            z_C = (h_t * self.W_C) + (x_t * self.U_C) + self.b_C
            C_tilde = self.tanh(z_C)


            ## Update Cell State
            C_t = (f_t * C_t) + (i_t * C_tilde)

            ## Output gate
            z_o = (h_t * self.W_o) + (x_t * self.U_o) + self.b_o
            o_t = self.sigmoid(z_o)

            ## Hidden state
            h_t = o_t * self.tanh(C_t)

            ## Save intermediate values
            self.f_t.append(f_t)
            self.i_t.append(i_t)
            self.C_tilde.append(C_tilde)
            self.C_t.append(C_t)
            self.o_t.append(o_t)
            self.h_t.append(h_t)
            self.z_f.append(z_f)
            self.z_i.append(z_i)
            self.z_C.append(z_C)
            self.z_o.append(z_o)

            ## Final prediction
            y_hat_t = self.W_y * h_t + self.b_y
            prob = self.sigmoid(y_hat_t)
            y_pred = 1 if prob > 0.5 else 0

            self.y_hat.append(y_hat_t)
            self.probs.append(prob)
            self.y_pred.append(y_pred)



    def backward(self, lr = 0.01):
        
        dl_w_f = np.zeros_like(self.W_f)
        dl_w_i = np.zeros_like(self.W_i)
        dl_w_C = np.zeros_like(self.W_C)
        dl_w_o = np.zeros_like(self.W_o)
        dl_w_y = np.zeros_like(self.W_y)

        dl_b_f = np.zeros_like(self.b_f)
        dl_b_i = np.zeros_like(self.b_i)
        dl_b_C = np.zeros_like(self.b_C)
        dl_b_o = np.zeros_like(self.b_o)
        dl_b_y = np.zeros_like(self.b_y)

        dl_U_f = np.zeros_like(self.U_f)
        dl_U_i = np.zeros_like(self.U_i)
        dl_U_C = np.zeros_like(self.U_C)
        dl_U_o = np.zeros_like(self.U_o)
        dl_h_next = 0
        dl_C_next = 0

        for t in reversed(range(len(self.X))):



        ## Using Cross Entropy with Sigmoid so we get
            dl_y_hat = self.probs[t] - self.Y[t]

            ## Output layer
            dl_w_y += dl_y_hat * self.h_t[t]
            dl_h_t = dl_y_hat * self.W_y + dl_h_next
            dl_b_y += dl_y_hat

            dl_o_t = dl_h_t * self.tanh(self.C_t[t])
            dl_C_t = dl_h_t * self.o_t[t] * self.tanh_derivative(self.C_t[t]) + dl_C_next
            dl_z_o = dl_o_t * self.sigmoid_derivative(self.z_o[t])
            dl_w_o += dl_z_o * (self.h_t[t-1] if t > 0 else 0)
            

            dl_U_o += dl_z_o * self.inputs[t]
            dl_b_o += dl_z_o

            ## Input Layer
            dl_C_tilde = dl_C_t * self.i_t[t]
            dl_z_c = dl_C_tilde * self.tanh_derivative(self.z_C[t])
            dl_w_C += dl_z_c * (self.h_t[t-1] if t>0 else 0)
            dl_U_C += dl_z_c * self.inputs[t]
            dl_b_C += dl_z_c


            dl_i_t = dl_C_t * self.C_tilde[t]
            dl_z_i = dl_i_t * self.sigmoid_derivative(self.z_i[t])
            dl_w_i += dl_z_i * (self.h_t[t-1] if t>0 else 0)
            dl_U_i += dl_z_i * self.inputs[t]
            dl_b_i += dl_z_i

            ## Forget gate
            dl_f_t = dl_C_t * (self.C_t[t-1] if t > 0 else 0)
            dl_z_f = dl_f_t * self.sigmoid_derivative(self.z_f[t])
            dl_w_f += dl_z_f * (self.h_t[t-1] if t > 0 else 0)
            dl_U_f += dl_z_f * self.inputs[t]
            dl_b_f += dl_z_f

            dl_h_prev = dl_z_o * self.W_o ## For h[t-1] because till this point in forward pass we had not got h_t
            dl_h_prev += dl_z_f * self.W_f
            dl_h_prev += dl_z_i * self.W_i
            dl_h_prev += dl_z_c * self.W_C

            # Carry-over for next step
            dl_h_next = dl_h_prev
            dl_C_next = dl_C_t * self.f_t[t]

        ## Update weights
        self.W_f -= lr * dl_w_f
        self.W_i -= lr * dl_w_i
        self.W_C -= lr * dl_w_C
        self.W_o -= lr * dl_w_o
        self.W_y -= lr * dl_w_y

        self.b_f -= lr * dl_b_f
        self.b_i -= lr * dl_b_i
        self.b_C -= lr * dl_b_C
        self.b_o -= lr * dl_b_o
        self.b_y -= lr * dl_b_y

        self.U_f -= lr * dl_U_f
        self.U_i -= lr * dl_U_i
        self.U_C -= lr * dl_U_C
        self.U_o -= lr * dl_U_o









        

    def train(self, dataset_X, dataset_Y, epochs=100, lr=0.01, verbose=True):
    
        losses = []

        for epoch in range(epochs):
            epoch_loss = 0

            # Go through each training sample
            for X_sample, Y_sample in zip(dataset_X, dataset_Y):
                self.X = X_sample
                self.Y = Y_sample

                # Forward pass
                self.forward()

                # Compute binary cross-entropy loss for this sample
                eps = 1e-8
                loss = -np.mean([
                    y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps)
                    for y, p in zip(self.Y, self.probs)
                ])
                epoch_loss += loss

                # Backward pass (weight update)
                self.backward(lr=lr)

            # Average loss across all samples
            epoch_loss /= len(dataset_X)
            losses.append(epoch_loss)

            # Print progress occasionally
            if verbose and epoch % max(1, epochs // 10) == 0:
                print(f"Epoch {epoch}/{epochs} | Loss: {epoch_loss:.6f}")

        return losses


    def predict(self, X):
        self.X = X
        self.forward()
        return self.y_pred






In [26]:
## Test
lstm = Copy_Lstm()


In [27]:
lstm.train(dataset_X=X, dataset_Y=y)

Epoch 0/100 | Loss: 0.026917
Epoch 10/100 | Loss: 0.000165
Epoch 20/100 | Loss: 0.000084
Epoch 30/100 | Loss: 0.000057
Epoch 40/100 | Loss: 0.000043
Epoch 50/100 | Loss: 0.000034
Epoch 60/100 | Loss: 0.000029
Epoch 70/100 | Loss: 0.000024
Epoch 80/100 | Loss: 0.000021
Epoch 90/100 | Loss: 0.000019


[np.float64(0.02691706017993916),
 np.float64(0.0012403818622056509),
 np.float64(0.000714594021098259),
 np.float64(0.0005039767116976332),
 np.float64(0.0003896324139417969),
 np.float64(0.00031768643364465254),
 np.float64(0.0002682070904411658),
 np.float64(0.00023207911238816935),
 np.float64(0.0002045357199733751),
 np.float64(0.00018283976502764666),
 np.float64(0.00016530670154237014),
 np.float64(0.00015084269537014722),
 np.float64(0.0001387064440269511),
 np.float64(0.00012837768738697887),
 np.float64(0.00011948050623358758),
 np.float64(0.00011173649645792508),
 np.float64(0.00010493506655486902),
 np.float64(9.891397099798986e-05),
 np.float64(9.354618678936637e-05),
 np.float64(8.873084551836227e-05),
 np.float64(8.43868301448531e-05),
 np.float64(8.044816544238959e-05),
 np.float64(7.686064198412129e-05),
 np.float64(7.357930494344e-05),
 np.float64(7.056655980330111e-05),
 np.float64(6.77907250988382e-05),
 np.float64(6.522491376051502e-05),
 np.float64(6.2846159179609

In [34]:
X_test = [1, 1, 1, 0, 1, 1, 1, 1, 1, 0]

y_pred = lstm.predict(X_test)
print("Predicted:", y_pred)


Predicted: [1, 1, 1, 0, 1, 1, 1, 1, 1, 0]
