# Connectionist modelling workshop 
## Learning the English Past Tense

In this worksheet we'll build a neural network to simulate Past Tense acquisition in English. 




### Prepare the code

Run this cell at the beginning to load necessary modules and code

In [None]:
#CELL1

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from random import seed, random
import random
from csv import reader

# Load a CSV file with numeric content
def load_csv_numeric(filename):
    """Load a CSV file whose fields are all numeric.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is the list
        of that row's fields converted with ``float``.

    Raises:
        ValueError: If a field cannot be converted with ``float``.
    """
    # newline='' hands line-ending handling to the csv module, which is what
    # the csv documentation asks for when a file object is given to reader().
    with open(filename, 'r', newline='') as csv_file:
        return [[float(field) for field in row]
                for row in reader(csv_file)
                if row]  # completely empty lines are skipped

#load a csv file with char or string content
def load_csv_str(filename):
    """Load the first column of a CSV file as strings.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list holding the first field of every non-empty CSV row, in file
        order. Any further fields on a row are ignored.
    """
    # newline='' hands line-ending handling to the csv module; without it,
    # newlines embedded in quoted fields are not read back correctly.
    with open(filename, 'r', newline='') as csv_file:
        return [row[0] for row in reader(csv_file) if row]

# Calculate neuron activation for an input
def activate(weights, inputs):
    """Return a neuron's net input: its bias plus the weighted sum of inputs.

    The last element of ``weights`` is the bias. Every earlier weight is
    multiplied by the input at the same position; inputs beyond that are
    not read.
    """
    net_input = weights[-1]  # start from the bias term
    for position, weight in enumerate(weights[:-1]):
        net_input += weight * inputs[position]
    return net_input

def transfer(activation):
    """Squash a net input with the logistic sigmoid, 1 / (1 + e^-x)."""
    decay = np.exp(-activation)
    return 1.0 / (1.0 + decay)

# Calculate the derivative of a neuron output
def transfer_derivative(output):
    """Return output * (1 - output), the sigmoid slope written in terms of its output."""
    complement = 1.0 - output
    return output * complement

# Initialize a network
def initialize_network(n_inputs, n_hidden, n_outputs):
    """Build a network with one hidden layer and random starting weights.

    The network is a list of two layers (hidden, then output). Each layer is
    a list of neurons, and each neuron is a dict whose 'weights' list holds
    one weight per incoming connection plus a final bias weight, all drawn
    from random.random(). The architecture is printed before returning.
    """
    def random_layer(n_units, n_incoming):
        # One extra weight per neuron for the bias
        layer = []
        for _ in range(n_units):
            weights = []
            for _ in range(n_incoming + 1):
                weights.append(random.random())
            layer.append({'weights': weights})
        return layer

    # Hidden weights are drawn before output weights
    layers = [random_layer(n_hidden, n_inputs), random_layer(n_outputs, n_hidden)]

    print('architecture:')
    print('input units:'+str(n_inputs))
    print('hidden units:'+str(n_hidden))
    print('output units:'+str(n_outputs))
    print('------------------')

    return layers


def network(train, traintarget, test, testtarget, l_rate, n_epoch, n_hidden, momentum=0.9,plot_error=False):
    """Create a network, train it on the training set and score it on the test set.

    The input and output layer sizes are taken from the length of the first
    training pattern and the first training target.

    Returns:
        (trained network, final training RMS error, test RMS error)
    """
    print('initialize network')
    net = initialize_network(len(train[0]), n_hidden, len(traintarget[0]))

    trainerror = train_backprop(net, train, traintarget, l_rate, n_epoch, momentum, plot_error)
    testerror = test_network(net, test, testtarget)
    return net, trainerror, testerror

# Train a network for a fixed number of epochs
def train_backprop(network, train, traintarget, l_rate, n_epoch, momentum=.9, plot_error=False, cat_vector=None):
    """Train ``network`` in place for a fixed number of epochs.

    Patterns are presented one at a time in list order; after each pattern
    the error is backpropagated and the weights are updated immediately.

    Args:
        network: Network as built by initialize_network(); modified in place.
        train: List of input patterns.
        traintarget: List of target patterns, parallel to ``train``.
        l_rate: Learning rate.
        n_epoch: Number of passes over the training set.
        momentum: If greater than 0, weights are updated with
            update_weights_momentum(); otherwise with update_weights().
        plot_error: If true, plot the RMS error per epoch after training.
        cat_vector: Optional category label per training pattern. When it is
            non-empty, the RMS error is also tracked (and plotted) per
            category. ``None`` and an empty sequence both disable this.

    Returns:
        The overall RMS error of the last epoch.

    Raises:
        IndexError: If ``n_epoch`` is 0, since there is no last epoch.
    """
    print('train_backprop')
    if cat_vector is None:
        cat_vector = []
    track_cats = len(cat_vector) > 0

    rms_error = []
    if track_cats:
        # dict.fromkeys keeps first-appearance order, so the printed list, the
        # legend and the line colours are the same on every run (a set is not).
        cats = list(dict.fromkeys(cat_vector))
        print(cats)
        cat_index = {cat: position for position, cat in enumerate(cats)}
        cat_errors = [[] for _ in cats]

    for epoch in range(n_epoch):

        print('epoch:'+str(epoch))
        sum_error = 0
        if track_cats:
            cat_sum_errors = [0] * len(cats)
            cat_sum_n = [0] * len(cats)

        for j, pattern in enumerate(train):
            outputs = forward_propagate(network, pattern)
            expected = traintarget[j]

            # Squared error of this pattern, measured before its weight update
            pattern_error = sum([(expected[i]-outputs[i])**2 for i in range(len(expected))])
            sum_error += pattern_error
            if track_cats:
                this_cat_idx = cat_index[cat_vector[j]]
                cat_sum_errors[this_cat_idx] += pattern_error
                cat_sum_n[this_cat_idx] += 1

            backward_propagate_error(network, expected)
            if momentum > 0:
                update_weights_momentum(network, pattern, l_rate, momentum)
            else:
                update_weights(network, pattern, l_rate)

        epoch_rms_error = np.sqrt(sum_error/len(train))
        rms_error.append(epoch_rms_error)
        if track_cats:
            for c in range(len(cats)):
                if cat_sum_n[c]:
                    epoch_cat_rms_error = np.sqrt(cat_sum_errors[c]/cat_sum_n[c])
                else:
                    # Category label present in cat_vector but never paired
                    # with a training pattern: no error to report.
                    epoch_cat_rms_error = float('nan')
                cat_errors[c].append(epoch_cat_rms_error)

        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, epoch_rms_error))

    if plot_error:
        plt.figure()
        plt.plot(range(n_epoch), rms_error)
        if track_cats:
            for c in range(len(cats)):
                plt.plot(range(n_epoch), cat_errors[c])
            plt.legend(['overall']+cats)
        plt.xlabel('epoch')
        plt.ylabel('error')
        plt.show()
    return rms_error[-1]


# Neural network using Backpropagation Algorithm With Stochastic Gradient Descent
#training phase only
#option for showing error by category
def network_train_only(train, traintarget, l_rate, n_epoch, n_hidden, momentum=0.9,plot_error=False, cat_vector=[]):
    """Create a network and train it, without a test phase.

    Layer sizes come from the first training pattern and first target.
    ``cat_vector`` is passed on to train_backprop() so the error can be
    shown per category.

    Returns:
        (trained network, final training RMS error)
    """
    print('initialize network')
    net = initialize_network(len(train[0]), n_hidden, len(traintarget[0]))

    final_error = train_backprop(net, train, traintarget, l_rate, n_epoch, momentum, plot_error, cat_vector)
    return net, final_error

#calculate error for test set (frozen weights, no training)
def test_network(network, test, testtarget):
    """Return the RMS error of ``network`` over the test set.

    Each test pattern is run through predict(); the weights are not changed.
    The squared error is summed over all output units and patterns, divided
    by the number of patterns, and square-rooted.
    """
    squared_error_total = 0
    for position, pattern in enumerate(test):
        produced = predict(network, pattern)
        wanted = testtarget[position]
        squared_error_total += sum([(wanted[k] - produced[k]) ** 2 for k in range(len(wanted))])
    return np.sqrt(squared_error_total / len(test))

# Update network weights with momentum
def update_weights_momentum(network, row, l_rate, momentum):
    """Update all weights in place using the stored deltas plus momentum.

    For every weight the step is
    ``l_rate * delta * input + momentum * previous_step``; the bias uses an
    input of 1. The step just taken is remembered per weight in the neuron's
    'delta_prev' list (same length as 'weights', bias last).

    Args:
        network: List of layers of neuron dicts; each neuron needs 'weights'
            and 'delta', and neurons below the last layer need 'output'.
        row: Input pattern that produced the current outputs. As in
            activate(), the first ``len(weights) - 1`` entries are used, so
            every input weight of the first layer is trained.
        l_rate: Learning rate.
        momentum: Fraction of the previous step added to the current one.
    """
    for i, layer in enumerate(network):
        # First layer sees the pattern; later layers see the previous outputs
        if i == 0:
            inputs = row
        else:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in layer:
            weights = neuron['weights']
            previous = neuron.get('delta_prev')
            if previous is None or len(previous) != len(weights):
                # One slot per weight, including a slot of its own for the
                # bias, so bias momentum never overwrites an input weight's.
                previous = [0.0] * len(weights)
                neuron['delta_prev'] = previous
            for j in range(len(weights) - 1):
                step = (l_rate * neuron['delta'] * inputs[j]) + (momentum * previous[j])
                weights[j] += step
                previous[j] = step
            bias_step = (l_rate * neuron['delta']) + (momentum * previous[-1])
            weights[-1] += bias_step
            previous[-1] = bias_step

# Update network weights with error
def update_weights(network, row, l_rate):
    """Update all weights in place from the stored deltas (no momentum).

    Each input weight changes by ``l_rate * delta * input`` and each bias by
    ``l_rate * delta``.

    Args:
        network: List of layers of neuron dicts; each neuron needs 'weights'
            and 'delta', and neurons below the last layer need 'output'.
        row: Input pattern that produced the current outputs. As in
            activate(), the first ``len(weights) - 1`` entries are used, so
            every input weight of the first layer is trained.
        l_rate: Learning rate.
    """
    for i, layer in enumerate(network):
        # First layer sees the pattern; later layers see the previous outputs
        if i == 0:
            inputs = row
        else:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in layer:
            weights = neuron['weights']
            for j in range(len(weights) - 1):
                weights[j] += l_rate * neuron['delta'] * inputs[j]
            weights[-1] += l_rate * neuron['delta']  # bias
            
            
def backward_propagate_error(network, expected):
    """Store an error signal ('delta') in every neuron, working back from the output layer.

    Output neurons use (expected - output) as their error. Neurons in
    earlier layers use the sum, over the neurons of the next layer, of the
    connecting weight times that neuron's delta. In both cases the error is
    multiplied by transfer_derivative(output) to give the delta.
    """
    last = len(network) - 1
    for depth in range(last, -1, -1):
        layer = network[depth]
        if depth == last:
            errors = [expected[k] - unit['output'] for k, unit in enumerate(layer)]
        else:
            downstream = network[depth + 1]
            errors = []
            for k in range(len(layer)):
                back_signal = 0.0
                for unit in downstream:
                    back_signal += unit['weights'][k] * unit['delta']
                errors.append(back_signal)
        for unit, unit_error in zip(layer, errors):
            unit['delta'] = unit_error * transfer_derivative(unit['output'])
            
# Forward propagate input to a network output
def forward_propagate(network, row):
    """Run ``row`` through the network and return the last layer's outputs.

    Every neuron's activation is stored under its 'output' key on the way,
    and the outputs of one layer are the inputs of the next.
    """
    signal = row
    for layer in network:
        layer_outputs = []
        for unit in layer:
            unit['output'] = transfer(activate(unit['weights'], signal))
            layer_outputs.append(unit['output'])
        signal = layer_outputs
    return signal

# Make a prediction with a network
def predict(network, row):
    """Return the network's output activations for one input pattern."""
    return forward_propagate(network, row)

#show the output for a single test pattern (given by idx)
def test_single_pattern(network, test, testtarget, testlabels, testtargetlabels, idx):
    """Plot target against actual output for the test pattern at ``idx``.

    Draws a grouped bar chart with one pair of bars (target, output) per
    output unit, titled with the pattern's index and its two labels.

    Returns:
        (output activations, per-unit error), where the error is
        target minus output.
    """
    produced = predict(network, test[idx])
    wanted = testtarget[idx]
    error_pattern = [wanted[k] - produced[k] for k in range(len(wanted))]

    positions = np.arange(len(wanted))  # one x position per output unit
    bar_width = 0.35
    half_width = bar_width / 2

    fig, ax = plt.subplots()
    # Target bars sit left of each position, output bars right of it
    ax.bar(positions - half_width, wanted, bar_width, label='Target')
    ax.bar(positions + half_width, produced, bar_width, label='Output')

    ax.set_ylabel('Output activation')
    title = 'Target vs. Output for pattern ' +str(idx) +':' + testlabels[idx] + '/' + testtargetlabels[idx]
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(positions)
    ax.legend()

    fig.tight_layout()
    plt.show()

    return produced, error_pattern

#train a network and plot the error for a single pattern given by idx
def network_log_pattern(train, traintarget,  trainlabel, traintargetlabel,l_rate, n_epoch, n_hidden, idx=0, momentum=0.9):
    """Create a network and train it while logging the error of one pattern.

    Layer sizes come from the first training pattern and first target. The
    training itself, and the plot for the pattern at ``idx``, are done by
    train_backprop_log_pattern().

    Returns:
        (trained network, final training RMS error)
    """
    print('call network')
    net = initialize_network(len(train[0]), n_hidden, len(traintarget[0]))

    final_error = train_backprop_log_pattern(
        net, train, traintarget, trainlabel, traintargetlabel,
        l_rate, n_epoch, idx, momentum)

    return net, final_error


#run backprop and plot the error for a single pattern given by idx
def train_backprop_log_pattern(network, train, traintarget, trainlabel, traintargetlabel, l_rate, n_epoch, idx=0, momentum=0.9, plot_error=False):
    """Train ``network`` in place and plot the error of one training pattern.

    Training is the same per-pattern backpropagation as in train_backprop().
    In every epoch the summed squared error of the pattern at ``idx`` is
    recorded (measured before that pattern's weight update), and after
    training it is plotted against the epoch number.

    Args:
        network: Network as built by initialize_network(); modified in place.
        train: List of input patterns.
        traintarget: List of target patterns, parallel to ``train``.
        trainlabel: Labels of the input patterns (used in the plot title).
        traintargetlabel: Labels of the target patterns (used in the title).
        l_rate: Learning rate.
        n_epoch: Number of passes over the training set.
        idx: Index into ``train`` of the pattern to log.
        momentum: If greater than 0, weights are updated with
            update_weights_momentum(); otherwise with update_weights().
        plot_error: Accepted but not used; the pattern plot is always drawn.

    Returns:
        The overall RMS error of the last epoch.

    Raises:
        IndexError: If ``idx`` is not a valid index into ``train``, or if
            ``n_epoch`` is 0 (there is no last epoch to report).
    """
    # Fail before training instead of after it: an index that never matches
    # would leave nothing to plot once all epochs have been run.
    if not 0 <= idx < len(train):
        raise IndexError('idx %r is outside the training set (size %d)' % (idx, len(train)))

    print('train_backprop')
    rms_error = []
    pattern_error = []

    for epoch in range(n_epoch):

        print('epoch:'+str(epoch))
        sum_error = 0

        for j, pattern in enumerate(train):
            outputs = forward_propagate(network, pattern)
            expected = traintarget[j]

            this_error = sum([(expected[i]-outputs[i])**2 for i in range(len(expected))])
            if j == idx:
                pattern_error.append(this_error)
            sum_error += this_error

            backward_propagate_error(network, expected)
            if momentum > 0:
                update_weights_momentum(network, pattern, l_rate, momentum)
            else:
                update_weights(network, pattern, l_rate)

        epoch_rms_error = np.sqrt(sum_error/len(train))
        rms_error.append(epoch_rms_error)

        print('>epoch=%d, lrate=%.3f, error=%.6f' % (epoch, l_rate, epoch_rms_error))

    plt.figure()
    plt.plot(range(n_epoch), pattern_error)
    plt.xlabel('epoch')
    plt.ylabel('error')
    plt.title('error for pattern: '+trainlabel[idx] + '/' + traintargetlabel[idx])
    plt.show()
    return rms_error[-1]

            

### Preparing the data

One of the main points of the past tense simulation is generalisability, or the realisation that as human speakers of English we can easily create a past tense form for a new word, like wug --> wugged. As discussed in the lecture we're going to use a training set and a separate test set, so that we can train the model on some data but test with unseen cases how well the model generalises.

The following code will create eight variables:<br>
`trainInput` (numeric input to neural network for training)<br>
`trainInputLabels` (transcriptions of input to neural network / training)<br>
`trainTarget` (numeric target output for neural network/ training)<br>
`trainTargetLabels` (transcriptions of target output / training)<br>
`testInput` (numeric input to neural network for testing)<br>
`testInputLabels` (transcriptions of input for testing)<br>
`testTarget` (numeric target outputs for testing)<br>
`testTargetLabels` (transcriptions of target for testing)<br>

In [None]:
#CELL2


# Training set: numeric patterns for the network ...
trainInput = load_csv_numeric('PastTenseData/input.csv')
trainTarget = load_csv_numeric('PastTenseData/target.csv')
# ... and the transcriptions that go with them
trainInputLabels = load_csv_str('PastTenseData/inputlabels.csv')
trainTargetLabels = load_csv_str('PastTenseData/targetlabels.csv')

# Test set: same layout, kept apart from training
testInput = load_csv_numeric('PastTenseData/testInput.csv')
testTarget = load_csv_numeric('PastTenseData/testTarget.csv')
testInputLabels = load_csv_str('PastTenseData/testInputLabels.csv')
testTargetLabels = load_csv_str('PastTenseData/testTargetLabels.csv')

### Inspect the data

Run the following cell to display, for all the loaded variables, their size. Make sure you understand why they have this size.

In [None]:
#CELL 3

# For each data set: number of patterns, then length of one pattern
print('trainInput size:%d' % len(trainInput))
print('pattern length:%d' % len(trainInput[0]))
print('--')
print('trainTarget size:%d' % len(trainTarget))
print('pattern length:%d' % len(trainTarget[0]))
print('--')
print('testInput size:%d' % len(testInput))
print('pattern length:%d' % len(testInput[0]))
print('--')
print('testTarget size:%d' % len(testTarget))
print('pattern length:%d' % len(testTarget[0]))


As you probably guessed, we have a training set of 450 input patterns with corresponding targets, and a test set of 50 input patterns with corresponding targets. The test patterns are not going to be included in training, so that we can use them to test generalisation.

The training patterns come in four categories, for convenience referred to as W,X,Y,Z. <br>
W: vowel-change verbs (e.g. sing/sang)<br>
X: no-change verbs (e.g. hit/hit)<br>
Y: regular verbs with voiced suffix (e.g. smile/smiled)<br>
Z: regular verbs with voiceless suffix (e.g. kiss/kissed)<br>
The following cell displays two patterns and their targets per category. Make sure you understand how the stem/suffix are represented. 
**Tip:** You may have to resize the window to display the patterns neatly.

In [None]:
#CELL4 

# For each category, show two example patterns: the numeric input, the
# numeric target, and the transcription pair "stem-past tense".
print('W: vowelchange')
print(trainInput[7])
print(trainTarget[7])
print(trainInputLabels[7] + '-'+trainTargetLabels[7])
print('--')
print(trainInput[39])
print(trainTarget[39])
print(trainInputLabels[39] + '-'+trainTargetLabels[39])
print('%%%%%%%%%%%%')
print('X: no change')
print(trainInput[10])
print(trainTarget[10])
print(trainInputLabels[10] + '-'+trainTargetLabels[10])
print('--')
print(trainInput[211])
print(trainTarget[211])
print(trainInputLabels[211] + '-'+trainTargetLabels[211])
print('%%%%%%%%%%%%')
print('Y: voiced -ed')
print(trainInput[2])
print(trainTarget[2])
print(trainInputLabels[2] + '-'+trainTargetLabels[2])
print('--')
print(trainInput[4])
print(trainTarget[4])
print(trainInputLabels[4] + '-'+trainTargetLabels[4])
print('%%%%%%%%%%%%')
# Voiceless-suffix verbs are category Z (Y is the voiced-suffix category)
print('Z: voiceless -ed')
print(trainInput[0])
print(trainTarget[0])
print(trainInputLabels[0] + '-'+trainTargetLabels[0])
print('--')
print(trainInput[5])
print(trainTarget[5])
print(trainInputLabels[5] + '-'+trainTargetLabels[5])

We're going to set up a network with 20 hidden units that we'll use to train to associate the stems from the training set with their corresponding past tense form. <br>
**Task 1:** How many input and output units are we going to need? Try to answer before running the next cell.

In [None]:
#CELL 5

# Train on the training set, then measure the error on the unseen test set
this_network, trainerror, testerror = network(
    trainInput,
    trainTarget,
    testInput,
    testTarget,
    l_rate=.1,
    n_epoch=100,
    n_hidden=20,
    plot_error=True,
)
print('final training error:' + str(round(trainerror, 2)))
print('final test error:' + str(round(testerror, 2)))

**Task 2:** Did this network train successfully? How do you know?<br>
**Task 3:** In CELL5, change the learning rate, number of epochs and number of hidden units repeatedly. Try a few different combinations. Don't forget to write down the final training/testing error. Can you improve the performance beyond your initial run? (You may want to continue your search for optimal parameters at home so that you can get through more tasks in class.) <br>

***
Run the network in CELL5 with the following parameters:<br>
`l_rate`=0.1<br>
`n_epoch`=100<br>
`n_hidden`=20<br>


**Task 4:** Describe the training progress. In terms of training error, what would be the optimal time to stop training? Run the network again, with that number of epochs. Was this also good for the test error?

So far, we have only talked about quality in terms of error, but does the network actually get the answers right? Let's inspect individual patterns.
**Run CELL 6 to see the target and actual output for one item in the test set (the pattern at index `idx`, initially 6).**

In [None]:
#CELL 6
# 0-based index of the test pattern to display
idx = 6
output_pattern, error_pattern = test_single_pattern(
    this_network, testInput, testTarget, testInputLabels, testTargetLabels, idx
)

Now, repeat CELL6 but change the index `idx` (you may have to scroll to the right to see how it is used) to see the performance of the network for other patterns. Try at least one pattern from each category W,X,Y,Z (hint: you can find patterns on the basis of their categories by looking at the file `testTargetLabels.csv` in Excel, but remember that Python starts counting at 0, so for the pattern displayed in row 5 in Excel, change idx in CELL6 to 4.)

**Task 5:** How is the network doing, particularly on the final two output units, which correspond to the category? How is it performing for vowel-change patterns on units 6-11, which correspond to the vowel in the stem?

***
You're probably finding it difficult to answer Task 5 in a quantitative way from just inspecting individual patterns. How much better would it be if we could look at each category's learning progress separately over time? Voil&#224;, this is what we do in CELL7. **Run CELL7 now**

In [None]:
#CELL7

# Category label (W, X, Y or Z) of every training / test pattern
train_categories = load_csv_str('PastTenseData/trainTargetCategories.csv')
test_categories = load_csv_str('PastTenseData/testTargetCategories.csv')

# Train a fresh network and plot the error separately for each category
this_network, trainerror = network_train_only(
    trainInput,
    trainTarget,
    l_rate=.1,
    n_epoch=20,
    n_hidden=20,
    plot_error=True,
    cat_vector=train_categories,
)

Run CELL 7 a few times to watch what happens consistently and what is specific to each simulation (remember: they are initialised with random weights).<br> 
**Task 6:** Can you explain why the different categories have different trajectories (i.e. why the error decreases faster for some categories than for others)?

### U-shaped curves

So far, we've seen the error decreasing, mostly in a monotonic way. Where are the U-shaped error curves that we said were so important for Past Tense simulations? <br>
Firstly, remember Plunkett \& Marchman's point about macro- vs. micro-U-shapes. They said that actually the idea that there was a U-shape when averaging across all verbs was a bit misleading: that would occur if children made mistakes with all (or most) verbs simultaneously. Instead, what they found in acquisition diaries was that children would go through phases of over-regularising individual words, but not necessarily others (simultaneously). If that was the case in the neural networks, then we should only see a U-shape (decreasing error, then increasing error) for individual words, not when averaging. <br>
So, let's plot the learning curves for individual verbs!<br>
**Run CELL8 now**

In [None]:
#CELL 8

# 0-based index of the training pattern whose error is tracked
idx = 7
this_network, trainerror = network_log_pattern(
    trainInput,
    trainTarget,
    trainInputLabels,
    trainTargetLabels,
    l_rate=.1,
    n_epoch=20,
    n_hidden=20,
    idx=idx,
)




**Task 7:** Find a few different words for each category from the Excel sheet `targetlabels.csv` and substitute their index no. (remember: Python starts counting at 0) in CELL8 to observe the learning progress for regular (Y,Z) vs. irregular (W,X) verbs. Can you find evidence for micro-U-shapes?

***
In order to evaluate the network's performance systematically, one would have to search the parameter space systematically, i.e. run lots of simulations (systematically varying learning rate, number of hidden units, number of epochs, and run each combination multiple times to be able to calculate an average trajectory for different random starting points) and observe a number of verbs from each class. This is beyond what we can do in class, but feel free to play with the network for longer in your own time!

***
Well done, you've now got quite a bit of experience in modelling. Challenge yourself in the future and try to imagine how one could build a model for the experiments you encounter in your reading!

----
Workshop developed by Nadja Althaus
Code adapted from https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/