In [1]:
import pickle
import itertools
import numpy as np
from numpy.linalg import norm
from collections import defaultdict

### Load and Combine
Combine the final MandarinL1 data and the first half of the final EnglishL1 data.

In [2]:
with open('MandarinL1-Final.pk', 'rb') as f:
    mandarin_sentences = pickle.load(f)
    
with open('EnglishL1-Final.pk', 'rb') as f:
    english_sentences = pickle.load(f)

In [7]:
# because of memory I will take the last 8000 of Mandarin and first 2000 of English
# this step can be skipped, just like in the ther notebooks so that the full data can be used
# this will also be done for the prepositions
mandarin_sentences = mandarin_sentences[-8000:]
english_sentences = english_sentences[:2000]

In [10]:
# combine the data
sentences = mandarin_sentences + english_sentences

### Task 1
Create a list of all the words you require for the training data

In [11]:
sentences[:10]

[['fish', 'hit'],
 ['I'],
 ['teacher', 'I'],
 ['she', 'bed', 'above', 'lie', 'prone'],
 ['she', 'again', 'hit', 'one'],
 ['I', 'she', 'hold', 'come', 'forward'],
 ['medicine', 'thereupon', 'ne', 'come', 'she', 'drink'],
 ['I', 'immediately', 'just', 'medicine', 'for', 'paste', 'good'],
 ['just', 'object', 'marker', 'medicine', 'paste', 'good', 'le'],
 ['I', 'immediately', 'just', 'medicine', 'for', 'paste', 'good']]

In [12]:
sentences_list = list(set(itertools.chain.from_iterable(sentences)))

In [13]:
sentences_list[:10]

['wake',
 'paddy',
 'Momma',
 'tired',
 'guideline',
 'slow',
 'friend',
 'tail',
 'Hainan',
 'regulate']

### Task 2 

In [14]:
with open('200/model.txt', 'r') as f:
    embeddings = f.readlines()

In [15]:
del embeddings[0]

In [16]:
embeddings[:10]

['xxxxxxxx_NUM 0.32421735 0.22470464 0.23266383 0.14655912 0.11806348 -0.06410626 0.20250964 -0.20516078 0.14369978 -0.108834706 0.13119985 0.2534583 -0.11221443 -0.3785554 0.13265009 -0.14101486 0.10878353 -0.086542174 0.03862669 -0.030116748 0.042733043 0.20331825 -0.018981108 0.08647962 0.30958673 0.106602415 0.029345442 -0.047587678 -0.117385566 -0.18458849 0.3424346 0.029885596 -0.058254648 0.019049209 -0.1345081 0.015706059 -0.14402625 0.0099786455 -0.08844093 0.063482 -0.2617411 0.086905174 0.3017144 -0.043240033 0.1997004 -0.41079625 0.30102134 -0.15309426 0.093146205 0.349268 -0.12036861 0.4594687 -0.08849048 0.056133755 0.057099562 -0.15918098 0.2638425 0.002492292 0.12339682 -0.024775533 0.03233719 0.058110476 0.14753725 0.17034164 0.05005482 0.1473185 -0.17367446 -0.15945962 -0.18605642 -0.14878857 0.045894105 -0.060525108 0.098379865 -0.20237799 0.17518407 0.08312823 0.0145293195 0.065795586 0.3285697 -0.30160302 -0.23532303 0.14140567 -0.034800507 0.1764754 0.122810096 -0

In [17]:
word_embeddings_dict = dict()

In [18]:
for embedding in embeddings:
    for word in sentences_list:
        if embedding.startswith(word) and word not in word_embeddings_dict.keys():
            word_embeddings_dict[word] = list(map(float, embedding.split()[1:]))
        else:
            pass

### Task 3
For each training item, create a single vector by summing the vectors for all the words.
The vectors are summed by dimension, i.e., summing the 1st dimension of all words, 2nd
dimension, and so on. The resulting vector will be a 300 dimension vector like the vectors
for the words.

In [19]:
sentences_l2_embeddings = []

In [20]:
for sentence in sentences:
    sum_temp = np.array([0.0]*300)
    for word in sentence:
        if word in word_embeddings_dict.keys():
            sum_temp += np.array(word_embeddings_dict[word])
        else: pass
    sentences_l2_embeddings.append(list(sum_temp))

In [21]:
# save the embeddings list
with open('Mandarin-English-L2-Vector-Embeddings.pk', 'wb') as f:
    pickle.dump(sentences_l2_embeddings, f)

### Mandarin-English-L2-Model 

In [22]:
# load the embeddings and preposition lists
with open('Mandarin-English-L2-Vector-Embeddings.pk', 'rb') as f:
    l2_embeddings = pickle.load(f)

In [23]:
# now I'll do the same loading and combining for the prepositions

In [24]:
with open('MandarinL1-Prepositions.pk', 'rb') as f:
    mandarin_prepositions = pickle.load(f)

with open('EnglishL1-Prepositions.pk', 'rb') as f:
    english_prepositions = pickle.load(f)

In [25]:
mandarin_prepositions = mandarin_prepositions[-8000:]
english_prepositions = english_prepositions[:2000]

In [26]:
l2_prepositions = mandarin_prepositions + english_prepositions
l2_prepositions[10]

'for'

In [27]:
len(l2_embeddings), len(l2_prepositions)

(10000, 10000)

In [28]:
# using scikit-learn for creating a KNN model 
from sklearn.cluster import KMeans

In [29]:
L2_model = KMeans(n_clusters=len(set(l2_prepositions)))

In [31]:
L2_model.fit(l2_embeddings)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=49, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [32]:
cluster_labels = L2_model.labels_

In [33]:
preposition_cluster = defaultdict(set)

for i, c in enumerate(cluster_labels):
    preposition_cluster[english_prepositions[c]].add(c)
    
preposition_cluster

defaultdict(set,
            {'with': {6, 8, 9, 11, 40, 47},
             'of': {23, 24, 46},
             'on': {1, 10, 12, 13, 37, 38, 39, 42, 43, 44, 45, 48},
             'at': {7, 22, 26, 27, 36},
             'in': {2, 3, 5, 16, 17, 28, 32, 33, 34, 35},
             'up': {31},
             'to': {21, 25, 30, 41},
             'for': {4, 15, 18, 19, 29},
             'like': {14, 20},
             'out': {0}})

In [34]:
cluster_sentence = defaultdict(list)

for i, c in enumerate(cluster_labels):
    cluster_sentence[c].append(l2_embeddings[i])
    
cluster_sentence[15]

[[0.7386932100000001,
  0.75471664,
  0.377605316,
  0.21209043,
  -0.5059863339999999,
  -0.19626329399999998,
  -0.060419231000000004,
  0.20958633999999998,
  0.342755526,
  0.352222815,
  0.582045747,
  0.36035618,
  -0.293796775,
  -0.481166096,
  0.55592603,
  0.02395013,
  -0.45257189400000003,
  -0.288686723,
  -0.09053910479999999,
  -0.41592284,
  -0.83193717,
  0.12533837999999997,
  0.37523438,
  0.334562614,
  -0.01956301000000002,
  -0.30514425,
  0.85148978,
  0.55519433,
  -0.22338903000000002,
  0.278024442,
  0.291695955,
  -0.04229334,
  -0.5821151,
  -0.10657922,
  0.099059903,
  0.07126271,
  -0.20634040950000002,
  -0.015890682000000003,
  0.48527989,
  0.723821,
  -0.381199187,
  0.21742045000000002,
  0.21995848999999998,
  -0.338502028,
  0.35625074,
  -0.7078816,
  0.82128586,
  0.144461756,
  0.19675332,
  0.35665203999999995,
  -0.44633243,
  0.19017700599999998,
  0.099238227,
  -0.183526007,
  0.006646900000000011,
  -0.155614564,
  1.00595128,
  0.1914663

In [35]:
# save the model and cluster - sentence and preposition - cluster mappings
with open('MandarinEnglishL2-Model.pk', 'wb') as f:
    pickle.dump(L2_model, f)
    
with open('MandarinEnglishL2-CS.pk', 'wb') as f:
    pickle.dump(cluster_sentence, f)
    
with open('MandarinEnglishL2-PC.pk', 'wb') as f:
    pickle.dump(preposition_cluster, f)

# Testing data
Cosine similarity. Goal is to generate the cosine similarities of all training items and test sentence for a particular preposition, the get the top N

In [36]:
import pandas as pd
import pickle
import torch

In [37]:
test_data = pd.read_csv('testData.csv', encoding = "ISO-8859-1")
test_data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,QNumber,Choice,Preposition,Question,FullQ,Particpants,Correct,EnglishL1Model,MandarinL1Model
0,0,0,1,A,on,Now? you don't eat these all ___ once,Now? you don't eat these all on once,0.045455,0,0.27472,0.214543
1,1,1,1,B,at,Now? you don't eat these all ___ once,Now? you don't eat these all at once,0.625,1,0.276695,0.356372
2,2,2,1,C,like,Now? you don't eat these all ___ once,Now? you don't eat these all like once,0.193182,0,0.273671,0.214543
3,3,3,1,D,as,Now? you don't eat these all ___ once,Now? you don't eat these all as once,0.136364,0,0.174914,0.214543
4,4,4,2,A,with,So you're happy ___ your work?,So you're happy with your work?,0.761364,1,0.28031,0.25


In [38]:
test_prepositions = list(test_data['Preposition'])
test_prepositions[:5]

['on', 'at', 'like', 'as', 'with']

In [39]:
# load the preprocessed test data
with open('Test-Vector-Embeddings.pk', 'rb') as f:
    test_sentences_vectors = pickle.load(f)

In [40]:
with open('MandarinEnglishL2-CS.pk', 'rb') as f:
    cluster_sentence = pickle.load(f)
    
with open('MandarinEnglishL2-PC.pk', 'rb') as f:
    preposition_cluster = pickle.load(f)

In [41]:
# cosine similarity function
def cos(t, q):
    return np.dot(t, q) / (norm(t)*norm(q))

In [42]:
test_v = []
for i in test_sentences_vectors:
    test_v += [i]*4

test_p = test_prepositions

In [43]:
cosine_similarity_list = []

for prep in test_p:
    cosine_similarities_per_prep = [] # cosine similarity for every preposition
    prep_clusters = preposition_cluster[prep] # get the clusters per preposition
    if prep_clusters == set():
        cosine_similarity_list.append(0.0)
    else:
        for cluster in prep_clusters:
            training_items = cluster_sentence[cluster]# get the list of items/vectors for a cluster
            for item in training_items:
                cosine_similarity = cos(item, test_v[0]) # calculate the cosine similarity for item in the list of a cluster
                cosine_similarities_per_prep.append(cosine_similarity) # append the cos sim to cos sim per prep
        cosine_similarity_list.append(torch.topk(torch.tensor(cosine_similarities_per_prep), 1).values.item())
        # append the final gen of cos sim into this list. each gen should equal 
        # the number of training items for all clusters combined
        # I used the topk where N = 3

  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
cosine_similarity_list[:10]

[0.579353732630298,
 0.6205590748421804,
 0.5243926766593068,
 0.0,
 nan,
 0.0,
 0.5703650330000184,
 0.579353732630298,
 0.0,
 0.579353732630298]

In [45]:
# create the list for sum of scores
score_4_sums = []
for i in range(0, len(cosine_similarity_list), 4):
    score_4_sums += [sum(cosine_similarity_list[i:i+4])]*4

In [46]:
score_4_sums[:5]

[1.7243054841317853,
 1.7243054841317853,
 1.7243054841317853,
 1.7243054841317853,
 nan]

In [47]:
def normalize(score, score_4_sum):
    return (score + 1) / (score_4_sum + 4)

In [48]:
final_score = []

for score in range(len(cosine_similarity_list)):
    final_score.append(normalize(cosine_similarity_list[score], score_4_sums[score]))

In [49]:
final_score[:5]

[0.27590311820506225,
 0.28310143114033565,
 0.2663017689892757,
 0.17469368166532637,
 nan]

In [50]:
# testing if per sentence sum equals 1
sum(final_score[:4])

1.0

In [51]:
l2_model_scores = pd.Series(final_score)

In [52]:
test_data = test_data.merge(l2_model_scores.rename('MandarinEnglishL2Model'), left_index=True, right_index=True)

In [53]:
test_data.to_csv('testData.csv')