# Vector representation - MandarinL1-Vector + Model

In [6]:
import pickle
import itertools
import numpy as np
from numpy.linalg import norm
from collections import defaultdict

### Task 1
Create a list of all the words you require for the training data

In [2]:
with open('MandarinL1-Final.pk', 'rb') as f:
    sentences = pickle.load(f)

In [3]:
sentences[:10]

[['again', 'mother', 'de', 'speak', 'again'],
 ['tool', 'which', 'side', 'a1'],
 ['little', 'little', 'great', 'kid', 'door', 'open'],
 ['small', 'small', 'great', 'kid', 'door', 'open', 'open', 'go'],
 ['mother', 'get', 'off', 'back', 'family', 'work', 'le', 'one'],
 ['tool', 'which', 'ne'],
 ['originally', 'here'],
 ['originally', 'here', 'a1'],
 ['originally', 'here', 'a1'],
 ['this', 'apple', 'is', 'here', 'ma']]

In [4]:
sentences_list = list(set(itertools.chain.from_iterable(sentences)))

In [5]:
sentences_list[-10:]

['body',
 'incident',
 'regards',
 'tomato',
 'rock',
 'kings',
 'idea',
 'tape',
 'male',
 'lots']

### Task 2 

In [9]:
with open('200/model.txt', 'r') as f:
    embeddings = f.readlines()

In [10]:
del embeddings[0]

In [11]:
embeddings[:10]

['xxxxxxxx_NUM 0.32421735 0.22470464 0.23266383 0.14655912 0.11806348 -0.06410626 0.20250964 -0.20516078 0.14369978 -0.108834706 0.13119985 0.2534583 -0.11221443 -0.3785554 0.13265009 -0.14101486 0.10878353 -0.086542174 0.03862669 -0.030116748 0.042733043 0.20331825 -0.018981108 0.08647962 0.30958673 0.106602415 0.029345442 -0.047587678 -0.117385566 -0.18458849 0.3424346 0.029885596 -0.058254648 0.019049209 -0.1345081 0.015706059 -0.14402625 0.0099786455 -0.08844093 0.063482 -0.2617411 0.086905174 0.3017144 -0.043240033 0.1997004 -0.41079625 0.30102134 -0.15309426 0.093146205 0.349268 -0.12036861 0.4594687 -0.08849048 0.056133755 0.057099562 -0.15918098 0.2638425 0.002492292 0.12339682 -0.024775533 0.03233719 0.058110476 0.14753725 0.17034164 0.05005482 0.1473185 -0.17367446 -0.15945962 -0.18605642 -0.14878857 0.045894105 -0.060525108 0.098379865 -0.20237799 0.17518407 0.08312823 0.0145293195 0.065795586 0.3285697 -0.30160302 -0.23532303 0.14140567 -0.034800507 0.1764754 0.122810096 -0

In [12]:
word_embeddings_dict = dict()

In [13]:
for embedding in embeddings:
    for word in sentences_list:
        if embedding.startswith(word) and word not in word_embeddings_dict.keys():
            word_embeddings_dict[word] = list(map(float, embedding.split()[1:]))
        else:
            pass

### Task 3
For each training item, create a single vector by summing the vectors for all the words.
The vectors are summed by dimension, i.e., summing the 1st dimension of all words, 2nd
dimension, and so on. The resulting vector will be a 300 dimension vector like the vectors
for the words.

In [20]:
sentences_mandarin_embeddings = []

In [21]:
# using the first 10000 beacuse of memory constrains
for sentence in sentences[:10000]:
    sum_temp = np.array([0.0]*300)
    for word in sentence:
        if word in word_embeddings_dict.keys():
            sum_temp += np.array(word_embeddings_dict[word])
        else: pass
    sentences_mandarin_embeddings.append(list(sum_temp))

In [None]:
# save the embeddings list
with open('MandarinL1-Vector-Embeddings.pk', 'wb') as f:
    pickle.dump(sentences_mandarin_embeddings, f)

### Mandarin-L1-Model 

In [None]:
# load the embeddings and preposition lists
with open('MandarinL1-Vector-Embeddings.pk', 'rb') as f:
    mandarin_embeddings = pickle.load(f)

In [15]:
with open('MandarinL1-Prepositions.pk', 'rb') as f:
    mandarin_prepositions = pickle.load(f)

In [23]:
sentences_mandarin_embeddings[10]

[-0.08191656,
 0.34606254,
 -0.5510396,
 0.72513336,
 -0.27576572,
 -0.11681467,
 0.15062217,
 0.1777799,
 -0.38423017,
 0.040765524,
 0.40598375,
 -0.079609916,
 0.6699437,
 0.4292677,
 0.48038837,
 0.071910724,
 0.24380377,
 -0.48099446,
 -0.4353189,
 -0.034510512,
 -0.1657101,
 0.16970402,
 0.21134958,
 0.023116311,
 -1.1068813,
 -0.4697977,
 -0.020929523,
 -0.34902132,
 0.12834324,
 -0.075691506,
 0.682579,
 -0.3113258,
 0.13534261,
 -0.23971936,
 0.098731935,
 -0.4014684,
 -0.11307005,
 -0.045864284,
 0.14369439,
 0.37023395,
 0.23371376,
 -0.0854537,
 -0.13085958,
 -0.33539996,
 -0.4760007,
 -0.087404475,
 -0.46105677,
 -0.10524693,
 0.055732332,
 0.10183573,
 -0.333681,
 -0.101663426,
 0.17800234,
 0.9237124,
 0.23956947,
 -0.05447914,
 -0.3215452,
 -0.313669,
 -0.562636,
 -0.24103042,
 0.12404497,
 0.5984253,
 0.31462038,
 -0.14538799,
 -0.12509653,
 0.14908215,
 -0.3429332,
 0.11759935,
 -0.048157528,
 -0.0642322,
 -0.3987206,
 0.38841188,
 -0.45341203,
 -0.29616737,
 -0.13886

In [26]:
mandarin_prepositions = mandarin_prepositions[:10000]
mandarin_prepositions[10]

'for'

In [27]:
len(sentences_mandarin_embeddings), len(mandarin_prepositions)

(10000, 10000)

In [29]:
# using scikit-learn for creating a KNN model 
from sklearn.cluster import KMeans

In [30]:
MandarinL1_model = KMeans(n_clusters=len(set(mandarin_prepositions)))

In [31]:
MandarinL1_model.fit(sentences_mandarin_embeddings)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=34, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [32]:
# save the model
with open('MandarinL1-Model.pk', 'wb') as f:
    pickle.dump(MandarinL1_model, f)

In [33]:
cluster_labels = MandarinL1_model.labels_

In [34]:
# Try to identify which cluters represent which words and sentences

In [36]:
preposition_cluster = defaultdict(set)

In [37]:
for i, c in enumerate(cluster_labels):
    preposition_cluster[mandarin_prepositions[c]].add(c)

In [38]:
preposition_cluster

defaultdict(set,
            {'for': {10, 11, 12, 28, 29, 30, 31},
             'object marker': {0, 2, 3, 15, 16, 26, 32, 33},
             'at': {1, 5, 6, 7, 8, 9, 13, 14, 17, 21, 23, 24, 25, 27},
             'when': {18},
             'according to': {20, 22},
             'up until': {4},
             'go toward': {19}})

In [39]:
cluster_sentence = defaultdict(list)

In [40]:
for i, c in enumerate(cluster_labels):
    cluster_sentence[c].append(sentences_mandarin_embeddings[i])

In [41]:
cluster_sentence[15]

[[0.36906219,
  1.30302735,
  0.33961303600000003,
  -0.406480925,
  -0.5040601100000001,
  -0.563951295,
  0.42081528999999995,
  -0.546641256,
  -0.607357975,
  -0.14903509320000002,
  1.079297559,
  0.8068617199999999,
  0.16080056599999998,
  -0.004516542999999915,
  0.3152443286999999,
  -1.306021776,
  0.013957909999999935,
  -0.845706164,
  0.5041129849999999,
  -0.44775642000000004,
  1.13852179,
  -0.73553399,
  0.9095800600000001,
  1.2929196700000003,
  -1.19879623,
  0.43924802,
  0.31154473400000005,
  0.9136247989999999,
  -1.076582156,
  -0.686314766,
  0.26014893,
  -0.08852660399999995,
  -0.28021275999999995,
  0.19014459540000006,
  0.97437567,
  -0.77611577,
  1.04567433,
  0.8249694200000001,
  0.15302767999999994,
  1.28871162,
  -1.1784846660000001,
  0.198517347,
  -0.5091067624000001,
  -1.061387702,
  -0.52319884,
  -0.11080004600000001,
  0.13700908000000006,
  0.35960206000000006,
  0.014598974999999959,
  0.880266806,
  0.547276871,
  -0.058842689000000004,

In [42]:
# save cluster - sentence and preposition - cluster mappings    
with open('MandarinL1-CS.pk', 'wb') as f:
    pickle.dump(cluster_sentence, f)
    
with open('MandarinL1-PC.pk', 'wb') as f:
    pickle.dump(preposition_cluster, f)

# Testing data
Cosine similarity

In [7]:
import pandas as pd
import pickle
import torch

In [2]:
test_data = pd.read_csv('testData.csv', encoding = "ISO-8859-1")
test_data.head()

Unnamed: 0.1,Unnamed: 0,QNumber,Choice,Preposition,Question,FullQ,Particpants,Correct,EnglishL1Model
0,0,1,A,on,Now? you don't eat these all ___ once,Now? you don't eat these all on once,0.045455,0,0.27472
1,1,1,B,at,Now? you don't eat these all ___ once,Now? you don't eat these all at once,0.625,1,0.276695
2,2,1,C,like,Now? you don't eat these all ___ once,Now? you don't eat these all like once,0.193182,0,0.273671
3,3,1,D,as,Now? you don't eat these all ___ once,Now? you don't eat these all as once,0.136364,0,0.174914
4,4,2,A,with,So you're happy ___ your work?,So you're happy with your work?,0.761364,1,0.28031


In [3]:
test_prepositions = list(test_data['Preposition'])

In [4]:
test_prepositions[:5]

['on', 'at', 'like', 'as', 'with']

In [8]:
# load the preprocessed test data
with open('Test-Vector-Embeddings.pk', 'rb') as f:
    test_sentences_vectors = pickle.load(f)

In [9]:
with open('MandarinL1-CS.pk', 'rb') as f:
    cluster_sentence = pickle.load(f)
    
with open('MandarinL1-PC.pk', 'rb') as f:
    preposition_cluster = pickle.load(f)

In [10]:
# cosine similarity function
def cos(t, q):
    return np.dot(t, q) / (norm(t)*norm(q))

In [11]:
test_v = []
for i in test_sentences_vectors:
    test_v += [i]*4

test_p = test_prepositions

In [12]:
cosine_similarity_list = []

for prep in test_p:
    cosine_similarities_per_prep = [] # cosine similarity for every preposition
    prep_clusters = preposition_cluster[prep] # get the clusters per preposition
    if prep_clusters == set():
        cosine_similarity_list.append(0.0)
    else:
        for cluster in prep_clusters:
            training_items = cluster_sentence[cluster]# get the list of items/vectors for a cluster
            for item in training_items:
                cosine_similarity = cos(item, test_v[0]) # calculate the cosine similarity for item in the list of a cluster
                cosine_similarities_per_prep.append(cosine_similarity) # append the cos sim to cos sim per prep
        cosine_similarity_list.append(torch.topk(torch.tensor(cosine_similarities_per_prep), 1).values.item())
        # append the final gen of cos sim into this list. each gen should equal 
        # the number of training items for all clusters combined
        # I used the topk where N = 3

In [13]:
cosine_similarity_list[:10]

[0.0, 0.6610771640917705, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [14]:
# create the list for sum of scores
score_4_sums = []
for i in range(0, len(cosine_similarity_list), 4):
    score_4_sums += [sum(cosine_similarity_list[i:i+4])]*4

In [15]:
score_4_sums[:5]

[0.6610771640917705,
 0.6610771640917705,
 0.6610771640917705,
 0.6610771640917705,
 0.0]

In [16]:
def normalize(score, score_4_sum):
    return (score + 1) / (score_4_sum + 4)

In [17]:
final_score = []

for score in range(len(cosine_similarity_list)):
    final_score.append(normalize(cosine_similarity_list[score], score_4_sums[score]))

In [18]:
final_score[:5]

[0.21454268290253764,
 0.3563719512923872,
 0.21454268290253764,
 0.21454268290253764,
 0.25]

In [19]:
# testing if per sentence sum equals 1
sum(final_score[:4])

1.0

In [20]:
mandarin_model_scores = pd.Series(final_score)

In [22]:
test_data = test_data.merge(mandarin_model_scores.rename('MandarinL1Model'), left_index=True, right_index=True)

In [23]:
test_data.to_csv('testData.csv')