In [59]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from gensim.models import Word2Vec, FastText
import pandas as pd
import re

import numpy as np

In [60]:
# Read the file and generate list of sentences
def read_file_to_list_of_senteces(filename):
    sentences = []
    with open(filename, 'r') as f:
        for line in f:
            # Lower case all the words
            #print(line.split())
            sentence = [w.lower() for w in line.split()]
            #print(sentence)
            sentences.append(sentence)
    print("Total sentences read: " + str(len(sentences)))
    return sentences

#filename = "../data/dev_langs/Swedish.bible.txt"
#sentences = read_file_to_list_of_senteces(filename)

In [61]:
# generate word embeddings
def generate_word_embeddings(sentences):
    w2v = Word2Vec(sentences, min_count=1, size = 5)
    return w2v

#w2v = generate_word_embeddings(sentences)

In [62]:
#words = list(w2v.wv.vocab)
#words

In [63]:
#X = w2v[w2v.wv.vocab]

In [64]:
#n= len(words)
#n

In [65]:
def generate_clustering(n_clusters, word_embeddings):
    kmeans_clustering = KMeans(n_clusters=n_clusters, random_state=0).fit(word_embeddings)
    return kmeans_clustering

#kmeans_clustering = generate_clustering(200, X)

In [66]:
#centers = kmeans_clustering.cluster_centers_
#centers

In [69]:
def create_cluster_center_dict(center_indices, w2v_model):
    cluster_center_dicts= dict()

    for c in center_indices:    
        word_rep = w2v_model.most_similar([c], [], topn=1)[0][0]
        tuple_c = tuple(c)
        cluster_center_dicts[tuple_c] = word_rep
    return cluster_center_dicts
    
#cluster_center_dicts = create_cluster_center_dict(centers, w2v)

In [38]:
def create_paradigm_dict(kmeans_clustering, centers_dict, X):
    print(len(centers_dict))
    print(len(X))
    
    paradigm_dict = {k:[] for k in centers_dict.values()}
    
    print("Running predict on all training values")
    y_hats = kmeans_clustering.predict(X)
    print("Prediction complete")
    for x, y in zip(X, y_hats):
        center_embedding = centers[y]
        cluster_key = centers_dict[tuple(center_embedding)]
        
        x_word = w2v.most_similar([x], [], topn=1)[0][0]
        paradigm_dict[cluster_key].append(x_word)
    print("Finished creating paradigm dict")
    return paradigm_dict
        
    

In [43]:
def write_result_to_file(paradigm_dict, output_file):
    f = open(output_file, 'w')
    l_count = 0
    print(paradigm_dict.keys())
    for lemma in paradigm_dict.keys():
        f.write(lemma)
        f.write("\n")
        surface_forms = paradigm_dict[lemma]
        for sf in surface_forms:
            f.write(sf)
            f.write("\n")
        f.write("\n")
        f.write("\n")
        l_count +=1
    print(str(l_count) + "Lemmas written to file " + output_file)
    f.close()
        

In [49]:
output_file_name = "../output_files/swedish_cluster_out.txt"
paradigm_dict = create_paradigm_dict(kmeans_clustering, cluster_center_dicts, X)
write_result_to_file(paradigm_dict, output_file_name)

200
24207
Running predict on all training values
Prediction complete


  


Finished creating paradigm dict
dict_keys(['lågorna', 'kommer', 'svår', 'bergfästena', 'hedmarken', 'måste', 'oväntat', 'föresätta', 'tapenes', 'nu', 'eder', 'verkställa', 'gingo', 'uppenbarare', 'parningstiden', 'ditt', 'tillräknas', 'appfia', 'fyra', 'gott', 'skevas', 'basuner', 'törstiga', 'bestå', 'avkomlingar', 'utmark', 'högtidligt', '“', 'prästen', 'dessa', 'for', 'betel', 'vattenbäckar', 'herrens', 'oförtänkt', 'granatäpplen', 'härjar', 'morgonen', 'o', 'edra', 'överträdelsers', 'alnar', 'allsmäktige', 'hade', 'glupskhet', 'hermons', 'fram', 'förhasta', 'röja', 'övertäckte', 'oroligheterna', 'måtta', 'jordans', 'vem', 'skriftlärdes', 'bevise', 'främlingar', 'kungöra', 'runtomkring', 'tjugusjunde', 'draga', 'hundra', 'än', 'strutsarna', 'bot', 'kom', 'sängs', 'intalar', 'blygd', 'herre', 'missunnsamhet', 'tempelinvigningens', 'glädjas', '-', 'son', 'äta', 'skröpligheter', 'bortfrätas', 'höjder', 'har', 'gjort', 'hosaja', 'styggelse*.', 'juda', 'diamantgriffel', 'leva', 'tillsamm

In [48]:
from 2021Task2.evaluate.eval import eval
swedish_gold = "../data/dev_langs/Swedish.dev.gold"
eval(output_file_name, swedish_gold)

SyntaxError: invalid syntax (<ipython-input-48-de378bd589d8>, line 1)

In [56]:
def execute_kmeans_clustering_for_file(filename, num_clusters, output_file):
    sentences = read_file_to_list_of_senteces(filename)
    w2v = generate_word_embeddings(sentences)

    # Create list of word embeddings for all words in the vocab to fit the model
    X = w2v[w2v.wv.vocab]

    # Create a kmeans clustering
    kmeans_clustering = generate_clustering(num_clusters, X)
    centers = kmeans_clustering.cluster_centers_
    cluster_center_dicts = create_cluster_center_dict(centers, w2v)
    
    # Generate paradigm dict and write to file
    paradigm_dict = create_paradigm_dict(kmeans_clustering, cluster_center_dicts, X)
    write_result_to_file(paradigm_dict, output_file)


In [57]:
# Run KMeans on Swedish
filename = "../data/dev_langs/Swedish.bible.txt"
swedish_2000_output = "../output_files/Swedish_kmeans_cluster_2000.txt"

print("Running for 2000 clusters")
execute_kmeans_clustering_for_file(filename, 2000, swedish_2000_output)

swedish_4000_output = "../output_files/Swedish_kmeans_cluster_4000.txt"
print("Running for 4000 clusters")
execute_kmeans_clustering_for_file(filename, 4000, swedish_4000_output)

# Run Kmeans on Russian
print("Running for Russian")
russian_filename = "../data/dev_langs/Russian.bible.txt"

print("Running for 200 clusters")
russian_200_output = "../output_files/Russian_kmeans_cluster_200.txt"
execute_kmeans_clustering_for_file(russian_filename, 200, russian_200_output)

print("Running for 2000 clusters")
russian_2000_output = "../output_files/Russian_kmeans_cluster_2000.txt"
execute_kmeans_clustering_for_file(russian_filename, 2000, russian_2000_output)

print("Running for 4000 clusters")
russian_4000_output = "../output_files/Russian_kmeans_cluster_4000.txt"
execute_kmeans_clustering_for_file(russian_filename, 4000, russian_4000_output)



Running for 2000 clusters
Total sentences read: 43904


  


AttributeError: 'NoneType' object has no attribute 'cluster_centers_'