# Predicting the Countries from Capitals

### 1. Importing packages

In [1]:
import pickle
import numpy as np
import pandas as pd
import nltk
from gensim.models import KeyedVectors

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mahmoud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 2. Loading Google News embeddings

In [3]:
embeddings = KeyedVectors.load_word2vec_format('./src/GoogleNews-vectors-negative300.bin', binary = True)

**Defining a function to retrieve desired word embeddings**

**Inputs** :  
*embeddings*  
*set_words*: a set of desired words

**Outputs** :  
*word_embeddings*: a dictionary containing the words and their vectors

In [4]:
def get_word_embeddings(embeddings, set_words):

    word_embeddings = {}
    for word in embeddings.vocab:
        if word in set_words:
            word_embeddings[word] = embeddings[word]
    
    return word_embeddings

### 3. Loading the data

Laoding the word embeddings as a Python dictionary. Each of the word embedding is a 300-dimensional vector.

In [5]:
f = open('capitals.txt', 'r').read()
set_words = set(nltk.word_tokenize(f))
word_embeddings = get_word_embeddings(embeddings,set_words)

**Dumping the data to a pickle file**  
(There is no need to do this!)

In [6]:
pickle.dump(word_embeddings, open( "word_embeddings_capitals.p", "wb" ) )
word_embeddings = pickle.load(open("word_embeddings_capitals.p", "rb"))
len(word_embeddings)

230

### 4. Predicting relationships among words

**Defining cosine similarity function**

**Inputs** :  
*A*: a numpy array which corresponds to a word vector  
*B*: a numpy array which corresponds to a word vector

**Outputs** :  
*cos*: numerical number representing the cosine similarity between A and B

In [7]:
def cosine_similarity(A, B):
    
    dot = np.dot(A,B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B) 
    cos = dot / (norma*normb)

    return cos

**Defining euclidean distance function**

**Inputs** :  
*A*: a numpy array which corresponds to a word vector  
*B*: a numpy array which corresponds to a word vector

**Outputs** :  
*d*: numerical number representing the Euclidean distance between A and B

In [8]:
def euclidean(A, B):

    d = np.linalg.norm(A-B)

    return d

**Defining get country function**

**Inputs** :  
*city1*: a string (the capital city of country1)  
*country1*: a string (the country of capital1)  
*city2*: a string (the capital city of country2)  
*embeddings*: a dictionary where the keys are words and values are their embeddings  
*method*: method to use in the model (either cosine similarity "cos" or euclidean distance "d")

**Outputs** :  
*countries*: a dictionary with the most likely country and its similarity score

In [9]:
def get_country(city1, country1, city2, embeddings, method = "cos"):
    
    group = set([city1, country1, city2])
    city1_emb = embeddings[city1]
    country1_emb = embeddings[country1]
    city2_emb = embeddings[city2]

    vec = country1_emb - city1_emb + city2_emb

    country = ''

    if method == "cos":
        similarity = -1
        for word in embeddings.keys():
            if word not in group:
                word_emb = embeddings[word]
                cur_similarity = cosine_similarity(vec, word_emb)
                if cur_similarity > similarity:
                    similarity = cur_similarity
                    country = (word, similarity)
    
    if method == "d":
        distance = 10000
        for word in embeddings.keys():
            if word not in group:
                word_emb = embeddings[word]       
                cur_distance = euclidean(vec, word_emb)
                if cur_distance < distance:
                    distance = cur_distance
                    country = (word, distance)

    return country

**Testing the model**

In [10]:
predict_cos = get_country('Athens', 'Greece', 'Cairo', word_embeddings, "cos")
predict_d = get_country('Athens', 'Greece', 'Cairo', word_embeddings, "d")

print(predict_cos)
print(predict_d)

('Egypt', 0.7626821)
('Egypt', 2.4787965)


### 5. Model accuracy

**Loading the data**

In [11]:
data = pd.read_csv('capitals.txt', delimiter=' ')
data.columns = ['city1', 'country1', 'city2', 'country2']

data.head()

Unnamed: 0,city1,country1,city2,country2
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


**Defining accuracy function**

**Inputs** :  
*word_embeddings*: a dictionary where the key is a word and the value is its embedding  
*data*: a pandas dataframe containing all the country and capital city pairs  
*method*: method to use in the model (either cosine similarity "cos" or euclidean distance "d")

**Outputs** :  
*accuracy*: the accuracy of the model

In [12]:
def get_accuracy(word_embeddings, data, method = "cos"):

    num_correct = 0

    for i, row in data.iterrows():
        city1 = data['city1'][i]
        country1 = data['country1'][i]
        city2 =  data['city2'][i]
        country2 = data['country2'][i]

        predicted_country2, _ = get_country(city1, country1, city2, word_embeddings, method)

        if predicted_country2 == country2:
            num_correct += 1

    m = len(data)

    accuracy = num_correct / m

    return accuracy

**Calculating accuracy**

In [13]:
accuracy1 = get_accuracy(word_embeddings, data, "cos")
accuracy2 = get_accuracy(word_embeddings, data, "d")
print(f"Accuracy with cosine similarity method is {accuracy1:.2f}")
print(f"Accuracy with euclidean distance method is {accuracy2:.2f}")

Accuracy with cosine similarity method is 0.92
Accuracy with euclidean distance method is 0.91
