<a href="https://colab.research.google.com/github/mr-alamdari/NLP-Find-Capitals-For-Countries-Beginner/blob/main/NLP_Find_Capitals_For_Countries_Beginner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
!wget https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Classification%20and%20Vector%20Spaces/Week%203/capitals.txt

--2022-04-16 12:48:40--  https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Classification%20and%20Vector%20Spaces/Week%203/capitals.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 154424 (151K) [text/plain]
Saving to: ‘capitals.txt’


2022-04-16 12:48:40 (6.50 MB/s) - ‘capitals.txt’ saved [154424/154424]



In [6]:
data = pd.read_csv('capitals.txt', delimiter=' ')
data.columns = ['city1', 'country1', 'city2', 'country2']

In [7]:
data.head(10)

Unnamed: 0,city1,country1,city2,country2
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt
5,Athens,Greece,Canberra,Australia
6,Athens,Greece,Hanoi,Vietnam
7,Athens,Greece,Havana,Cuba
8,Athens,Greece,Helsinki,Finland
9,Athens,Greece,Islamabad,Pakistan


In [14]:
!wget https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Classification%20and%20Vector%20Spaces/Week%203/word_embeddings_subset.p

--2022-04-16 13:13:43--  https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Classification%20and%20Vector%20Spaces/Week%203/word_embeddings_subset.p
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 309156 (302K) [application/octet-stream]
Saving to: ‘word_embeddings_subset.p’


2022-04-16 13:13:44 (8.98 MB/s) - ‘word_embeddings_subset.p’ saved [309156/309156]



In [15]:
word_embeddings = pickle.load(open("word_embeddings_subset.p", "rb"))

In [20]:
len(word_embeddings), len(word_embeddings['village'])

(243, 300)

In [39]:
def cosine_similarity(a, b):
  #1
  return a.dot(b) / (np.sqrt(np.sum(a**2)) * np.sqrt(np.sum(b**2)))
  #2
  # return a.dot(b) / (np.sqrt(a.dot(a)) * np.sqrt(b.dot(b)))
  #3
  # return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [40]:
king = word_embeddings['king']
queen = word_embeddings['queen']

In [41]:
cosine_similarity(king, queen)

0.65109557

In [42]:
def euclidean_distance(a, b):
  return np.linalg.norm(a-b)

In [43]:
euclidean_distance(king, queen)

2.4796925

In [64]:
def get_country(city1, country1, city2, embeddings):
 
  city1_emb = word_embeddings.get(city1) 
  country1_emb = word_embeddings.get(country1)
  city2_emb = word_embeddings.get(city2) 

  vec = country1_emb - city1_emb + city2_emb
  best_similarity_rate = -1

  for word in embeddings.keys():
    
    if word not in [city1, country1, city2]:
      word_emb = embeddings[word]      
      similarity = cosine_similarity(vec, word_emb)

      if best_similarity_rate < similarity:
        best_similarity_rate = similarity
        country = (word, best_similarity_rate)

  return country

In [65]:
get_country('Athens', 'Greece', 'Cairo', word_embeddings)

('Egypt', 0.7626821)

In [77]:
def total_accuracy(data, word_embeddings):

  num_correct = 0
  
  for i, row in data.iterrows():
    city1, country1 = row['city1'], row['country1']
    
    city2, country2 = row['city2'], row['country2']

    predicted_country = get_country(city1, country1, city2, word_embeddings)
    
    if predicted_country[0] == country2:
      num_correct += 1 

  accuracy = num_correct / len(data)
  return accuracy, num_correct

In [78]:
total_accuracy(data, word_embeddings)

(0.9192082407594425, 4551)

In [79]:
def pca(x, num_compunents=2):

  x_mean = x - np.mean(x, axis=0)
  covariance_matrix = np.cov(x_mean, rowvar=False)
  eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix, UPLO='L')
  idx_sorted = np.argsort(eigen_values)
  idx_sorted_decreasing = idx_sorted[::-1]
  eigen_values_sorted = eigen_values[idx_sorted_decreasing]
  eigen_vectors_sorted = eigen_vectors[:, idx_sorted_decreasing]
  eigen_vectors_subset = eigen_vectors_sorted[:, 0:num_compunents]
  x_reduced = np.dot(eigen_vectors_subset.transpose(), x_mean.transpose()).transpose()

  return x_reduced

In [83]:
x = np.random.rand(3, 10)
pca(x, 2)

array([[-0.62128474, -0.17764614],
       [ 0.15244508,  0.61207238],
       [ 0.46883967, -0.43442625]])