# Global parameters
# Set these before running the notebook.

In [None]:
# Zip archive holding the GloVe vectors; opened with zipfile.ZipFile below.
glove_path = "glove/glove.42B.300d.zip"
# Name of the text member inside that archive that is read line by line.
glove_file = "glove.42B.300d.txt"

# Directory prefix for the ratings CSV files; read_grand_data concatenates it
# directly with the file name, so the trailing slash matters.
grandratings_dir = "Grand_etal_csv/"
# Excel file with the seed words, loaded via pd.read_excel.
# NOTE(review): "SOMEPATH" looks like a placeholder to be replaced -- confirm.
grandfeatures_path = "SOMEPATH/features.xlsx"

In [None]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
# Use the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt



# Presumably the embedding dimensionality (matches the "300d" in the GloVe
# file name); not referenced in the cells visible here -- TODO confirm usage.
feature_dim = 300

# Maps each word (str) to its embedding as a float32 numpy array.
word_vectors = { }

# Read the vectors straight out of the zip archive without extracting it.
with zipfile.ZipFile(glove_path) as azip:
    with azip.open(glove_file) as f:
        for line in f:
            # The zip member yields bytes lines; split on whitespace.
            values = line.split()
            # First token is the word (decoded with the default codec) ...
            word = values[0].decode()
            # ... and the remaining tokens are the vector components.
            vector = np.array(values[1:], dtype=np.float32)
            word_vectors[word] = vector

# Table queried by read_grand_data via its "Dimension" column to obtain the
# seed words of a feature.
grandfeatures_df = pd.read_excel(grandfeatures_path)

# reading in Grand data
def read_grand_data(filename, grandratings_dir, grandfeatures_df):
    """Load one ratings file and the seed words of its feature.

    Parameters
    ----------
    filename : str
        Name of a ratings CSV of the form ``<category>_<feature>.csv``.
    grandratings_dir : str
        Directory containing the ratings files, with or without a trailing
        path separator.
    grandfeatures_df : pandas.DataFrame
        Table with a ``Dimension`` column. The remaining columns of the row
        whose ``Dimension`` equals the feature hold the seed words: the first
        three are returned as positive, all following ones as negative.

    Returns
    -------
    tuple
        ``(grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)``
        where ``df`` is the ratings table extended by an ``Average`` column
        (mean rating per row) and a ``Gold`` column (z-score of ``Average``).

    Raises
    ------
    ValueError
        If the file name without extension does not consist of exactly two
        underscore-separated parts.
    """
    # extract category and feature from "<category>_<feature>.<ext>"
    stem = os.path.splitext(filename)[0]
    grandcategory, grandfeature = stem.split("_")

    # read human ratings; every column after the first is one speaker
    df = pd.read_csv(os.path.join(grandratings_dir, filename))
    nspeakers = len(df.columns) - 1

    # Average over *all* speaker columns. The sum must cover the same columns
    # that nspeakers counts, otherwise files with more than 25 speakers get
    # an average that is too small.
    df["Average"] = df.iloc[:, 1:].sum(axis=1) / nspeakers
    # z-scores of average ratings
    df["Gold"] = (df["Average"] - df["Average"].mean()) / df["Average"].std()

    # obtain seed words from the features table
    relevant_row = grandfeatures_df[grandfeatures_df.Dimension == grandfeature]
    seedwords = relevant_row.iloc[:, 1:].values.flatten().tolist()
    pos_seedwords = seedwords[:3]
    neg_seedwords = seedwords[3:]

    return (grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)

In [2]:
# Raw listing, one "<name> : <number>" entry per line.
namestr = """north_dakota : 0
rhode-island : 0
johannesburg : 4
seoul : 4
south-carolina : 0
baghdad : 9
west-virginia : 0
jaime : 5
lahore : 2
south-dakota : 0
karachi : 7
nairobi : 6
north-carolina : 0
los-angeles : 0
new-hampshire : 0
tehran : 4
new-jersey : 0
san-francisco : 0
taipei : 5
new-mexico : 0
hong-kong : 5"""
# Keep only the part before the " : " separator of every line.
citynames = [entry.partition(" : ")[0] for entry in namestr.split("\n")]
citynames

['north_dakota',
 'rhode-island',
 'johannesburg',
 'seoul',
 'south-carolina',
 'baghdad',
 'west-virginia',
 'jaime',
 'lahore',
 'south-dakota',
 'karachi',
 'nairobi',
 'north-carolina',
 'los-angeles',
 'new-hampshire',
 'tehran',
 'new-jersey',
 'san-francisco',
 'taipei',
 'new-mexico',
 'hong-kong']

In [3]:
# Report every name in citynames that is not a key of word_vectors.
for n in citynames:
    if n in word_vectors:
        continue
    print(n, "has no entry")

In [5]:
# Display the vector stored under the key "new-hampshire".
word_vectors["new-hampshire"]

array([ 6.6691e-01, -1.0600e+00, -1.2023e-01,  1.0052e-01, -4.1489e-01,
       -7.9596e-01,  4.9310e-01, -3.5292e-01, -1.4101e-01,  1.2679e-01,
       -7.3328e-01,  7.5202e-01, -5.8481e-01, -2.8259e-01, -8.1580e-02,
       -2.2716e-02,  1.2503e-01, -9.2844e-02, -2.1066e-01,  6.7123e-02,
        1.6662e-01,  1.4692e-01, -8.6194e-02, -4.7833e-01, -1.1225e-01,
        2.7158e-01, -2.1967e-01,  1.2433e-01,  5.6243e-01, -3.2761e-01,
       -4.7829e-01, -1.2528e-01, -6.6647e-01,  1.9598e-01,  1.7504e-01,
        9.6750e-01, -4.8978e-01,  3.8909e-01, -9.7805e-02, -1.2054e-01,
        4.3813e-01,  7.2000e-01, -4.2166e-01,  4.8310e-02,  6.8112e-01,
       -4.7780e-01, -1.7747e-01,  7.5757e-01,  5.3816e-01,  8.8436e-01,
        2.0889e-02, -4.6063e-01, -1.0409e-02,  1.7994e-01,  2.3012e-01,
        8.4747e-01, -2.6780e-01,  1.5260e-01, -3.8625e-03, -6.1371e-01,
        1.2008e-01, -5.2610e-01, -4.3554e-01, -6.5643e-01,  1.2445e+00,
        6.2660e-01,  1.0492e-01, -1.3906e-01, -4.7864e-01,  2.36