In [None]:
%matplotlib inline

# FOR TRANSLATED VERSION
import os
import sys
import nltk
import sklearn
import csv
import re
import collections
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import json
import numpy as np


# Make sure the NLTK stop-word corpus is available locally: look it up first
# and download it only when the lookup raises LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# sklearn modules for data processing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# sklearn modules for LSA
from sklearn.decomposition import TruncatedSVD

# sklearn modules for classification
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# sklearn modules for clustering
from sklearn.cluster import KMeans

from textblob import TextBlob
from googletrans import Translator

from emoji import UNICODE_EMOJI
# from translate import Translator


def process_document(text):
    """Lower-case, tokenize and stop-word-filter a single document.

    Args:
        text: Raw document text (for example one tweet).

    Returns:
        A list of lower-cased tokens, in their original order, with the
        NLTK Spanish stop words removed.
    """
    text = text.lower()

    # NOTE(review): the third alternative (" ?:[^0-9a-zA-Z\s] ") looks like a
    # mangled "(?:[^0-9a-zA-Z\s])". As written it only matches an optional
    # space, a literal colon, one character that is neither alphanumeric nor
    # whitespace, and a trailing space. The pattern is kept unchanged so the
    # current tokenization is preserved -- confirm the intent before fixing.
    tokenizer = RegexpTokenizer(r'(?:\w+)|(?:😤)| ?:[^0-9a-zA-Z\s] ')
    tokenized = tokenizer.tokenize(text)

    # A set gives O(1) membership tests; the stop words used to be kept in a
    # list that was scanned once per token.
    stop_words = set(stopwords.words('spanish'))

    return [word for word in tokenized if word not in stop_words]

def read_data(data_dir):
    """Read the first column of a comma-separated file.

    Args:
        data_dir: Path of the CSV file to read (despite the name, this is
            passed straight to ``open`` and therefore a file path).

    Returns:
        A list holding the first field of every non-empty row, in file order.
        Undecodable bytes are dropped because the file is opened with
        ``errors='ignore'``.
    """
    # newline='' lets the csv reader handle line endings itself, which is
    # what the csv module requires for quoted fields containing newlines.
    with open(data_dir, errors='ignore', encoding='utf-8', newline='') as words_file:
        csv_reader = csv.reader(words_file, delimiter=',')
        # Blank lines are yielded as empty rows; indexing them with [0]
        # used to raise IndexError, so they are skipped here.
        return [row[0] for row in csv_reader if row]

    
def construct_vocab(corpus):
    """
        Build the word <-> id mappings for a tokenized corpus.

        Input: A list of list of string. Each string represents a word token.
        Output: A tuple of dicts: (vocab, inverse_vocab)
                vocab : A dict mapping str -> int. Ids are handed out in
                        order of first appearance, starting at 0.
                inverse_vocab: Inverse mapping int -> str
    """
    vocab = {}
    for sentence in corpus:
        for token in sentence:
            # len(vocab) is the next free id; setdefault leaves tokens that
            # already have an id untouched.
            vocab.setdefault(token, len(vocab))

    inverse_vocab = {token_id: token for token, token_id in vocab.items()}
    return (vocab, inverse_vocab)

def word_counts(corpus):
    """ Tally how often every token occurs across all sentences of a corpus
        (a list of list of string). Returns a collections.Counter mapping
        string token to integer count.
    """
    frequencies = collections.Counter()
    for sentence in corpus:
        for token in sentence:
            frequencies[token] += 1
    return frequencies

def trunc_vocab(corpus, counts, max_words=200, excluded=("https", "amp", "co")):
    """ Limit the vocabulary to the most frequent words and drop everything
        else from the corpus.

        Input:
            corpus: A list of list of string. Each string represents a word token.
            counts: Mapping of word -> frequency (e.g. from word_counts).
            max_words: How many of the most frequent words to keep. Defaults
                to 200, the value this function always used (the previous
                docstring said 10k, which did not match the code).
            excluded: Tokens that are always removed from the corpus even
                when they are among the most frequent words. The default
                tokens are presumably URL/HTML residue -- confirm.
        Output: A tuple (new_corpus, new_counts)
                new_corpus: The corpus (list of list of string) restricted to
                    the kept words; sentences may become empty lists.
                new_counts: Dict with the counts of the `max_words` most
                    frequent words. Note that excluded tokens are NOT
                    removed from this dict, only from new_corpus.
    """
    new_counts = dict(collections.Counter(counts).most_common(max_words))
    blocked = set(excluded)

    new_corpus = [
        [word for word in sentence if word in new_counts and word not in blocked]
        for sentence in corpus
    ]

    return new_corpus, new_counts

def word_vectors(corpus, vocab):
    """
        Build co-occurrence count vectors for every word in the vocabulary.

        Input: A corpus (list of list of string) and a vocab (word-to-id mapping)
        Output: A lookup table that maps [word id] -> [word vector], where
                entry j of a word's vector counts how often the word with
                id j appears within 4 positions of it in the same sentence.
    """
    size = len(vocab)

    # Start every word id off with an all-zero vector of vocabulary length.
    table = {word_id: size * [0] for word_id in vocab.values()}

    for sentence in corpus:
        n_tokens = len(sentence)
        for center, token in enumerate(sentence):
            # Clamp the +/-4 context window to the sentence boundaries.
            first = max(center - 4, 0)
            last = min(center + 4, n_tokens - 1)
            for neighbor in range(first, last + 1):
                if neighbor == center:
                    continue
                table[vocab[token]][vocab[sentence[neighbor]]] += 1
    return table

# iris's function
def get_emoji_list(data):
    """Clean every tweet in *data* and return the ones that are not empty.

    Each entry has lines that start with an http(s) URL removed, is run
    through process_document, and is re-joined with single spaces.

    Args:
        data: Mutable sequence of tweet strings. As before, every entry is
            overwritten in place with its cleaned text (including entries
            that end up empty).

    Returns:
        A new list with the cleaned, non-empty tweets in their original
        order. The filtered list used to be built and then discarded, so the
        function always returned None.
    """
    cleaned = []
    for i, raw in enumerate(data):
        # With MULTILINE, '^' anchors at every line start, so only lines that
        # begin with the URL are stripped.
        tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', raw, flags=re.MULTILINE)
        text = ' '.join(process_document(tweet))
        data[i] = text
        if text:
            cleaned.append(text)
    return cleaned

def most_similar(lookup_table, wordvec, vocab):
    """ Helper function (optional).

        Rank every word id in the lookup table by the cosine distance between
        its vector and the given word vector, and return up to 100 ids in
        ascending distance order. The top-ranked entry is always skipped.
        `vocab` is accepted but not used.
    """
    distances = {
        word_id: pdist([vector, wordvec], 'cosine')
        for word_id, vector in lookup_table.items()
    }

    ranked = sorted(distances.items(), key=lambda entry: entry[1])

    # Drop rank 0, then keep the next 100 entries.
    return [word_id for word_id, _ in ranked[1:100 + 1]]

def get_wordvec_dictionary(lookup_table, wordvec, inverse_vocab):
    """ Helper function (optional).

        Computes the cosine distance between the given word vector and every
        vector in the lookup table, ranks the entries by ascending distance,
        skips the top-ranked one and returns a dictionary mapping the words
        of the next 39 entries (via inverse_vocab) to their distance.
    """
    ranked = sorted(
        (
            (word_id, pdist([vector, wordvec], 'cosine'))
            for word_id, vector in lookup_table.items()
        ),
        key=lambda entry: entry[1],
    )

    # pdist returns a one-element array for a pair of vectors; unwrap it.
    return {inverse_vocab[word_id]: dist[0] for word_id, dist in ranked[1:40]}

def plot_with_labels(low_dim_embs, labels):
    """Scatter-plot the embedding rows and annotate each point with its label.

    Row i of low_dim_embs is drawn for labels[i]; there must be at least as
    many rows as labels. The figure is created but not shown or saved here.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches

    # Placement shared by every annotation.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
    for row, text in enumerate(labels):
        px, py = low_dim_embs[row, :]
        plt.scatter(px, py)
        plt.annotate(text, xy=(px, py), **annotation_style)
        
def main(data_dir, output_path='for_graphs/angry_es.json'):
    """Run the co-occurrence pipeline on one CSV file and dump the result.

    Steps: read the tweets, tokenize them, keep only the most frequent
    words, build co-occurrence vectors, then print and save the words
    closest (by cosine distance) to the word with id 0.

    Args:
        data_dir: Path of the input CSV file (first column holds the tweets).
        output_path: Where the word -> distance JSON is written. Defaults to
            the previously hard-coded 'for_graphs/angry_es.json'; the parent
            directory is created if it does not exist.
    """
    tweets = read_data(data_dir)  # one raw tweet per entry

    # call translate here if necessary
    tokenized = [process_document(tweet) for tweet in tweets]

    counts = word_counts(tokenized)
    new_corpus, _ = trunc_vocab(tokenized, counts)

    vocab, inverse_vocab = construct_vocab(new_corpus)
    lookup_table = word_vectors(new_corpus, vocab)

    # Word id 0 is the first token of the truncated corpus, because
    # construct_vocab hands out ids in order of first appearance.
    dictionary_data = get_wordvec_dictionary(lookup_table, lookup_table[0], inverse_vocab)
    print(dictionary_data)

    # Opening the file used to fail when the output directory was missing.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'wt', encoding='utf-8') as out:
        json.dump(dictionary_data, out, indent=4, separators=(',', ': '))

    # Optional t-SNE visualisation. The pairwise distance matrix D was only
    # ever used here, so it is no longer computed on every run.
#     vectors = [lookup_table[wid] for wid in lookup_table]
#     D = squareform(pdist(vectors, 'cosine'))
#     tsne = TSNE(
#       perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
#     plot_only = 12
#     low_dim_embs = tsne.fit_transform(D[:plot_only, :])
#     labels = [inverse_vocab[i] for i in range(plot_only)]
#     plot_with_labels(low_dim_embs, labels)


# Entry point when this file is executed as a script. The input path is
# hard-coded below; no command-line arguments are read, so edit data_dir to
# process a different file.
# This may take a little bit of time to run.
if __name__ == '__main__':
    data_dir = 'data/angry_face_es.csv'
    main(data_dir)