# Tokenize reviews

#### Imports

In [None]:
import numpy as np

import numpy as np
import pandas as pd
import json

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import pickle 
import fugashi # pip install fugashi or pip install fugashi[unidic-lite]

#### Loading data

In [None]:
# Load the previously saved train and test data.
# NOTE(review): despite the ``.json`` extension, both files are opened in
# binary mode and read with ``pickle`` - presumably they were written with
# ``pickle.dump``; confirm against the notebook that saved them.
# SECURITY: ``pickle.load`` can execute arbitrary code while unpickling, so
# only load these files if they come from a trusted source.
with open('../data/amazon_reviews/train/data_train.json', 'rb') as train_file:
    train_prepped_data = pickle.load(train_file)
with open('../data/amazon_reviews/test/data_test.json', 'rb') as file:
    test_prepped_data = pickle.load(file)

#### Functions

In [None]:
def sent_encode(sent):
    '''
    Helper function to turn a rating into a binary sentiment label
    ------
    Takes in
        'sent' - the rating; any value that float() can convert
                 (number or numeric string)
    Returns binary encoding
        1 = positive sentiment (rating strictly above 2.5)
        0 = negative sentiment (rating of 2.5 or below)
    '''
    rating = float(sent)
    # the comparison yields a bool; int() maps True/False to 1/0
    return int(rating > 2.5)

In [None]:
def clean(data, language):
    '''
    Function to tokenize the reviews and encode their sentiment
    -----
    Takes in
        'data' - nested dictionary; each value is a dictionary holding a
                 'rating' and, optionally, a 'text' and/or a 'summary'
        'language' - "english", "french" or "japanese"
    Returns two lists, aligned by position
        cleaned - X list; one list of lower-cased tokens per review
        ys - y list; sent_encode() of each review's rating
    For any other 'language' value nothing is tokenized and two empty
    lists are returned.
    '''
    # Pick the tokenizer once, up front, instead of re-testing the language
    # for every review.
    if language in ("english", "french"):
        def tokenize(text):
            # word_tokenize splits contractions, e.g. "gotta" -> "got", "ta"
            return word_tokenize(text, language=language)
    elif language == "japanese":
        # Built only for Japanese, so English/French runs do not require a
        # MeCab dictionary to be installed.
        tagger = fugashi.Tagger()

        def tokenize(text):
            return [word.surface for word in tagger(text)]
    else:
        return [], []

    cleaned = []
    ys = []
    for entry in data.values():
        # some data does not have a review text and/or a summary
        review = entry.get('text')
        summary = entry.get('summary')

        # combine summary and review (summary first), skipping missing parts;
        # yields '' when both are missing
        text = ' '.join(part for part in (summary, review) if part is not None)

        cleaned.append(tokenize(text.lower()))
        # encode sentiment
        ys.append(sent_encode(entry['rating']))

    return cleaned, ys

In [None]:
def get_vocab_corpus(dataset):
    '''
    Function computing vocabulary and corpus for a dataset
    -----
    Takes a cleaned dataset
        dataset - X list; one list of tokens per text
    Returns
        vocab - set of unique tokens in dataset
        corpus - list of strings; one re-assembled sentence per text
    '''
    # punctuation that is glued to the preceding token (no space before it)
    attached = ('.', '!', '?', ',', ';', ':')

    vocab = set()
    corpus = []
    for tokens in dataset:
        pieces = []
        for tok in tokens:
            vocab.add(tok)
            pieces.append(tok if tok in attached else ' ' + tok)
        # every non-punctuation token carries a leading space; drop the
        # one(s) at the very start of the sentence
        corpus.append(''.join(pieces).lstrip())
    return vocab, corpus

#### Filtering for the first 50,000 instances

In [None]:
# language codes; the same codes are used as keys into the loaded training data
languages = ['en', 'jp', 'fr']

In [None]:
# Keep only the first 50,000 reviews (in dictionary order) of each language.
# The generator is evaluated in the order en, fr, jp and unpacked into the
# three per-language dictionaries.
eng50000, frn50000, jap50000 = (
    dict(list(train_prepped_data[code].items())[:50000])
    for code in ('en', 'fr', 'jp')
)

In [None]:
# Tokenize each 50,000-review subset and encode its ratings as 0/1 labels.
# english
train_en, y_train_en = clean(eng50000, language="english")
# french
train_fr, y_train_fr = clean(frn50000, language="french")
# japanese
train_jp, y_train_jp = clean(jap50000, language="japanese")

#### Overview of positive/negative

In [None]:
# Class balance of the English training labels (1 = positive, 0 = negative)
positive = sum(1 for label in y_train_en if label == 1)
negative = sum(1 for label in y_train_en if label == 0)

print("Number of positive reviews (English): ", positive)
print("Number of negative reviews (English): ", negative)

In [None]:
# Class balance of the French training labels (1 = positive, 0 = negative)
positive = sum(1 for label in y_train_fr if label == 1)
negative = sum(1 for label in y_train_fr if label == 0)

print("Number of positive reviews (French): ", positive)
print("Number of negative reviews (French): ", negative)

In [None]:
# Class balance of the Japanese training labels (1 = positive, 0 = negative)
positive = sum(1 for label in y_train_jp if label == 1)
negative = sum(1 for label in y_train_jp if label == 0)

print("Number of positive reviews (Japanese): ", positive)
print("Number of negative reviews (Japanese): ", negative)

#### Vocab & Corpus

In [None]:
# Build the vocabulary (unique tokens) and corpus (re-joined sentences) of
# each language's tokenized training set.
train_vocabulary_en, train_corpus_en = get_vocab_corpus(train_en)
train_vocabulary_fr, train_corpus_fr = get_vocab_corpus(train_fr)
train_vocabulary_jp, train_corpus_jp = get_vocab_corpus(train_jp)

# Report the sizes, one line per language, using one shared template.
_summary_template = "Vocab size of {} is: {} and Corpus size is: {}"
for _label, _vocab, _corpus in (
    ('English', train_vocabulary_en, train_corpus_en),
    ('French', train_vocabulary_fr, train_corpus_fr),
    ('Japanese', train_vocabulary_jp, train_corpus_jp),
):
    print(_summary_template.format(_label, len(_vocab), len(_corpus)))