# Tokenize reviews

#### Imports

In [1]:
data_save = False
import numpy as np
import pandas as pd
import json

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import pickle

import fugashi # pip install fugashi or pip install fugashi[unidic-lite]

#### Loading data

In [2]:
# Load the previously saved train and test review data.
# NOTE(review): the files carry a .json extension but are read with pickle,
# so they are presumably pickled objects rather than JSON text - confirm.
# pickle.load can execute arbitrary code: only load files you trust.
with open('../data/amazon_reviews/train/data_train.json', 'rb') as train_file:
    train_prepped_data = pickle.load(train_file)
with open('../data/amazon_reviews/test/data_test.json', 'rb') as test_file:
    test_prepped_data = pickle.load(test_file)

#### Functions

In [3]:
def sent_encode(sent):
    '''
    Helper function to turn a rating into a binary sentiment
    ------
    Takes in a rating
        'sent' - number or numeric string (anything float() accepts)
    Returns binary encoding
        1 = positive sentiment (rating strictly above 2.5)
        0 = negative sentiment (everything else)
    '''
    rating = float(sent)
    return 1 if rating > 2.5 else 0

In [4]:
def clean(data, language):
    '''
    Function to clean and tokenize the data
    -----
    Takes in a data set and the language to tokenize in
        'data' - nested dictionary; every value is a dictionary holding a
                 'rating' and, optionally, a 'text' and/or a 'summary'
        'language' - "english", "french" or "japanese"
    Returns two lists
        cleaned - X list; one list of lower-cased tokens per review
        ys - y list; binary sentiment per review (see sent_encode)
    Raises
        ValueError - if 'language' is not one of the supported languages
    '''
    # Choose the tokenizer once, before looping over the reviews.
    if language in ("english", "french"):
        def tokenize(text):
            # NLTK splits contractions, e.g. "gotta" becomes "got", "ta"
            return word_tokenize(text, language=language)
    elif language == "japanese":
        # Only built for Japanese: the other languages never use it, so
        # they should not pay for (or depend on) creating the tagger.
        tagger = fugashi.Tagger()

        def tokenize(text):
            return [word.surface for word in tagger(text)]
    else:
        # Previously an unknown language silently produced two empty lists.
        raise ValueError(
            "Unsupported language {!r}; expected 'english', 'french' "
            "or 'japanese'".format(language)
        )

    cleaned = []
    ys = []

    for idx in data:
        entry = data[idx]
        review = entry.get('text', None)  # some data does not have a review text
        summary = entry.get('summary', None)  # some data does not have a summary

        # combine summary and review, skipping whichever one is missing
        # (both missing gives an empty string)
        text = ' '.join(part for part in (summary, review) if part is not None)

        cleaned.append(tokenize(text.lower()))
        # encode sentiment
        ys.append(sent_encode(entry['rating']))

    return cleaned, ys

In [5]:
def get_vocab_corpus(dataset):
    '''
    Function computing vocabulary and corpus for a dataset
    -----
    Takes a cleaned dataset - list
        dataset - X list; one list of tokens per text
    Returns
        vocab - set of unique tokens in dataset
        corpus - list of strings; one re-joined string per text, in the
                 same order as dataset
    '''
    # Punctuation that is glued to the preceding token instead of being
    # separated from it by a space. Built once rather than per token.
    attached = {'.', '!', '?', ',', ';', ':'}

    vocab = set()
    corpus = []
    for text in dataset:  # for list in list of lists
        pieces = []
        for token in text:  # for token in list
            vocab.add(token)
            pieces.append(token if token in attached else ' ' + token)
        # join once instead of growing a string with +=; lstrip drops the
        # space that precedes the first token
        corpus.append(''.join(pieces).lstrip())
    return vocab, corpus

### Cleaning the datasets 

In [6]:
# Tokenize the training reviews per language; clean() returns the token
# lists together with the binary sentiment labels.
train_en, y_train_en = clean(train_prepped_data['en'], "english") # English tokens and labels
train_fr, y_train_fr = clean(train_prepped_data['fr'], "french") # French tokens and labels
train_jp, y_train_jp = clean(train_prepped_data['jp'], "japanese") # Japanese tokens and labels

KeyboardInterrupt: 

In [None]:
# Tokenize the test reviews per language; clean() returns the token
# lists together with the binary sentiment labels.
test_en, y_test_en = clean(test_prepped_data['en'], "english") # English tokens and labels
test_fr, y_test_fr = clean(test_prepped_data['fr'], "french") # French tokens and labels
test_jp, y_test_jp = clean(test_prepped_data['jp'], "japanese") # Japanese tokens and labels

#### Overview of positive/negative

In [None]:
def count_sent(y):
    '''
    Function counting positive and negative labels
    -----
    Takes in
        y - iterable of labels; 1 = positive, 0 = negative
    Returns
        (positive, negative) - number of labels equal to 1 and to 0;
        any other value is ignored
    '''
    labels = list(y)  # materialise so the labels can be scanned twice
    n_positive = sum(1 for label in labels if label == 1)
    n_negative = sum(1 for label in labels if label == 0)
    return n_positive, n_negative

In [None]:
# Class balance of the English training labels.
pos,neg = count_sent(y_train_en)

print("Number of positive reviews (English): ", pos)
print("Number of negative reviews (English): ", neg)

In [None]:
# Class balance of the French training labels.
pos,neg = count_sent(y_train_fr)

print("Number of positive reviews (French): ", pos)
print("Number of negative reviews (French): ", neg)

In [None]:
# Class balance of the Japanese training labels.
pos,neg = count_sent(y_train_jp)

print("Number of positive reviews (Japanese): ", pos)
print("Number of negative reviews (Japanese): ", neg)

#### Vocab & Corpus
Train

In [None]:
# Build the vocabulary (set of unique tokens) and corpus (one string per
# review) for each training language, then report their sizes.
train_vocabulary_en, train_corpus_en = get_vocab_corpus(train_en)
train_vocabulary_fr, train_corpus_fr = get_vocab_corpus(train_fr)
train_vocabulary_jp, train_corpus_jp = get_vocab_corpus(train_jp)

print("Vocab size of {} is: {} and Corpus size is: {}".format('English', len(train_vocabulary_en), len(train_corpus_en)))
print("Vocab size of {} is: {} and Corpus size is: {}".format('French', len(train_vocabulary_fr), len(train_corpus_fr)))
print("Vocab size of {} is: {} and Corpus size is: {}".format('Japanese', len(train_vocabulary_jp), len(train_corpus_jp)))

Test

In [None]:
# Build the vocabulary (set of unique tokens) and corpus (one string per
# review) for each test language, then report their sizes.
test_vocabulary_en, test_corpus_en = get_vocab_corpus(test_en)
test_vocabulary_fr, test_corpus_fr = get_vocab_corpus(test_fr)
test_vocabulary_jp, test_corpus_jp = get_vocab_corpus(test_jp)

print("Vocab size of {} is: {} and Corpus size is: {}".format('English', len(test_vocabulary_en), len(test_corpus_en)))
print("Vocab size of {} is: {} and Corpus size is: {}".format('French', len(test_vocabulary_fr), len(test_corpus_fr)))
print("Vocab size of {} is: {} and Corpus size is: {}".format('Japanese', len(test_vocabulary_jp), len(test_corpus_jp)))

#### Save to file 

In [None]:
# Group the training artefacts by language code so the save loop can look
# them up with the same key.
# NOTE(review): the y_train_* labels are not included in any of these
# dictionaries - confirm they are persisted elsewhere.
save_train = {'en':train_en, 'jp':train_jp, 'fr':train_fr}
save_corpus = {'en':train_corpus_en, 'fr':train_corpus_fr, 'jp':train_corpus_jp}
save_vocab = {'en':train_vocabulary_en, 'fr':train_vocabulary_fr, 'jp':train_vocabulary_jp}

In [None]:
# Write the training tokens, corpus and vocabulary to disk, one pickle per
# language and artefact. Nothing is written unless data_save is truthy.
languages = ['en', 'jp','fr']
if data_save:
    for i in languages:
        with open(f'../data/amazon_reviews/train/processed_data/train_tokens_{i}.pickle', 'wb') as file:
            pickle.dump(save_train[i], file)
        with open(f'../data/amazon_reviews/train/processed_data/train_corpus_{i}.pickle', 'wb') as file:
            pickle.dump(save_corpus[i], file)
        with open(f'../data/amazon_reviews/train/processed_data/train_vocab_{i}.pickle', 'wb') as file:
            pickle.dump(save_vocab[i], file)
else:
    print('Did nothing blud')

In [None]:
# Group the test artefacts by language code so the save loop can look
# them up with the same key.
# NOTE(review): the y_test_* labels are not included in any of these
# dictionaries - confirm they are persisted elsewhere.
save_test = {'en':test_en, 'jp':test_jp, 'fr':test_fr}
save_corpus_test = {'en':test_corpus_en, 'fr':test_corpus_fr, 'jp':test_corpus_jp}
save_vocab_test = {'en':test_vocabulary_en, 'fr':test_vocabulary_fr, 'jp':test_vocabulary_jp}

In [None]:
# Write the test tokens, corpus and vocabulary to disk, one pickle per
# language and artefact. Nothing is written unless data_save is truthy.
languages = ['en', 'jp','fr']
if data_save:
    for i in languages:
        with open(f'../data/amazon_reviews/test/processed_data/test_tokens_{i}.pickle', 'wb') as file:
            pickle.dump(save_test[i], file)
        with open(f'../data/amazon_reviews/test/processed_data/test_corpus_{i}.pickle', 'wb') as file:
            pickle.dump(save_corpus_test[i], file)
        with open(f'../data/amazon_reviews/test/processed_data/test_vocab_{i}.pickle', 'wb') as file:
            pickle.dump(save_vocab_test[i], file)
else:
    print('Did nothing blud')