In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.utils import resample
import re

In [None]:
def read_data(filepath, delim='\t'):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        filepath: Source handed straight to ``pd.read_csv``.
        delim: Field delimiter forwarded to ``pd.read_csv``; a tab by default.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(filepath, delimiter=delim)
    return frame

In [None]:
def clean_text(df):
    """Placeholder cleaning step that currently returns its input untouched.

    The character filtering and lower-casing of the ``Phrase`` column shown
    in the disabled lines below are not executed, so no normalisation
    happens here.

    Args:
        df: Object to clean; it is not modified.

    Returns:
        The very same ``df`` object (no copy is made).
    """
    # Disabled cleaning steps, kept for reference:
    #   df.Phrase.replace({r"[^a-zA-Z' ]+": ''}, regex=True, inplace=True)
    #   df.Phrase = df.Phrase.str.lower()
    untouched = df
    return untouched

In [2]:
def balance_data(reviews, sentiment, random_state=None):
    """Oversample minority classes so every class matches the largest one.

    Classes that already have as many samples as the largest class are kept
    exactly as they are. Every smaller class is resampled with replacement
    (``sklearn.utils.resample``) up to the size of the largest class. The
    output is grouped by class, in the sorted order given by ``np.unique``.

    Args:
        reviews: Sequence of reviews, one entry per sample.
        sentiment: Sequence of class labels aligned with ``reviews``.
        random_state: Optional seed or ``RandomState`` forwarded to
            ``resample`` so the oversampling can be reproduced. ``None``
            (default) keeps the draw unseeded.

    Returns:
        A ``(reviews, sentiment)`` tuple of two lists of equal length.
        Both lists are empty when the input is empty.
    """
    reviews = np.array(reviews)
    sentiment = np.array(sentiment)

    # Nothing to balance: return early instead of failing on an empty count.
    if sentiment.size == 0:
        return [], []

    classes, counts = np.unique(sentiment, return_counts=True)
    top_count = int(counts.max())

    balanced_rev = []
    balanced_sent = []
    for label, count in zip(classes, counts):
        mask = sentiment == label
        class_rev = reviews[mask]
        class_sent = sentiment[mask]

        if count < top_count:
            # Resample both arrays in a single call so each review keeps its
            # own label and only one random draw is made per class.
            class_rev, class_sent = resample(
                class_rev, class_sent,
                n_samples=top_count,
                random_state=random_state,
            )

        balanced_rev.append(class_rev)
        balanced_sent.append(class_sent)

    return list(np.hstack(balanced_rev)), list(np.hstack(balanced_sent))

In [3]:
def _build_word_index(reviews, top_words=None):
    """Map each kept word to an integer id; the most frequent word gets the largest id."""
    word_freq = Counter()
    for review in reviews:
        word_freq.update(str(review).split())

    # most_common(None) returns the whole vocabulary, so a falsy top_words
    # (None or 0) means "keep every word".
    most_common_words = word_freq.most_common(top_words or None)

    # Size the id range from the words actually available, so asking for more
    # words than the corpus contains cannot index past the end of the list.
    vocab_size = len(most_common_words)
    return {
        word: vocab_size - rank
        for rank, (word, _) in enumerate(most_common_words)
    }


def _encode_reviews(reviews, word_to_id):
    """Turn each review into a list of word ids, using 0 for unknown words."""
    return [
        [word_to_id.get(word, 0) for word in str(review).split()]
        for review in reviews
    ]


def gen_dataset(filepath, delim='\t', balancing=True, top_words=None, pad_len=None):
    """Read a review file and turn it into padded id sequences plus one-hot labels.

    The file is loaded with ``read_data`` and must contain ``text`` and
    ``sentiment`` columns. Reviews are split on whitespace, each word is
    replaced by its id from a frequency-ranked vocabulary (id 0 is used for
    words outside the vocabulary and is also the padding value), and the
    sequences are post-padded with ``pad_sequences``.

    Side effect: the word-to-id dictionary is written to ``word_to_id.npy``
    in the current working directory.

    Args:
        filepath: Path of the delimited file to read.
        delim: Field delimiter passed to ``read_data``.
        balancing: Accepted for interface compatibility; class balancing is
            currently disabled, so this flag has no effect.
        top_words: If truthy, keep only this many most frequent words;
            otherwise keep the whole vocabulary. Values larger than the
            vocabulary are allowed and keep every word.
        pad_len: If truthy, the sequence length passed to ``pad_sequences``;
            otherwise the length of the longest review is used.

    Returns:
        A tuple ``(X, y)``: the padded id matrix from ``pad_sequences`` and
        the result of ``to_categorical`` applied to ``(s + 1) / 2`` for each
        label ``s``.
    """
    df = read_data(filepath, delim)

    reviews = df['text'].tolist()
    sentiment = df['sentiment'].tolist()
    # NOTE(review): balancing via balance_data and stop-word removal were
    # disabled in this pipeline; re-enable deliberately if they are wanted.

    word_to_id = _build_word_index(reviews, top_words)

    # np.save stores the dict as a pickled object array, so loading it back
    # needs np.load(..., allow_pickle=True).item().
    np.save("word_to_id.npy", word_to_id)

    X_data = _encode_reviews(reviews, word_to_id)

    max_len = max((len(sequence) for sequence in X_data), default=0)
    padding_size = pad_len if pad_len else max_len

    features = pad_sequences(X_data, padding_size, padding='post')
    # (s + 1) / 2 maps -1 -> 0 and 1 -> 1; assumes labels are encoded as
    # -1/+1 -- TODO confirm against the dataset.
    labels = to_categorical([(s + 1) / 2 for s in sentiment])
    return features, labels

In [None]:
def transform_review(review, word_to_id, pad_len=None):
    """Encode a single raw review as a batch of one id sequence.

    Every run of non-letter characters is replaced by a space, the text is
    lower-cased and split on whitespace, and each word is looked up in
    ``word_to_id`` (unknown words become 0).

    Args:
        review: Review text as a string.
        word_to_id: Mapping from word to integer id.
        pad_len: If truthy, the sequence is post-padded to this length with
            ``pad_sequences``; otherwise no padding is applied.

    Returns:
        A 2-D array with a single row holding the word ids.
    """
    # Use "+" rather than "*": a pattern that can match the empty string
    # makes Python 3.7+ split between every character, which would reduce
    # the review to single letters.
    normalized = re.sub("[^a-zA-Z]+", " ", review).lower()
    # NOTE(review): the vocabulary must have been built from text normalised
    # the same way (letters only, lower case) for lookups to hit -- confirm
    # against the code that produced word_to_id.

    encoded = [word_to_id.get(word, 0) for word in normalized.split()]
    data = [encoded]

    if pad_len:
        return pad_sequences(data, pad_len, padding='post')
    return np.array(data)

In [None]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    Args:
        cm: 2-D array of counts, indexed as ``cm[row, col]``.
        classes: Tick labels used on both axes.
        normalize: When true, each row is divided by its own sum before
            plotting and cells are printed with two decimals; otherwise
            cells are printed with the integer format ``'d'``.
        title: Title placed above the plot.
        cmap: Colour map handed to ``plt.imshow``.

    Returns:
        None. Everything is drawn through the ``plt`` state machine.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Cells above half of the maximum get white text so the number stays
    # readable on the darker background.
    cutoff = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        value = cm[row, col]
        if value > cutoff:
            text_color = "white"
        else:
            text_color = "black"
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    # Axis captions are Portuguese: "Verdadeiro" = true, "Predito" = predicted.
    plt.ylabel('Verdadeiro')
    plt.xlabel('Predito')