In [None]:
import pandas as pd
import numpy as np
import os
import codecs
import re

# Locations of every input and output file used by this script.

# Root data directory
path = "D:/data-mining-project/Data/"

# GloVe embeddings and the vocabulary subset derived from them
gloveFile = 'D:/data-mining-project/Data/glove/glove_6B_100d.txt'
vocab_path = 'D:/data-mining-project/Data/glove/vocab_glove.csv'

# Train / validation / test splits
train_data_path = 'D:/data-mining-project/Data/TrainingData/train.csv'
val_data_path = 'D:/data-mining-project/Data/TrainingData/val.csv'
test_data_path = 'D:/data-mining-project/Data/TrainingData/test.csv'

# Model inputs: word vectors
wordVectors_path = 'D:/data-mining-project/Data/inputs_model/wordVectors.csv'

# Model inputs: per-phrase word counts for each split
sequence_len_path = 'D:/data-mining-project/Data/inputs_model/sequence_length.csv'
sequence_len_val_path = 'D:/data-mining-project/Data/inputs_model/sequence_length_val.csv'
sequence_len_test_path = 'D:/data-mining-project/Data/inputs_model/sequence_length_test.csv'

# Model inputs: word-id matrices for each split
sent_matrix_path = 'D:/data-mining-project/Data/inputs_model/sentence_matrix.csv'
sent_matrix_path_val = 'D:/data-mining-project/Data/inputs_model/sentence_matrix_val.csv'
sent_matrix_path_test = 'D:/data-mining-project/Data/inputs_model/sentence_matrix_test.csv'

#print(path)

def maxSeqLen(training_data):
    """Measure the length, in space-separated words, of every phrase.

    Args:
        training_data: DataFrame with a 'Phrase' column of strings.

    Returns:
        A tuple ``(max_seq_len, avg_words, sequence_length_np)`` where
        ``max_seq_len`` is the largest word count, ``avg_words`` is the mean
        word count per phrase and ``sequence_length_np`` is a numpy array
        holding the word count of each phrase in row order. For an empty
        frame the result is ``(0, 0.0, <empty array>)``.
    """
    # Splitting on a single space mirrors how the phrases are tokenised when
    # the id matrix is built, so consecutive spaces yield empty "words".
    sequence_length = [len(sentence.split(' ')) for sentence in training_data['Phrase']]
    sequence_length_np = np.asarray(sequence_length, dtype=int)

    if not sequence_length:
        return 0, 0.0, sequence_length_np

    max_seq_len = max(sequence_length)
    # Divide by the number of rows, not by the last index label: the label is
    # off by one for a default index and unrelated to the count otherwise.
    avg_words = sum(sequence_length) / len(sequence_length)

    return max_seq_len, avg_words, sequence_length_np

def tf_data_pipeline(data, word_idx, weight_matrix, max_seq_len):
    """Convert each phrase into a fixed-width row of word ids.

    Args:
        data: DataFrame with a 'Phrase' column of strings.
        word_idx: mapping from word to the id stored for that word. Keys are
            matched case-insensitively.
        weight_matrix: unused; kept so existing callers keep working.
        max_seq_len: number of columns in the returned matrix.

    Returns:
        An int32 numpy array of shape ``(len(data), max_seq_len)``. The ids of
        the known words of each phrase are packed from the left in phrase
        order; words missing from ``word_idx`` are skipped without leaving a
        gap, words beyond ``max_seq_len`` are dropped and the remaining cells
        stay 0.
    """
    ids = np.zeros((len(data), max_seq_len), dtype='int32')
    # convert keys in dict to lower case
    word_idx_lwr = {k.lower(): v for k, v in word_idx.items()}

    for row_pos, sentence in enumerate(data['Phrase']):
        col = 0
        for word in sentence.split(' '):
            if col >= max_seq_len:
                # Row is full: truncate the phrase instead of relying on an
                # IndexError being swallowed.
                break
            word_lwr = word.lower()
            if word_lwr not in word_idx_lwr:
                # Unknown word: skip it and keep the write position.
                continue
            try:
                ids[row_pos, col] = word_idx_lwr[word_lwr]
            except (TypeError, ValueError, OverflowError):
                # The mapped value cannot be stored in an int32 cell; keep
                # the previous best-effort behaviour of skipping the word.
                continue
            col += 1
    return ids

# Create Vocab subset GLove vectors
def word_vec_index(training_data, glove_model):
    """Extract the GloVe vectors for the words that occur in the phrases.

    Args:
        training_data: DataFrame with a 'Phrase' column of strings.
        glove_model: dict mapping a word to its embedding (list of floats).

    Returns:
        A DataFrame with one column per word found both in the lower-cased
        phrases and in ``glove_model``; each column holds that word's vector.
        The same frame is also written to ``vocab_path`` as CSV.
    """
    # Join every phrase into one string, then tokenise on whitespace.
    all_text = training_data['Phrase'].str.cat(sep=' ')
    tokens_lwr = [token.lower() for token in re.findall(r'\S+', all_text)]

    # Keep only the words GloVe has a vector for.
    shared_words = glove_model.keys() & tokens_lwr
    subdict = {}
    for word in shared_words:
        subdict[word] = glove_model[word]

    vocab_df = pd.DataFrame(subdict)
    vocab_df.to_csv(vocab_path)
    return vocab_df

# Filtered Vocabulary from Glove document
def filter_glove(full_glove_path, data_dir):
    """Write the GloVe entries whose word appears in ``SOStr.txt``.

    Args:
        full_glove_path: path of the full GloVe text file, one
            ``word v1 v2 ...`` entry per line.
        data_dir: directory containing ``SOStr.txt`` ('|'-separated tokens per
            line); ``filtered_glove.txt`` is written into the same directory.

    Returns:
        The set of tokens collected from ``SOStr.txt``.
    """
    sentence_path = os.path.join(data_dir, 'SOStr.txt')
    filtered_glove_path = os.path.join(data_dir, 'filtered_glove.txt')

    # Collect the vocabulary: strip the line, drop backslashes, split on '|'.
    vocab = set()
    with codecs.open(sentence_path, encoding='utf-8') as sentences:
        for raw in sentences:
            tokens = raw.strip().replace('\\', '').split('|')
            vocab.update(tokens)

    nread = nwrote = 0
    with codecs.open(full_glove_path, encoding='utf-8') as glove_in, \
            codecs.open(filtered_glove_path, 'w', encoding='utf-8') as glove_out:
        # nread counts every line read, including blank ones.
        for nread, raw in enumerate(glove_in, start=1):
            entry = raw.strip()
            # The word is everything before the first space of the entry.
            if entry and entry.partition(u' ')[0] in vocab:
                glove_out.write(entry + '\n')
                nwrote += 1

    print('read %s lines, wrote %s' % (nread, nwrote))
    return vocab

# Combine and split the data into train and test
def read_data(path):
    """Load phrases and their sentiment labels and join them on phrase id.

    Args:
        path: directory containing ``dictionary.txt`` (header
            ``Phrase|Index``) and ``sentiment_labels.txt`` (header
            ``phrase ids|sentiment values``). A trailing separator is
            optional.

    Returns:
        DataFrame with the string columns 'Phrase', 'phrase_ids' and
        'sentiment_values', one row per phrase id present in both files.
    """
    # os.path.join works whether or not `path` ends with a separator; plain
    # string concatenation silently built a wrong file name without one.
    df_data_sentence = pd.read_table(os.path.join(path, 'dictionary.txt'))
    # Each file is read as a single column named after its header line, so
    # the '|'-separated fields are split apart here.
    df_data_sentence_processed = df_data_sentence['Phrase|Index'].str.split('|', expand=True)
    df_data_sentence_processed = df_data_sentence_processed.rename(columns={0: 'Phrase', 1: 'phrase_ids'})

    # read sentiment labels into df
    df_data_sentiment = pd.read_table(os.path.join(path, 'sentiment_labels.txt'))
    df_data_sentiment_processed = df_data_sentiment['phrase ids|sentiment values'].str.split('|', expand=True)
    df_data_sentiment_processed = df_data_sentiment_processed.rename(columns={0: 'phrase_ids', 1: 'sentiment_values'})

    # combine data frames containing sentence and sentiment
    df_processed_all = df_data_sentence_processed.merge(df_data_sentiment_processed, how='inner', on='phrase_ids')
    return df_processed_all

# Glove Vector
def loadGloveModel(gloveFile):
    """Load a GloVe text file into a dict of word -> embedding.

    Args:
        gloveFile: path of a UTF-8 text file with one
            ``word v1 v2 ...`` entry per line.

    Returns:
        dict mapping each word to its embedding as a list of floats. Blank
        lines are ignored; a line whose values are not all numeric is
        skipped after its word is printed.
    """
    print ("Loading Glove Model")
    model = {}
    # The context manager closes the file even if reading fails part-way.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                continue
            word = splitLine[0]
            try:
                embedding = [float(val) for val in splitLine[1:]]
            except ValueError:
                # Report the word of the malformed entry and move on.
                print (word)
                continue
            model[word] = embedding

    print ("Done.",len(model)," words loaded!")
    return model

# Convert df to list
def word_list(vocab_df):
    """Split a vocabulary frame into its column labels and its vectors.

    Args:
        vocab_df: DataFrame whose columns each hold one vector.

    Returns:
        A tuple ``(wordList, wordVectors_np)``: the list of column labels and
        a numpy array whose i-th row is the i-th column of ``vocab_df``.
    """
    # Transpose so that each column of the frame becomes one row.
    vectors_by_word = np.array(vocab_df.values.T.tolist())
    column_labels = [label for label in vocab_df.columns.values]
    return column_labels, vectors_by_word

def training_data_split(all_data, spitPercent, data_dir):
    """Randomly split the data into train, test and validation sets.

    Each row goes to the training set with probability ``spitPercent``; the
    remaining rows are divided between test and validation with probability
    0.5 each. The three sets are written as ``train.csv``, ``test.csv`` and
    ``val.csv`` under ``<data_dir>/TrainingData``, which is created when
    missing.

    Args:
        all_data: DataFrame to split.
        spitPercent: fraction (0..1) of rows expected in the training set.
        data_dir: directory under which ``TrainingData`` lives.

    Returns:
        A tuple ``(train_only, test_only, dev_only)`` of DataFrames.
    """
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(os.path.join(data_dir, 'TrainingData'), exist_ok=True)

    msk = np.random.rand(len(all_data)) < spitPercent
    train_only = all_data[msk]
    test_and_dev = all_data[~msk]

    msk_test = np.random.rand(len(test_and_dev)) < 0.5
    test_only = test_and_dev[msk_test]
    dev_only = test_and_dev[~msk_test]

    dev_only.to_csv(os.path.join(data_dir, 'TrainingData/val.csv'))
    test_only.to_csv(os.path.join(data_dir, 'TrainingData/test.csv'))
    train_only.to_csv(os.path.join(data_dir, 'TrainingData/train.csv'))

    return train_only, test_only, dev_only


# main function
# Builds the train/validation/test splits and, for each split, the per-phrase
# sequence lengths and the word-id matrix, writing every result to CSV.
all_data = read_data(path)

# to split the training, validation and test
train_df, test_df, dev_df = training_data_split(all_data,0.5,path)

# Read the training split back only after training_data_split has written it,
# so the rows used below belong to the same split as train_df.
# NOTE(review): the split files are written with to_csv's default encoding
# but read back as iso-8859-1 - confirm this is intended.
training_data = pd.read_csv(train_data_path, encoding='iso-8859-1')

# Write the filtered GloVe file; the vocabulary set it returns is not used here
filter_glove(gloveFile,path)

# Load glove vector
glove_model = loadGloveModel(gloveFile)

# Get glove vector subset for training vocab
vocab_df = word_vec_index(train_df, glove_model)
glove_model = None  # drop the reference to the full model once the subset is built
vocab_df = pd.read_csv(vocab_path, encoding='iso-8859-1')

#Get Wordlist and word vec lists from the df
wordList, wordVectors = word_list(vocab_df)
wordVectors_df = pd.DataFrame(wordVectors)
wordVectors_df.to_csv(wordVectors_path)

# get the index of the word vec for each sentences to be input to the tf algo
max_seq_len, avg_len, sequence_length = maxSeqLen(training_data)
sequence_length_df = pd.DataFrame(sequence_length)
sequence_length_df.to_csv(sequence_len_path)

# training data input matrix
# NOTE(review): tf_data_pipeline stores word_idx values into int32 cells, but
# vocab_df is a DataFrame whose items() yields whole columns - presumably a
# word -> row-index mapping was intended here and in the calls below; confirm.
sentence_matrix = tf_data_pipeline(training_data, vocab_df, wordVectors, max_seq_len)

# export the sentence matrix to a csv file for easy load for next iterations
sentence_matrix_df = pd.DataFrame(sentence_matrix)
sentence_matrix_df.to_csv(sent_matrix_path)

# validation data set
val_data = pd.read_csv(val_data_path, encoding='iso-8859-1')

# load glove model and generate vocab for validation data
# NOTE(review): word_vec_index always writes to vocab_path, so this call and
# the test-set call below overwrite the training vocabulary file - confirm.
glove_model = loadGloveModel(gloveFile)
vocab_df_val = word_vec_index(val_data, glove_model)
glove_model = None
wordList_val, wordVectors_val = word_list(vocab_df_val)

# get max length for val data
max_seq_len_val, avg_len_val, sequence_length_val = maxSeqLen(val_data)
sequence_length_val_df = pd.DataFrame(sequence_length_val)
sequence_length_val_df.to_csv(sequence_len_val_path)

# get the id matrix for val data (uses the training max_seq_len as its width)
sentence_matrix_val = tf_data_pipeline(val_data, vocab_df_val, wordVectors_val, max_seq_len)

# write the val data to csv
sentence_matrix_df_val = pd.DataFrame(sentence_matrix_val)
sentence_matrix_df_val.to_csv(sent_matrix_path_val)

# Test data set
test_data = pd.read_csv(test_data_path, encoding='iso-8859-1')

# load glove model and generate vocab for test data
glove_model = loadGloveModel(gloveFile)
# Build the vocabulary from the test phrases (previously val_data was passed
# here by mistake).
vocab_df_test = word_vec_index(test_data, glove_model)
glove_model = None
wordList_test, wordVectors_test = word_list(vocab_df_test)

# get max length for test data
max_seq_len_test, avg_len_test, sequence_length_test = maxSeqLen(test_data)
sequence_length_test_df = pd.DataFrame(sequence_length_test)
sequence_length_test_df.to_csv(sequence_len_test_path)

# get the id matrix for test data
# NOTE(review): this uses max_seq_len_test as the width while the validation
# matrix uses the training max_seq_len - confirm which one the model expects.
sentence_matrix_test = tf_data_pipeline(test_data, vocab_df_test, wordVectors_test, max_seq_len_test)

# write the test data to csv
sentence_matrix_df_test= pd.DataFrame(sentence_matrix_test)
sentence_matrix_df_test.to_csv(sent_matrix_path_test)


In [None]:
!cd
!ls

sample_data
