[View in Colaboratory](https://colab.research.google.com/github/neoaksa/tensorflowDemo/blob/master/wordSentimentDemo/wordSentiment.ipynb)

In [0]:
# Authenticate the current Colab user; run this cell before the Google Drive
# cells below.
from google.colab import auth
auth.authenticate_user()


In [0]:
# Build a client for the native Google Drive API (v3). In the code visible
# here it is only referenced by the commented-out download helper further down.
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

In [0]:
# IPython shell escape: install PyDrive into the notebook runtime.
! pip install pydrive
# these classes allow you to request the Google drive API
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# The client uses the application-default credentials
# (presumably those established by auth.authenticate_user() -- confirm).
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the module-level client used by downloadFile below.
drive = GoogleDrive(gauth)

def downloadFile(inputfilename,outputfilename):
    """Download a Google Drive file to the local filesystem.

    Args:
        inputfilename: Despite the name, this is the Drive file ID; it is
            passed as the ``'id'`` field to ``drive.CreateFile``.
        outputfilename: Local path the file content is written to.

    Returns:
        ``outputfilename``, so callers that keep the result hold the local
        path instead of ``None``.
    """
    downloaded = drive.CreateFile({'id': inputfilename})
    # Fetch the remote content and store it under the requested local name.
    downloaded.GetContentFile(outputfilename)
    return outputfilename
    
# training file download (first argument is the Drive file ID, second the local file name)
trainingFile = downloadFile("1adyPElLZ118U1aKVEeqrsVNX4b-VoVDm","training.1600000.processed.noemoticon.csv")
# test file download
testingFile = downloadFile("1-6lzGSZ-IkIjYiUULuhpoADPe55aSWcR","testdata.manual.2009.06.14.csv")

In [0]:
# # Download the file we just uploaded.
# #
# # Replace the assignment below with your file ID
# # to download a different file.
# #
# # use native google drive API

# import io
# from googleapiclient.http import MediaIoBaseDownload


# def downloadFile(file_id):
#     request = drive_service.files().get_media(fileId=file_id)
#     downloaded = io.BytesIO()
#     downloader = MediaIoBaseDownload(downloaded, request)
#     done = False
#     while done is False:
#         # _ is a placeholder for a progress object that we ignore.
#         # (Our file is small, so we skip reporting progress.)
#         _, done = downloader.next_chunk()

#     downloaded.seek(0)
#     return downloaded
    
# # training file download
# trainingFile = downloadFile("1adyPElLZ118U1aKVEeqrsVNX4b-VoVDm")
# # test file download
# testingFile = downloadFile("1-6lzGSZ-IkIjYiUULuhpoADPe55aSWcR")

In [0]:
# prepare data
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import numpy as np
import pandas as pd

lemmatizer = WordNetLemmatizer()

def init_process(fin, fout):
    """Reduce a raw sentiment CSV to ``<label>:::<tweet>`` lines.

    Every input line has its double quotes stripped and is split on commas.
    The first field is the polarity: ``'0'`` (negative) becomes ``[1, 0]``,
    ``'4'`` (positive) becomes ``[0, 1]``, and any other value is written
    through unchanged. The last field is used as the tweet text.

    NOTE(review): because the line is split on every comma, a tweet that
    itself contains commas is reduced to the text after its last comma.
    Confirm whether that is intended before changing it, since later steps
    consume this exact format.

    Args:
        fin: Path of the raw CSV, read as latin-1.
        fout: Path of the output file. It is opened in append mode, so running
            the function twice on the same output duplicates the records.

    Raises:
        OSError: If either file cannot be opened. The input is opened first so
            that a missing input does not leave an empty output file (or an
            unclosed handle) behind.
    """
    label_by_polarity = {'0': [1, 0], '4': [0, 1]}
    with open(fin, buffering=200000, encoding='latin-1') as source, \
            open(fout, 'a') as outfile:
        try:
            for line in source:
                fields = line.replace('"', '').split(',')
                # Unknown polarity values are kept as the raw string.
                polarity = label_by_polarity.get(fields[0], fields[0])
                # The last field still carries the line's trailing newline,
                # which terminates the output record.
                outfile.write(str(polarity) + ':::' + fields[-1])
        except Exception as e:
            # Best effort: report the problem and keep what was written so far.
            print(str(e))

# Reduce both raw CSV files to '<label>:::<tweet>' files for training and testing.
init_process('training.1600000.processed.noemoticon.csv', 'train_set.csv')
init_process('testdata.manual.2009.06.14.csv', 'test_set.csv')
# Optional: download the pre-processed files to inspect them.
# from google.colab import files
# files.download('test_set.csv')
# files.download('train_set.csv')

In [0]:
# If the NLTK resources are missing, run this cell to download 'punkt' and
# 'wordnet' (presumably required by word_tokenize and WordNetLemmatizer).
nltk.download('punkt')
nltk.download('wordnet')

In [0]:

# create lexicon
def create_lexicon(fin):
    """Build the vocabulary from a sample of ``fin`` and pickle it.

    Every 2500th line of the ``<label>:::<tweet>`` file is sampled (a fixed
    stride, not a random sample). Each sampled tweet is lower-cased, tokenised
    and lemmatised, and the distinct lemmas are written to ``lexicon.pickle``
    as a sorted list.

    Tweets are lower-cased here because every lookup against the lexicon uses
    ``word.lower()``; entries containing capitals could never be matched.
    Each tweet is tokenised on its own rather than re-tokenising the whole
    accumulated sample on every step, which keeps the work linear.

    Args:
        fin: Path of the pre-processed training file, read as latin-1.
    """
    vocabulary = set()
    with open(fin, 'r', buffering=100000, encoding='latin-1') as f:
        try:
            # Counting starts at 2 so the sampled positions stay the same as
            # before: the counter used to start at 1 and was incremented
            # before the check.
            for counter, line in enumerate(f, start=2):
                if counter % 2500 != 0:
                    continue
                _, separator, tweet = line.partition(':::')
                if not separator:
                    # Malformed record: skip it rather than abort the build.
                    continue
                tokens = word_tokenize(tweet.lower())
                vocabulary.update(lemmatizer.lemmatize(token) for token in tokens)
                print(counter, len(vocabulary))

        except Exception as e:
            # Best effort: report the problem and pickle what was collected.
            print(str(e))

    # Sorting makes the word -> index mapping reproducible between runs.
    lexicon = sorted(vocabulary)
    with open('lexicon.pickle', 'wb') as f:
        pickle.dump(lexicon, f)

# Build the lexicon from the pre-processed training set and save it to
# lexicon.pickle.
create_lexicon('train_set.csv')

# convert dataset input to vector
def convert_to_vec(fin, fout, lexicon_pickle):
    """Turn ``<label>:::<tweet>`` lines into ``<count vector>::<label>`` lines.

    Each tweet is lower-cased, tokenised and lemmatised. The feature vector
    has one slot per lexicon entry holding how often that entry occurs in the
    tweet (a bag-of-words count, not a strict one-hot vector).

    Args:
        fin: Path of the pre-processed input file, read as latin-1.
        fout: Path of the output file. It is opened in append mode, so running
            the function twice on the same output duplicates the records.
        lexicon_pickle: Path of the pickled lexicon list.

    Raises:
        IndexError: If an input line does not contain the ``:::`` separator.
    """
    with open(lexicon_pickle, 'rb') as f:
        lexicon = pickle.load(f)
    # Word -> position lookup built once; setdefault keeps the first position
    # for a repeated word, matching list.index().
    word_index = {}
    for position, word in enumerate(lexicon):
        word_index.setdefault(word, position)

    counter = 0
    # Both files are context-managed so the output is flushed and closed even
    # when a line fails to parse.
    with open(fin, buffering=20000, encoding='latin-1') as source, \
            open(fout, 'a') as outfile:
        for line in source:
            counter += 1
            fields = line.split(':::')
            label = fields[0]
            tweet = fields[1]
            tokens = [lemmatizer.lemmatize(token)
                      for token in word_tokenize(tweet.lower())]
            features = np.zeros(len(lexicon))
            for token in tokens:
                position = word_index.get(token.lower())
                if position is not None:
                    features[position] += 1

            # tolist() yields plain Python floats, so the written text does
            # not depend on how NumPy prints its scalar types.
            outfile.write(str(features.tolist()) + '::' + str(label) + '\n')

    print(counter)


# Vectorise the test set up front; the training set is vectorised line by line
# inside train_neural_network instead.
convert_to_vec('test_set.csv', 'processed-test-set.csv', 'lexicon.pickle')


def shuffle_data(fin, fout='train_set_shuffled.csv'):
    """Write the lines of ``fin`` to ``fout`` in random order.

    The input is a ``<label>:::<tweet>`` text file rather than real CSV, so it
    is shuffled as raw lines instead of being parsed with pandas. Parsing it
    as CSV treated the first record as a header (so it was never shuffled),
    dropped every line whose tweet contained an extra comma, and relied on the
    ``error_bad_lines`` option that current pandas versions no longer accept.

    Lines are handled as bytes, so the file's encoding is preserved exactly.
    Blank lines are dropped. The whole file is held in memory.

    Args:
        fin: Path of the file to shuffle.
        fout: Path of the shuffled copy; defaults to the file name the
            training step reads.
    """
    with open(fin, 'rb') as source:
        lines = [line for line in source if line.strip()]
    # A final line without a newline would otherwise be glued to whichever
    # line follows it after shuffling.
    if lines and not lines[-1].endswith(b'\n'):
        lines[-1] += b'\n'

    order = np.random.permutation(len(lines))
    shuffled = [lines[i] for i in order]

    # Preview of the first few shuffled records.
    print(b''.join(shuffled[:5]).decode('latin-1'), end='')
    with open(fout, 'wb') as sink:
        sink.writelines(shuffled)


# Write a shuffled copy of the training set to train_set_shuffled.csv, the
# file read during training.
shuffle_data('train_set.csv')

# split the csv into x, y dataset
def create_test_data_pickle(fin):
    """Parse a ``<features>::<label>`` file into feature and label arrays.

    Lines whose feature or label part cannot be evaluated to an iterable
    (for example a bare numeric label) are skipped. The number of parsed
    samples is printed.

    Despite its name the function does not write a pickle; it returns the
    arrays so the caller can use or store them (previously they were built
    and then discarded).

    Args:
        fin: Path of the vectorised data file.

    Returns:
        A ``(feature_sets, labels)`` tuple of NumPy arrays.
    """
    feature_sets = []
    labels = []
    with open(fin, buffering=20000) as f:
        for line in f:
            parts = line.split('::')
            try:
                # NOTE(review): eval() executes whatever expression the file
                # contains; only run this on files produced by this pipeline.
                features = list(eval(parts[0]))
                label = list(eval(parts[1]))
            except Exception:
                # Unparseable record: skip it, but let KeyboardInterrupt and
                # SystemExit propagate (a bare except swallowed those too).
                continue
            feature_sets.append(features)
            labels.append(label)
    print(len(labels))
    return np.array(feature_sets), np.array(labels)

# Parse the vectorised test set; the number of parsed samples is printed.
create_test_data_pickle('processed-test-set.csv')

In [0]:
# Training below takes a very long time, so first check whether a GPU is
# available to TensorFlow.
import tensorflow as tf
# Query the GPU device name (the result is neither stored nor printed here).
tf.test.gpu_device_name()

from tensorflow.python.client import device_lib
# List the devices TensorFlow reports as local.
device_lib.list_local_devices()

In [0]:
"""
this snippet is used for build ANN net work, train the model and test it
before you run this snippet, you should run word2vec.py first which transfer
training and testing data to one-hot code vector.
"""

import tensorflow as tf
import pickle
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()


# setting ANN variables
with open("lexicon.pickle", 'rb') as f:
    lexicon = pickle.load(f)
# One input unit per lexicon entry.
input_size = len(lexicon)
# Two output units, matching the two-element label vectors.
output_size = 2
# Layer widths in order: input, two hidden layers of 500 units, output.
structure = [input_size,500,500,output_size]
# Placeholder for a batch of feature vectors (any batch size).
x = tf.placeholder(dtype='float',shape=[None,structure[0]])
# Placeholder for the matching labels (no static shape is declared).
y = tf.placeholder(dtype='float')
# Number of lines per training batch.
batch_size = 30
# Training stops once this many epochs have completed.
epoch_max = 10
# Presumably 1600000 is the number of training rows -- confirm. The value is
# only referenced by a commented-out progress print in the training function.
total_batches = int(1600000 / batch_size)

# feedforward Model
def netural_network_model(x):
    """Build the feed-forward network described by the module-level ``structure``.

    One fully connected layer is created for each consecutive pair of sizes in
    ``structure``. Every layer except the last is followed by ReLU; the last
    layer returns raw logits. Previously a two-entry ``structure`` applied
    ReLU to its only (output) layer; that layer is now left linear as well.

    Weights and biases are drawn from ``tf.random_normal``. For each layer the
    weight variable is created before the bias variable, so the automatically
    assigned variable names stay in the same order as before.

    Args:
        x: Input tensor whose last dimension is ``structure[0]``.

    Returns:
        The logits tensor of the output layer, or ``0`` when ``structure``
        has fewer than two entries (no layer is built).
    """
    last_layer = len(structure) - 1
    output = 0
    layer_input = x
    for i in range(1, len(structure)):
        weight = tf.Variable(tf.random_normal([structure[i-1], structure[i]]))
        biases = tf.Variable(tf.random_normal([structure[i]]))
        output = tf.add(tf.matmul(layer_input, weight), biases)
        # Only hidden layers get the activation; the output stays linear so it
        # can be fed to a softmax cross-entropy loss as logits.
        if i != last_layer:
            output = tf.nn.relu(output)
        layer_input = output
    return output


# Log file that receives the number of each completed epoch, one per line;
# training reads it to decide which epoch to resume from.
tf_log = 'tf.log'


def train_neural_network(x):
    """Train the feed-forward network on the shuffled training set.

    Builds the model on ``x``, minimises softmax cross-entropy against the
    module-level ``y`` placeholder with Adam, and streams
    ``train_set_shuffled.csv`` in batches of ``batch_size`` lines. After every
    epoch the session is saved to ``./model.ckpt`` and the epoch number is
    appended to ``tf_log``, so an interrupted run resumes after the last
    completed epoch.

    Lines left over at the end of the file that do not fill a whole batch are
    not trained on.

    Args:
        x: Input placeholder fed with batches of feature vectors.

    Raises:
        IndexError: If a training line does not contain the ``:::`` separator.
    """
    # prediction, cost and gradient descent
    prediction = netural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    # The lexicon does not change between epochs, so load it once and build a
    # word -> position lookup instead of scanning the list for every token.
    with open('lexicon.pickle', 'rb') as f:
        lexicon = pickle.load(f)
    word_index = {}
    for position, word in enumerate(lexicon):
        word_index.setdefault(word, position)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        try:
            # The log ends with a newline, so the last completed epoch is the
            # second-to-last element after splitting.
            with open(tf_log, 'r') as log:
                epoch = int(log.read().split('\n')[-2]) + 1
            print('STARTING:', epoch)
        except (OSError, IndexError, ValueError):
            # No usable log: start from the first epoch.
            epoch = 1

        # When resuming, continue from the saved session. Restoring once is
        # enough: later epochs carry on in this same session.
        if epoch != 1:
            saver.restore(sess, "./model.ckpt")

        while epoch <= epoch_max:
            # Start from zero so the printed value is the sum of batch losses.
            epoch_loss = 0
            # load shuffled training dataset
            with open('train_set_shuffled.csv', buffering=20000, encoding='latin-1') as f:
                batch_x = []
                batch_y = []
                for line in f:
                    fields = line.split(':::')
                    label = fields[0]
                    tweet = fields[1]
                    tokens = [lemmatizer.lemmatize(token)
                              for token in word_tokenize(tweet.lower())]

                    # Count how often each lexicon entry occurs in the tweet.
                    features = np.zeros(len(lexicon))
                    for token in tokens:
                        position = word_index.get(token.lower())
                        if position is not None:
                            features[position] += 1

                    batch_x.append(list(features))
                    # NOTE(review): eval() executes whatever expression the
                    # file contains; only train on files produced by this
                    # pipeline.
                    batch_y.append(eval(label))
                    if len(batch_x) >= batch_size:
                        _, c = sess.run([optimizer, cost], feed_dict={x: np.array(batch_x),
                                                                      y: np.array(batch_y)})
                        epoch_loss += c
                        batch_x = []
                        batch_y = []
            # save session for each epoch
            saver.save(sess, "./model.ckpt")
            print('Epoch', epoch, 'completed out of', epoch_max, 'loss:', epoch_loss)
            with open(tf_log, 'a') as f:
                f.write(str(epoch) + '\n')
            epoch += 1


# Build the network on the module-level placeholder x and run the training loop.
train_neural_network(x)

In [0]:
from functools import singledispatch

def _build_inference_graph():
    """Create a fresh graph holding the model, its placeholders and a saver.

    The model is built in its own graph so that its variables receive the
    first automatically assigned names. Building it in the shared default
    graph, after training has already created variables there, gives the new
    variables different names that the checkpoint does not contain.

    Returns:
        ``(graph, inputs, targets, logits, init, saver)``.
    """
    graph = tf.Graph()
    with graph.as_default():
        inputs = tf.placeholder(dtype='float', shape=[None, structure[0]])
        targets = tf.placeholder(dtype='float')
        logits = netural_network_model(inputs)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
    return graph, inputs, targets, logits, init, saver


def _restore_trained_model(sess, init, saver):
    """Initialise the variables, then load the trained weights if possible."""
    sess.run(init)
    try:
        # Same checkpoint path the training step saves to.
        saver.restore(sess, "./model.ckpt")
    except Exception as e:
        # Best effort, as before: report the failure and carry on. The model
        # then runs with freshly initialised weights.
        print(str(e))


def _evaluate_test_set():
    """Print the model's accuracy on ``processed-test-set.csv``."""
    graph, inputs, targets, logits, init, saver = _build_inference_graph()
    with graph.as_default():
        correct = tf.equal(tf.argmax(logits, 1), tf.argmax(targets, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    # No optimizer here: evaluation needs no back-propagation.
    feature_sets = []
    labels = []
    with open('processed-test-set.csv', buffering=20000) as f:
        for line in f:
            parts = line.split('::')
            try:
                # NOTE(review): eval() executes whatever expression the file
                # contains; only run this on files produced by this pipeline.
                features = list(eval(parts[0]))
                label = list(eval(parts[1]))
            except Exception:
                # Unparseable record (e.g. a bare numeric label): skip it.
                continue
            feature_sets.append(features)
            labels.append(label)
    print('Tested', len(labels), 'samples.')

    with tf.Session(graph=graph) as sess:
        _restore_trained_model(sess, init, saver)
        feed = {inputs: np.array(feature_sets), targets: np.array(labels)}
        print('Accuracy:', sess.run(accuracy, feed_dict=feed))


def _classify_text(input_data):
    """Print the predicted sentiment of a single piece of text."""
    graph, inputs, _, logits, init, saver = _build_inference_graph()
    with graph.as_default():
        predicted_class = tf.argmax(logits, 1)

    with open('lexicon.pickle', 'rb') as f:
        lexicon = pickle.load(f)
    word_index = {}
    for position, word in enumerate(lexicon):
        word_index.setdefault(word, position)

    # Same preparation as for training: lower-case, tokenise, lemmatise and
    # count the occurrences of every lexicon entry.
    tokens = [lemmatizer.lemmatize(token)
              for token in word_tokenize(input_data.lower())]
    features = np.zeros(len(lexicon))
    for token in tokens:
        position = word_index.get(token.lower())
        if position is not None:
            features[position] += 1

    with tf.Session(graph=graph) as sess:
        _restore_trained_model(sess, init, saver)
        result = sess.run(predicted_class, feed_dict={inputs: [features]})

    # Labels are written as [1, 0] for negative and [0, 1] for positive, so
    # class index 0 means negative and class index 1 means positive.
    if result[0] == 0:
        print('Negative:', input_data)
    elif result[0] == 1:
        print('Positive:', input_data)


def test_neural_network(input_data=None):
    """Evaluate the trained model on the test set or classify one string.

    This is a plain function rather than a ``functools.singledispatch``
    generic, because single dispatch selects an implementation from the type
    of the first positional argument and so cannot serve the zero-argument
    call.

    Args:
        input_data: ``None`` (the default) evaluates the model on
            ``processed-test-set.csv`` and prints the accuracy. A string is
            classified and its predicted sentiment is printed.

    Raises:
        TypeError: If ``input_data`` is neither ``None`` nor a string.
    """
    if input_data is None:
        _evaluate_test_set()
    elif isinstance(input_data, str):
        _classify_text(input_data)
    else:
        raise TypeError('input_data must be None or a str, got '
                        + type(input_data).__name__)


test_neural_network()

test_neural_network("I hate you")