In [73]:
########################################
## import packages
########################################
from __future__ import print_function
import neat
import visualize
import os
import re
import csv
import sys
from datetime import datetime
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import roc_auc_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Flatten, Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.layers import Bidirectional, GlobalMaxPool1D,GlobalMaxPooling1D,GlobalAveragePooling1D ,Conv1D, MaxPooling1D, GRU,CuDNNLSTM,CuDNNGRU, Reshape, MaxPooling1D,AveragePooling1D
from keras.optimizers import RMSprop, SGD
from keras.utils import to_categorical

import colorama
from colorama import Fore


from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt

########################################
## set directories and parameters
########################################


from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints

In [75]:
# Program Parameters
TRAIN_DATA_FILE = "train.csv"
ga_config = "ga_config.txt" # Parameters for the genetic algorithm
max_features = 250000 # Maximum Number of Words in Dictionary
maxlen = 300   # Maximum Sequence Size

# Genetic Algorithm Hyperparameters
max_generations = 300
limit = 100 # limit number of rows to train per network
# Note: limit must match fitness_threshold in ga_config

In [17]:
# Load Files
print ("Loading Data from " + str(TRAIN_DATA_FILE))
final_validation_file = pd.read_csv(TRAIN_DATA_FILE)[:176230]  # (176230)
print(".")
test_df = pd.read_csv(TRAIN_DATA_FILE)[176231:511230] # (335000)
print(".")
train_df = pd.read_csv(TRAIN_DATA_FILE)[511231:1672297]
print("done!")
total_rows = len(train_df.index)


Loading Data from train.csv
.
.
done!


In [18]:
# Data preview
print(train_df.head())
print("Total rows: ", total_rows)

                                               ReviewText  Rating
511231  Only had it for 2 weeks, but so far it seems f...       4
511232  Great to deal with this company.  Turned out m...       5
511233  It does the job for a very low price what more...       5
511234  My brother will be very happy, I gave him my H...       5
511235  I do not subject my battery to a lot of wear a...       2
Total rows:  1161066


In [19]:
print("Prepare Data ")
print(".")
list_sentences_train = train_df["ReviewText"].fillna("NA").values
list_classes = ["negative", "somewhat negative", "neutral", "somewhat positive", "positive"]
num_classes=5
print(".")
#y = train_df[list_classes].values
target=train_df['Rating'].values
y1=to_categorical(target)
y=np.delete(y1, 0, axis=1)
print(".")
list_sentences_test = test_df["ReviewText"].fillna("NA").values
yaux=y[:,[0]]
print("done!")


Prepare Data 
.
.
.
done!


In [30]:
# Split training sentences into arrays of words 

comments = []
for text in list_sentences_train:
    comments.append(text)
    

In [31]:
# Split test sentences into arrays of words 

test_comments=[]
for text in list_sentences_test:
    test_comments.append(text)

In [32]:
print("Create an index and summary of words in word arrays")
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(comments + test_comments)

print("Convert arrays of words into bag-of-word arrays")
sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)


Create an index and summary of words in word arrays
Convert arrays of words into bag-of-word arrays


In [33]:
print("Print a summary of data")
print(len(sequences), 'train sequences')
print(len(test_sequences), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, sequences)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, test_sequences)), dtype=int)))
print(len(sequences), 'train sequences')
print(len(test_sequences), 'test sequences')
print('Max train sequence length: {}'.format(np.max(list(map(len, sequences)))))
print('Max test sequence length: {}'.format(np.max(list(map(len, test_sequences)))))
print(len(sequences), 'train sequences')
print(len(test_sequences), 'test sequences')
print('Min train sequence length: {}'.format(np.min(list(map(len, sequences)))))
print('Min test sequence length: {}'.format(np.min(list(map(len, test_sequences)))))

Print a summary of data
1161066 train sequences
334999 test sequences
Average train sequence length: 119
Average test sequence length: 118
1161066 train sequences
334999 test sequences
Max train sequence length: 6344
Max test sequence length: 5626
1161066 train sequences
334999 test sequences
Min train sequence length: 0
Min test sequence length: 0


In [41]:
print("Example conversion of text to padded Sequence:")
print(comments[0])
print(data[0])

Example conversion of text to padded Sequence:
Only had it for 2 weeks, but so far it seems functional. While editing photos (pushing the computer hard) the battery lasts 2.5'ish hours.Placed it in a Dv6 1030 laptop.
[     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
  

In [36]:
# Name word index
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Format data so that it'll fit into the neural network
data = pad_sequences(sequences, maxlen=maxlen)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=maxlen)
print('Shape of test_data tensor:', test_data.shape)

Found 508584 unique tokens
Shape of data tensor: (1161066, 300)
Shape of label tensor: (1161066, 5)
Shape of test_data tensor: (334999, 300)


In [58]:
# Defines the fitness function for a genome as the accuracy of its neural network
# Evaluates the fitness by building and testing the evolved neural network

def eval_genomes(genomes, config):
    for genome_id, genome in genomes:
        genome.fitness = limit # max possible fitness 
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        for xi, xo in zip(data[:limit], y[:limit]):
            output = net.activate(xi)
            genome.fitness -= (output[0] - xo[0]) ** 2


In [66]:
# Runs the genetic algorithm
def run_ga(config_file):
    # Load configuration.
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_file)

    # Create the population, which is the top-level object for a NEAT run.
    p = neat.Population(config)

    # Add a stdout reporter to show progress in the terminal.
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)
    p.add_reporter(neat.Checkpointer(5))

    # Run for up to 300 generations.
    winner = p.run(eval_genomes, max_generations)

    # Display the winning genome (Very ugly)
    # print('\nBest genome:\n{!s}'.format(winner))

    # Show output of the most fit genome against training data.
    print('\nOutput:')
    winner_net = neat.nn.FeedForwardNetwork.create(winner, config)
    for xi, xo in zip(data[:limit], y[:limit]):
        output = winner_net.activate(xi)
        print("input {!r}, expected output {!r}, got {!r}".format(xi, xo, output))

#     visualize.draw_net(config, winner, True)
#     visualize.plot_stats(stats, ylog=False, view=True)
#     visualize.plot_species(stats, view=True)

#     p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-4')
    p.run(eval_genomes, 10)

In [None]:
run_ga(ga_config)


 ****** Running generation 0 ****** 

Population's average fitness: 49.52667 stdev: 9.96306
Best fitness: 75.00000 - size: (5, 1500) - species 1 - id 19
Average adjusted fitness: 0.491
Mean genetic distance 1.287, standard deviation 0.195
Population of 150 members in 1 species:
   ID   age  size  fitness  adj fit  stag
     1    0   150     75.0    0.491     0
Total extinctions: 0
Generation time: 46.077 sec

 ****** Running generation 1 ****** 

Population's average fitness: 60.04664 stdev: 7.56955
Best fitness: 76.00000 - size: (6, 1497) - species 1 - id 213
Average adjusted fitness: 0.591
Mean genetic distance 1.360, standard deviation 0.199
Population of 150 members in 1 species:
   ID   age  size  fitness  adj fit  stag
     1    1   150     76.0    0.591     0
Total extinctions: 0
Generation time: 43.485 sec (44.781 average)

 ****** Running generation 2 ****** 

Population's average fitness: 67.98581 stdev: 6.20304
Best fitness: 82.00000 - size: (5, 1476) - species 1 - id 302
A