In [2]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation , Embedding , Flatten, Dropout, Convolution1D
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.optimizers import Adam
import re
from keras.optimizers import SGD

#from keras.callbacks import TensorBoard
# males -> 1
# females -> 0

# Read the preprocessed data set. The printed head shows the columns
# 'age', 'gender' and 'post'.
df_processed = pd.read_json("out2.json")
print("df head: " + str(df_processed.head()))

# Raw value matrix; kept because the split code below reads the sample count
# from its shape.
values_processed = df_processed.values

# Select columns by name instead of by position: positional indexing on the
# mixed-type `.values` matrix depends on the column order and yields
# object-dtype arrays.
docs_padded_string = df_processed["post"].values

# Each 'post' entry contains the encoded ints separated by ','-s, so it has to
# be split and converted back to int before use. max_length is taken from the
# first entry; a row with a different number of tokens makes the assignment
# in the loop below raise a ValueError instead of being silently truncated.
max_length = len(docs_padded_string[0].split(','))
print("max_length: " + str(max_length))

# Token ids are integers, so store them as such (they are used as indices
# into the embedding table).
padded_docs = np.zeros((len(docs_padded_string), max_length), dtype=np.int32)
for row, encoded in enumerate(docs_padded_string):
    padded_docs[row] = [int(token) for token in encoded.split(",")]

# Binary labels (see the legend above: males -> 1, females -> 0).
gender_list = df_processed["gender"].values
# check finalized word_dict:


# Carve the samples, in their stored order, into consecutive
# train / validation / test portions.
valid_split = 0.2
test_split = 0.1
nb_samples = values_processed.shape[0]

# Row indices at which the training and the validation portions end.
train_end = int(nb_samples*(1-valid_split-test_split))
valid_end = int(nb_samples*(1-test_split))

train_rows = slice(0, train_end)
valid_rows = slice(train_end, valid_end)
test_rows = slice(valid_end, None)

X_train, Y_train = padded_docs[train_rows], gender_list[train_rows]
X_valid, Y_valid = padded_docs[valid_rows], gender_list[valid_rows]
X_test, Y_test = padded_docs[test_rows], gender_list[test_rows]

# Report the shapes of the training and validation portions.
for prefix, portion in (
    ("X_train shape: ", X_train),
    ("Y_train shape: ", Y_train),
    ("X_valid shape: ", X_valid),
    ("Y_valid shape: ", Y_valid),
):
    print(prefix + str(portion.shape))

# Size of the embedding table. Keras' Embedding layer expects
# input_dim = largest integer index + 1; len(padded_docs) would be the number
# of samples, which only matches the vocabulary size by coincidence.
# NOTE(review): if the word dictionary built during preprocessing is
# available, its size (+1 for padding) is the authoritative value.
vocab_size = int(padded_docs.max()) + 1

# Stop once the monitored validation metric has not improved for `patience`
# epochs, and keep the best weights seen so far on disk.
patience = 20
weights_path = 'weights.hdf5'
early_stopping = EarlyStopping(patience=patience, verbose=1)
checkpointer = ModelCheckpoint(filepath=weights_path, save_best_only=True, verbose=1)

model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=max_length))

# Convolutional model (5x conv, flatten, 2x dense)
# NOTE(review): the conv layers have no activation, so the stack is purely
# linear up to the first Dense layer -- confirm whether activation='relu'
# was intended.
model.add(Convolution1D(256, 5, padding='same'))
model.add(Convolution1D(128, 5, padding='same'))
model.add(Convolution1D(64, 5, padding='same'))
model.add(Convolution1D(32, 5, padding='same'))
model.add(Convolution1D(16, 5, padding='same'))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(180, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: probability of label 1.
model.add(Dense(1, activation='sigmoid'))

sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.8, nesterov=True)
# compile the model
# Pass the configured optimizer object; the string 'sgd' would create a
# default SGD and silently ignore the settings above.
model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model (summary() prints by itself and returns None)
model.summary()
print("X_test shape: " + str(X_test.shape))
print("Y_test shape: " + str(Y_test.shape))

model.fit(X_train, Y_train, epochs=100, batch_size=64, verbose=1,
          validation_data=(X_valid, Y_valid),
          shuffle=True, callbacks=[checkpointer, early_stopping])

# Evaluation on the test set
# Restore the best checkpoint first: after early stopping the in-memory
# weights are those of the last epoch, up to `patience` epochs past the best.
model.load_weights(weights_path)
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

df head:       age  gender                                               post
0      27       1  46408,46022,46703,11417,43813,49750,5081,20544...
1      25       1  13387,42001,39064,25330,28879,29212,25249,9543...
10     37       0  43492,18424,18424,18424,18424,32566,4029,5244,...
100    27       1  37946,45632,45632,20041,17489,12804,35990,4503...
1000   26       1  34086,43188,31677,5081,28821,31677,7087,24819,...
max_length: 500
X_train shape: (35000, 500)
Y_train shape: (35000,)
X_valid shape: (10000, 500)
Y_valid shape: (10000,)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 64)           3200000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 256)          82176     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 500, 128)          163968