# In [None]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation , Embedding , Flatten
from keras.callbacks import Callback
from keras.optimizers import Adam
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords
import re

# Print NumPy arrays in full instead of abbreviating long ones.
np.set_printoptions(threshold=np.inf)
# Only the first sample_size rows of the dataset are processed, to make
# test runs of this script faster than working on every row.
sample_size = 50000
# Maximum number of tokens kept per blog post; the same value is used as the
# padded sequence length when the posts are encoded.
max_length = 500
# Stemmer instance shared by reduce_vocab().
ps = PorterStemmer()

def get_word_dict(docs_to_process):
    """Count how often each word occurs across a collection of texts.

    A word is any maximal run of word characters and apostrophes; matching
    is case-sensitive, so callers should normalise case beforehand.

    Args:
        docs_to_process: iterable of strings to scan.

    Returns:
        dict mapping each word to its total number of occurrences, with keys
        in order of first appearance.
    """
    counts = {}
    for document in docs_to_process:
        for token in re.findall(r"[\w']+", document):
            counts[token] = counts.get(token, 0) + 1
    return counts

def make_gender_list(gender_list):
    """Encode gender labels as a float array: 1 for 'male', 0 otherwise.

    The comparison is an exact match against the string 'male'.
    Prints the shape of the encoded array as a side effect.

    Args:
        gender_list: array-like exposing ``shape`` and integer indexing
            (the script passes a 1-D NumPy column of label values).

    Returns:
        1-D NumPy float array with one entry per label.
    """
    n_labels = gender_list.shape[0]
    encoded = np.zeros(n_labels)
    print("gender list shape: " + str(encoded.shape))
    for pos in range(n_labels):
        encoded[pos] = 1 if gender_list[pos] == 'male' else 0
    return encoded


def reduce_vocab(docs, word_limit, sparsewords=False):
    """Shrink the vocabulary of every blog post in ``docs``, in place.

    Each post is tokenized with NLTK's ``word_tokenize``, cut to its first
    ``word_limit`` tokens, stripped of stop words, optionally stripped of
    infrequent words, stemmed, and written back as a space-joined string.

    Relies on the module-level ``stop_words`` and ``ps``, and on
    ``infrequent_words`` only when ``sparsewords`` is true.

    Args:
        docs: mutable, integer-indexable sequence of strings; each element is
            overwritten with its reduced form.
        word_limit: maximum number of tokens kept per post. The cut happens
            before stop words are removed, so the result can be shorter.
        sparsewords: when true, also drop tokens found in ``infrequent_words``.

    Returns:
        None; ``docs`` is modified in place.
    """
    # Build the lookup set once. Testing membership against the list itself
    # costs a linear scan per token, which made this pass very slow.
    rare_words = set(infrequent_words) if sparsewords else None

    for i, blogpost in enumerate(docs):
        tokens = word_tokenize(blogpost)[:word_limit]
        kept = [w for w in tokens if w not in stop_words]

        if rare_words is not None:
            # infrequent_words holds unstemmed words, so this filter must run
            # before stemming; comparing stems against raw forms would miss
            # most of them and could drop unrelated frequent stems.
            kept = [w for w in kept if w not in rare_words]

        # WORD STEMMING:
        docs[i] = ' '.join(ps.stem(w) for w in kept)


# Load the blog dataset. The bare df.head() below does not store its result.
df = pd.read_json("data.json")
df.head()

# English stop words as a set.
stop_words = set(stopwords.words('english'))
print(stop_words)

# Keep only the first sample_size rows of the raw value matrix.
values = df.values[:sample_size]
print(values.shape)

# Column 2 is used as the post text and columns 0-1 as the labels.
# NOTE(review): this assumes the column order (age, gender, text) in
# data.json -- confirm against the actual file.
docs = values[:, 2]
labels = values[:, :2]

# Lower-case every post in place.
for idx, post in enumerate(docs):
    docs[idx] = post.lower()


# Build a word-frequency dictionary over the blog posts to see how many
# distinct words they contain, which informs the choice of vocab_size.
# Words that occur at most once are also collected in a separate list so
# they can be removed later, to check whether that helps the modelling phase.

docs_to_process = docs

word_dict = get_word_dict(docs_to_process)

# Split the vocabulary into words seen at most once and everything else.
infrequent_words = [word for word, freq in word_dict.items() if freq <= 1]
word_dict_small = {word: freq for word, freq in word_dict.items() if freq > 1}

print("Size of the word dictionary: " + str(len(word_dict)))
print("Size without infrequent words: " + str(len(word_dict_small)))



    
# Reduce every post in place. Removal of sparse/infrequent words is left
# disabled because it has been slow; a different way to reduce the
# vocabulary size may be needed.
reduce_vocab(docs, max_length, sparsewords=False)
gender_list_binary = make_gender_list(labels[:, 1])

# Size the hashing space from the number of distinct words that remain after
# preprocessing. This previously used len(docs), i.e. the number of posts,
# which is unrelated to the vocabulary and left final_wd unused.
final_wd = get_word_dict(docs)
vocab_size = len(final_wd)

# NOTE(review): one_hot is documented as hashing words with Python's built-in
# hash(), which for strings varies between interpreter runs unless
# PYTHONHASHSEED is fixed. Word ids in out2.json may therefore differ from run
# to run -- confirm whether downstream code depends on stable ids.
encoded_docs = [
    keras.preprocessing.text.one_hot(d, vocab_size, filters='') for d in docs
]
padded_docs = keras.preprocessing.sequence.pad_sequences(
    encoded_docs, maxlen=max_length, padding='post'
)

# Store each padded sequence as one comma-separated string so that a post
# occupies a single field in the output.
padded_docs_unlistified = [
    ','.join(str(token_id) for token_id in row) for row in padded_docs
]

dataout = {
    'age': labels[:, 0],
    'gender': gender_list_binary,
    'post': padded_docs_unlistified,
}

dfout = pd.DataFrame(data=dataout)
dfout.to_json('out2.json')