Commit

Update cnn.py & lstm.py
pinkeshbadjatiya committed Jan 15, 2017
1 parent 4bfdbd3 commit 57e60e0
Showing 2 changed files with 121 additions and 67 deletions.
165 changes: 100 additions & 65 deletions cnn.py
@@ -1,11 +1,10 @@
from data_handler import get_data
from keras.preprocessing.text import Tokenizer
import argparse
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
from preprocess_twitter import tokenize as tokenizer_g
import pdb
from nltk import tokenize
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
@@ -21,48 +20,42 @@
from batch_gen import batch_gen
import sys

# Load the original glove file
# SHASHANK files
#GLOVE_MODEL_FILE="/home/shashank/data/embeddings/GloVe/glove-twitter25-w2v"
from nltk import tokenize as tokenize_nltk
from my_tokenizer import glove_tokenize


### Preparing the text data
texts = [] # list of text samples
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
label_map = {
'none': 0,
'racism': 1,
'sexism': 2
}
tweet_data = get_data()
for tweet in tweet_data:
texts.append(tweet['text'])
labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))

EMBEDDING_DIM = int(sys.argv[1])

# Load the original glove file
# SHASHANK files
GLOVE_MODEL_FILE="/home/shashank/DL_NLP/glove-twitter" + str(EMBEDDING_DIM) + "-w2v"


# PINKESH files
#GLOVE_MODEL_FILE="/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B." + str(EMBEDDING_DIM) + "d.txt"
NO_OF_CLASSES=3

MAX_NB_WORDS = None
VALIDATION_SPLIT = 0.2
word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)


# vocab generation
MyTokenizer = tokenize.casual.TweetTokenizer(strip_handles=True, reduce_len=True)
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}
np.random.seed(42)



EMBEDDING_DIM = None
GLOVE_MODEL_FILE = None
NO_OF_CLASSES=3

SEED = 42
NO_OF_FOLDS = 10
CLASS_WEIGHT = None
LOSS_FUN = None
OPTIMIZER = None
TOKENIZER = None
INITIALIZE_WEIGHTS_WITH = None
LEARN_EMBEDDINGS = None
EPOCHS = 10
BATCH_SIZE = 128
SCALE_LOSS_FUN = None


word2vec_model = None



def get_embedding(word):
#return
@@ -94,22 +87,21 @@ def select_tweets():
tweet_return = []
for tweet in tweets:
_emb = 0
words = Tokenize(tweet['text']).split()
words = TOKENIZER(tweet['text'].lower())
for w in words:
if w in word2vec_model: # Check if embedding is there in the GloVe model
_emb+=1
if _emb: # Not a blank tweet
tweet_return.append(tweet)
print 'Tweets selected:', len(tweet_return)
#pdb.set_trace()
return tweet_return


def gen_vocab():
# Processing
vocab_index = 1
for tweet in tweets:
text = Tokenize(tweet['text'])
text = TOKENIZER(tweet['text'].lower())
text = ''.join([c for c in text if c not in punctuation])
words = text.split()
words = [word for word in words if word not in STOPWORDS]
@@ -122,12 +114,10 @@ def gen_vocab():
freq[word] += 1
vocab['UNK'] = len(vocab) + 1
reverse_vocab[len(vocab)] = 'UNK'
#pdb.set_trace()


def filter_vocab(k):
global freq, vocab
#pdb.set_trace()
freq_sorted = sorted(freq.items(), key=operator.itemgetter(1))
tokens = freq_sorted[:k]
vocab = dict(zip(tokens, range(1, len(tokens) + 1)))
@@ -143,7 +133,7 @@ def gen_sequence():

X, y = [], []
for tweet in tweets:
text = Tokenize(tweet['text'])
text = TOKENIZER(tweet['text'].lower())
text = ''.join([c for c in text if c not in punctuation])
words = text.split()
words = [word for word in words if word not in STOPWORDS]
@@ -155,16 +145,9 @@
return X, y


def Tokenize(tweet):
#return MyTokenizer.tokenize(tweet)
#pdb.set_trace()
return tokenizer_g(tweet)


def shuffle_weights(model):
weights = model.get_weights()
weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]
#pdb.set_trace()
model.set_weights(weights)


@@ -207,46 +190,56 @@ def cnn_model(sequence_length, embedding_dim):
# main sequential model
model = Sequential()
#if not model_variation=='CNN-rand':
model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=False))
model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS))
model.add(Dropout(dropout_prob[0]))#, input_shape=(sequence_length, embedding_dim)))
model.add(graph)
model.add(Dropout(dropout_prob[1]))
model.add(Activation('relu'))
model.add(Dense(n_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])
print model.summary()
return model

def train_CNN(X, y, inp_dim, model, weights, epochs=10, batch_size=128):
cv_object = KFold(n_splits=10, shuffle=True, random_state=42)

def train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
print cv_object
p, r, f1 = 0., 0., 0.
p1, r1, f11 = 0., 0., 0.
sentence_len = X.shape[1]
for train_index, test_index in cv_object.split(X):
shuffle_weights(model)
model.layers[0].set_weights([weights])
X_train, y_train = X[train_index], y[train_index]
if INITIALIZE_WEIGHTS_WITH == "glove":
model.layers[0].set_weights([weights])
elif INITIALIZE_WEIGHTS_WITH == "random":
shuffle_weights(model)
else:
print "ERROR!"
return

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
#pdb.set_trace()
y_train = y_train.reshape((len(y_train), 1))
X_temp = np.hstack((X_train, y_train))
for epoch in xrange(epochs):
for X_batch in batch_gen(X_temp, batch_size):
x = X_batch[:, :sentence_len]
y_temp = X_batch[:, sentence_len]
class_weights = {}
class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))

class_weights = None
if SCALE_LOSS_FUN:
class_weights = {}
class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))

try:
y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
except Exception as e:
print e
print y_temp
print x.shape, y.shape
loss, acc = model.train_on_batch(x, y_temp)#, class_weight=class_weights)
loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)
print loss, acc
y_pred = model.predict_on_batch(X_test)
y_pred = np.argmax(y_pred, axis=1)
@@ -261,17 +254,59 @@ def train_CNN(X, y, inp_dim, model, weights, epochs=10, batch_size=128):
f11 += f1_score(y_test, y_pred, average='micro')

print "macro results are"
print "average precision is %f" %(p/10)
print "average recall is %f" %(r/10)
print "average f1 is %f" %(f1/10)
print "average precision is %f" %(p/NO_OF_FOLDS)
print "average recall is %f" %(r/NO_OF_FOLDS)
print "average f1 is %f" %(f1/NO_OF_FOLDS)

print "micro results are"
print "average precision is %f" %(p1/10)
print "average recall is %f" %(r1/10)
print "average f1 is %f" %(f11/10)
print "average precision is %f" %(p1/NO_OF_FOLDS)
print "average recall is %f" %(r1/NO_OF_FOLDS)
print "average f1 is %f" %(f11/NO_OF_FOLDS)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='CNN based models for twitter Hate speech detection')
parser.add_argument('-f', '--embeddingfile', required=True)
parser.add_argument('-d', '--dimension', required=True)
parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
parser.add_argument('--loss', default=LOSS_FUN, required=True)
parser.add_argument('--optimizer', default=OPTIMIZER, required=True)
parser.add_argument('--epochs', default=EPOCHS, required=True)
parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)
parser.add_argument('-s', '--seed', default=SEED)
parser.add_argument('--folds', default=NO_OF_FOLDS)
parser.add_argument('--class_weight')
parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)
parser.add_argument('--learn-embeddings', action='store_true', default=False)
parser.add_argument('--scale-loss-function', action='store_true', default=False)
args = parser.parse_args()

GLOVE_MODEL_FILE = args.embeddingfile
EMBEDDING_DIM = int(args.dimension)
SEED = int(args.seed)
NO_OF_FOLDS = int(args.folds)
CLASS_WEIGHT = args.class_weight
LOSS_FUN = args.loss
OPTIMIZER = args.optimizer
if args.tokenizer == "glove":
TOKENIZER = glove_tokenize
elif args.tokenizer == "nltk":
TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize
INITIALIZE_WEIGHTS_WITH = args.initialize_weights
LEARN_EMBEDDINGS = args.learn_embeddings
EPOCHS = int(args.epochs)
BATCH_SIZE = int(args.batch_size)
SCALE_LOSS_FUN = args.scale_loss_function



print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)
print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS))

word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)
np.random.seed(SEED)


Tweets = select_tweets()
tweets = Tweets
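With this change, every hyper-parameter of cnn.py comes from the new argparse flags instead of being hard-coded. A hypothetical invocation of the updated script might look like the following (the embedding path is a placeholder, and categorical_crossentropy/adam simply mirror the values the old hard-coded compile call used):

python cnn.py -f /path/to/glove-twitter-200d-w2v -d 200 --tokenizer glove --loss categorical_crossentropy --optimizer adam --epochs 10 --batch-size 128 --initialize-weights glove --learn-embeddings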
23 changes: 21 additions & 2 deletions lstm.py
@@ -48,6 +48,9 @@
MAX_SEQUENCE_LENGTH = None
INITIALIZE_WEIGHTS_WITH = None
LEARN_EMBEDDINGS = None
EPOCHS = 10
BATCH_SIZE = 512
SCALE_LOSS_FUN = None

word2vec_model = None

@@ -162,7 +165,7 @@ def lstm_model(sequence_length, embedding_dim):
return model


def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
print cv_object
p, r, f1 = 0., 0., 0.
@@ -184,13 +187,21 @@ def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
for X_batch in batch_gen(X_temp, batch_size):
x = X_batch[:, :sentence_len]
y_temp = X_batch[:, sentence_len]

class_weights = None
if SCALE_LOSS_FUN:
class_weights = {}
class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))

try:
y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
except Exception as e:
print e
print y_temp
print x.shape, y.shape
loss, acc = model.train_on_batch(x, y_temp)#, class_weight=class_weights)
loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)
print loss, acc

y_pred = model.predict_on_batch(X_test)
@@ -224,12 +235,16 @@ def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
parser.add_argument('--loss', default=LOSS_FUN, required=True)
parser.add_argument('--optimizer', default=OPTIMIZER, required=True)
parser.add_argument('--epochs', default=EPOCHS, required=True)
parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)
parser.add_argument('-s', '--seed', default=SEED)
parser.add_argument('--folds', default=NO_OF_FOLDS)
parser.add_argument('--kernel', default=KERNEL)
parser.add_argument('--class_weight')
parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)
parser.add_argument('--learn-embeddings', action='store_true', default=False)
parser.add_argument('--scale-loss-function', action='store_true', default=False)


args = parser.parse_args()
GLOVE_MODEL_FILE = args.embeddingfile
@@ -246,6 +261,10 @@ def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize
INITIALIZE_WEIGHTS_WITH = args.initialize_weights
LEARN_EMBEDDINGS = args.learn_embeddings
EPOCHS = int(args.epochs)
BATCH_SIZE = int(args.batch_size)
SCALE_LOSS_FUN = args.scale_loss_function



np.random.seed(SEED)
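Analogously, lstm.py is now driven entirely from the command line. A hypothetical run, assuming the embedding-file and dimension flags in the truncated part of this hunk match cnn.py's -f and -d, might be:

python lstm.py -f /path/to/glove-twitter-200d-w2v -d 200 --tokenizer nltk --loss categorical_crossentropy --optimizer adam --epochs 10 --batch-size 512 --initialize-weights random --scale-loss-function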
