
Commit

minor fix
shashankg7 committed Jan 10, 2017
1 parent ff0da3d commit f037912
Showing 3 changed files with 22 additions and 22 deletions.
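The change that repeats across all three scripts is a new np.random.seed(42) call near the top of each file; the other touched lines appear to be whitespace-only cleanups. As a reminder of why a fixed NumPy seed matters for these scripts, here is a minimal, stand-alone sketch (the data and calls below are illustrative, not part of this commit):

import numpy as np

# Seeding NumPy's global RNG makes every subsequent np.random call
# (shuffles, permutations, random draws) reproducible across runs.
np.random.seed(42)

data = np.arange(10)
np.random.shuffle(data)            # same permutation on every run
split = np.random.permutation(10)  # likewise deterministic
print(data)
print(split)

Because the seed sits near the top of each script, it runs before sklearn.utils.shuffle and the Keras training loops, so repeated runs should see the same shuffles and splits.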
16 changes: 8 additions & 8 deletions cnn.py
@@ -62,15 +62,15 @@
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}

np.random.seed(42)

def get_embedding(word):
#return
try:
return word2vec_model[word]
except Exception, e:
print 'Encoding not found: %s' %(word)
return np.zeros(EMBEDDING_DIM)
return np.zeros(EMBEDDING_DIM)

def get_embedding_weights():
embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
@@ -154,7 +154,7 @@ def gen_sequence():
y.append(y_map[tweet['label']])
return X, y


def Tokenize(tweet):
#return MyTokenizer.tokenize(tweet)
#pdb.set_trace()
@@ -182,8 +182,8 @@ def cnn_model(sequence_length, embedding_dim):

# Training parameters
# Word2Vec parameters, see train_word2vec
#min_word_count = 1 # Minimum word count
#context = 10 # Context window size
#min_word_count = 1 # Minimum word count
#context = 10 # Context window size

graph_in = Input(shape=(sequence_length, embedding_dim))
convs = []
@@ -196,7 +196,7 @@ def cnn_model(sequence_length, embedding_dim):
pool = GlobalMaxPooling1D()(conv)
#flatten = Flatten()(pool)
convs.append(pool)

if len(filter_sizes)>1:
out = Merge(mode='concat')(convs)
else:
@@ -277,7 +277,7 @@ def train_CNN(X, y, inp_dim, model, weights, epochs=10, batch_size=128):
tweets = Tweets
gen_vocab()
#filter_vocab(20000)
X, y = gen_sequence()
X, y = gen_sequence()
#Y = y.reshape((len(y), 1))
MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)
@@ -287,7 +287,7 @@ def train_CNN(X, y, inp_dim, model, weights, epochs=10, batch_size=128):
W = get_embedding_weights()
model = cnn_model(data.shape[1], EMBEDDING_DIM)
train_CNN(data, y, EMBEDDING_DIM, model, W)

pdb.set_trace()
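For context on the cnn.py hunks above: cnn_model builds the usual multi-filter-width text CNN, with one convolution plus global-max-pooling branch per filter size, merged by concatenation (the old Keras 1 Merge(mode='concat') layer). A rough equivalent in the current Keras functional API, with illustrative filter sizes and an assumed softmax head, looks like this:

from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Concatenate, Dense
from keras.models import Model

def text_cnn(sequence_length, embedding_dim, num_classes=3,
             filter_sizes=(3, 4, 5), num_filters=128):
    # One Conv1D + GlobalMaxPooling1D branch per filter width.
    graph_in = Input(shape=(sequence_length, embedding_dim))
    convs = []
    for fsz in filter_sizes:
        conv = Conv1D(num_filters, fsz, activation='relu')(graph_in)
        convs.append(GlobalMaxPooling1D()(conv))
    # Concatenate the pooled branches (Merge(mode='concat') in Keras 1).
    out = Concatenate()(convs) if len(convs) > 1 else convs[0]
    out = Dense(num_classes, activation='softmax')(out)
    model = Model(inputs=graph_in, outputs=out)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

Concatenate stands in for Merge here; the branch structure mirrors what the diff shows, while num_classes and the output layer are assumptions.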


14 changes: 7 additions & 7 deletions fast_text.py
@@ -38,7 +38,7 @@
print('Found %s texts. (samples)' % len(texts))

EMBEDDING_DIM = int(sys.argv[1])

np.random.seed(42)
# Load the orginal glove file
# SHASHANK files
#GLOVE_MODEL_FILE="/home/shashank/DL_NLP/glove-twitter" + str(EMBEDDING_DIM) + "-w2v"
@@ -66,7 +66,7 @@ def get_embedding(word):
return word2vec_model[word]
except Exception, e:
print 'Encoding not found: %s' %(word)
return np.zeros(EMBEDDING_DIM)
return np.zeros(EMBEDDING_DIM)

def get_embedding_weights():
embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
@@ -150,7 +150,7 @@ def gen_sequence():
y.append(y_map[tweet['label']])
return X, y


def Tokenize(tweet):
#return MyTokenizer.tokenize(tweet)
#pdb.set_trace()
@@ -217,7 +217,7 @@ def train_fasttext(X, y, model, inp_dim,embedding_weights, epochs=10, batch_size
r1 += recall_score(y_test, y_pred, average='micro')
f1 += f1_score(y_test, y_pred, average='weighted')
f11 += f1_score(y_test, y_pred, average='micro')

print "macro results are"
print "average precision is %f" %(p/10)
print "average recall is %f" %(r/10)
@@ -235,7 +235,7 @@ def check_semantic_sim(embedding_table, word):
sim_word_idx = get_similar_words(embedding_table, embedding_table[vocab[word]], 25)
sim_words = map(lambda x:reverse_vocab[x[1]], sim_word_idx)
print sim_words

def tryWord(embedding_table):
while True:
print "enter word"
@@ -253,15 +253,15 @@ def tryWord(embedding_table):
Tweets = select_tweets()
tweets = Tweets
gen_vocab()
X, y = gen_sequence()
X, y = gen_sequence()
MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)
data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
y = np.array(y)
W = get_embedding_weights()
data, y = sklearn.utils.shuffle(data, y)
model = fast_text_model(data.shape[1])
_ = train_fasttext(data, y, model, EMBEDDING_DIM, W)
_ = train_fasttext(data, y, model, EMBEDDING_DIM, W)
table = model.layers[0].get_weights()[0]
#check_semantic_sim(table)
tryWord(table)
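check_semantic_sim and tryWord above rely on get_similar_words, which is defined outside the hunks shown in this diff. Judging only from the call site (it takes the embedding table, a query vector, and a count, and returns pairs whose second element is a row index into the vocabulary), a plausible minimal implementation is a cosine-similarity ranking over the embedding rows — a sketch, not the repository's actual code:

import numpy as np

def get_similar_words(embedding_table, query_vec, k):
    # Cosine similarity between the query vector and every embedding row.
    row_norms = np.linalg.norm(embedding_table, axis=1)
    denom = np.maximum(row_norms * np.linalg.norm(query_vec), 1e-8)
    sims = embedding_table.dot(query_vec) / denom
    top = np.argsort(-sims)[:k]
    # (score, index) pairs, matching the caller's use of x[1] as a vocab index.
    return [(sims[i], i) for i in top]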
14 changes: 7 additions & 7 deletions lstm.py
@@ -50,7 +50,7 @@
MAX_NB_WORDS = None
VALIDATION_SPLIT = 0.2
word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)

np.random.seed(42)

# vocab generation
MyTokenizer = tokenize.casual.TweetTokenizer(strip_handles=True, reduce_len=True)
@@ -65,7 +65,7 @@ def get_embedding(word):
return word2vec_model[word]
except Exception, e:
print 'Encoding not found: %s' %(word)
return np.zeros(EMBEDDING_DIM)
return np.zeros(EMBEDDING_DIM)

def get_embedding_weights():
embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
@@ -149,7 +149,7 @@ def gen_sequence():
y.append(y_map[tweet['label']])
return X, y


def Tokenize(tweet):
#return MyTokenizer.tokenize(tweet)
#pdb.set_trace()
@@ -204,7 +204,7 @@ def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
print x.shape, y.shape
loss, acc = model.train_on_batch(x, y_temp)#, class_weight=class_weights)
print loss, acc

y_pred = model.predict_on_batch(X_test)
y_pred = np.argmax(y_pred, axis=1)
print classification_report(y_test, y_pred)
@@ -216,7 +216,7 @@ def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
r1 += recall_score(y_test, y_pred, average='micro')
f1 += f1_score(y_test, y_pred, average='weighted')
f11 += f1_score(y_test, y_pred, average='micro')


print "macro results are"
print "average precision is %f" %(p/10)
@@ -234,7 +234,7 @@ def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
tweets = Tweets
gen_vocab()
#filter_vocab(20000)
X, y = gen_sequence()
X, y = gen_sequence()
#Y = y.reshape((len(y), 1))
MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)
@@ -245,5 +245,5 @@ def train_LSTM(X, y, model, inp_dim, weights, epochs=10, batch_size=512):
model = lstm_model(data.shape[1], EMBEDDING_DIM)
#model = lstm_model(data.shape[1], 25, get_embedding_weights())
train_LSTM(data, y, model, EMBEDDING_DIM, W)

pdb.set_trace()
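The train_CNN, train_fasttext and train_LSTM loops shown above all accumulate precision, recall and F1 across what looks like ten cross-validation folds and then divide by 10. A condensed sketch of that evaluation pattern with scikit-learn (cross_val_report and model_fn are placeholders, not names from the repository):

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

def cross_val_report(model_fn, X, y, n_splits=10):
    # Average weighted precision/recall/F1 over stratified folds.
    p = r = f = 0.0
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_idx, test_idx in skf.split(X, y):
        model = model_fn()
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])
        p += precision_score(y[test_idx], y_pred, average='weighted')
        r += recall_score(y[test_idx], y_pred, average='weighted')
        f += f1_score(y[test_idx], y_pred, average='weighted')
    print("average precision is %f" % (p / n_splits))
    print("average recall is %f" % (r / n_splits))
    print("average f1 is %f" % (f / n_splits))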
