<a href="https://colab.research.google.com/github/manjotmb20/Text-Classification/blob/master/Colab's_New_Code_Editor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

In [0]:
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed,RepeatVector, Dense

In [12]:
class CharacterTable(object):
  """Bidirectional mapping between characters and one-hot rows.

  The alphabet is the sorted set of distinct characters in `chars`;
  a character's position in that sorted list is its class index.
  """

  def __init__(self, chars):
    """Build the sorted alphabet and the char<->index lookup tables."""
    self.chars = sorted(set(chars))
    self.char_indices = {ch: idx for idx, ch in enumerate(self.chars)}
    self.indices_char = {idx: ch for idx, ch in enumerate(self.chars)}

  def encode(self, C, num_rows):
    """One-hot encode string `C` into a (num_rows, alphabet size) array.

    Row k has a 1 in the column of the k-th character of `C`; rows past
    the end of `C` stay all-zero. Raises KeyError for a character that is
    not in the alphabet and IndexError when `C` has more than `num_rows`
    characters.
    """
    one_hot = np.zeros((num_rows, len(self.chars)))
    for row, ch in enumerate(C):
      column = self.char_indices[ch]
      one_hot[row, column] = 1
    return one_hot

  def decode(self, x, calc_argmax=True):
    """Turn one-hot/probability rows (or class indices) back into a string.

    With `calc_argmax` true, `x` is reduced with argmax over its last
    axis first; otherwise `x` is taken to be an iterable of class indices.
    """
    class_ids = x.argmax(axis=-1) if calc_argmax else x
    return ''.join(self.indices_char[idx] for idx in class_ids)

# --- Configuration -----------------------------------------------------
training_size=50000  # number of unique questions to generate
digits=5             # maximum number of digits drawn per operand
hidden_size=128      # LSTM width used when the model is built
batch_size=128       # mini-batch size passed to model.fit

# Longest question is '<digits>-<digits>': two operands plus the operator.
maxlen=digits+digits+1
chars='0123456789+- '
ctable=CharacterTable(chars)

_DIGIT_CHARS=list('0123456789')


def random_operand():
  """Return an int parsed from 1..`digits` randomly chosen decimal digits.

  Leading zeros collapse when the string is parsed, so the value lies in
  [0, 10**digits - 1].
  """
  ndigits=np.random.randint(1,digits+1)
  return int(''.join(np.random.choice(_DIGIT_CHARS) for _ in range(ndigits)))


# --- Generate unique subtraction questions -----------------------------
questions=[]
expected=[]
seen=set()
print('Generating Data.....')
while len(questions)<training_size:
  a, b=random_operand(), random_operand()
  # NOTE(review): the key is order-insensitive, so 'a-b' and 'b-a' are
  # treated as the same question even though their answers differ.
  # Confirm this is intended for subtraction.
  key=tuple(sorted((a,b)))
  if key in seen:
    continue
  seen.add(key)
  # Right-pad with spaces so every question/answer has a fixed length.
  query='{}-{}'.format(a,b).ljust(maxlen)
  ans=str(a-b).ljust(digits+1)  # digits+1 leaves room for a minus sign
  questions.append(query)
  expected.append(ans)
# NOTE(review): the message says "addition" although the questions are
# subtractions; the text is left unchanged to match the recorded output.
print('Total addition questions: ', len(questions))
print('Vectorization.....')

# --- One-hot encode ----------------------------------------------------
# Builtin `bool` instead of `np.bool`: the alias was deprecated in
# NumPy 1.20 and raises AttributeError on NumPy 1.24+.
x=np.zeros((len(questions),maxlen,len(chars)), dtype=bool)
y=np.zeros((len(questions),digits+1,len(chars)),dtype=bool)
for i, sentence in enumerate(questions):
  x[i]=ctable.encode(sentence,maxlen)
for i, sentence in enumerate(expected):
  y[i]=ctable.encode(sentence,digits+1)

# --- Shuffle, then hold out the last 10% for validation ----------------
indices=np.arange(len(y))
np.random.shuffle(indices)
x=x[indices]
y=y[indices]
split_at=len(x)-len(x)//10
(x_train, x_val)=x[:split_at], x[split_at:]
(y_train, y_val)=y[:split_at], y[split_at:]




Generating Data.....
Total addition questions:  50000
Vectorization.....


In [13]:
# Sequence-to-sequence model: the first LSTM reads the one-hot question
# and emits a single vector, RepeatVector copies that vector once per
# answer position, the second LSTM unrolls it into a sequence, and a
# per-timestep softmax picks one character per position.
model=Sequential([
  LSTM(hidden_size,input_shape=(maxlen, len(chars))),
  RepeatVector(digits+1),
  LSTM(hidden_size,return_sequences=True),
  TimeDistributed(Dense(len(chars),activation='softmax')),
])
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 128)               72704     
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 6, 128)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 6, 128)            131584    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 6, 13)             1677      
Total params: 205,965
Trainable params: 205,965
Non-trainable params: 0
_________________________________________________________________


# Richer syntax highlighting

Improved support for nested languages:

```notebook-python
df = pd.io.gbq.read_gbq('''
  SELECT 
    REGEXP_EXTRACT(name, '[a-zA-Z]+'),
    SUM(number) as count
  FROM `bigquery-public-data.usa_names.usa_1910_2013`
  WHERE state = 'TX'
  GROUP BY name
  ORDER BY count DESC
  LIMIT 100
''')
```

In [14]:
# Train one epoch per iteration; after each epoch print the model's answer
# for 10 randomly chosen validation questions next to the true answer.
for iteration in range(1,200):
  print()
  print('-'*50)
  print('Iteration', iteration)
  model.fit(x_train,y_train,batch_size=batch_size,epochs=1,validation_data=(x_val,y_val))

  for i in range(10):
    ind=np.random.randint(0,len(x_val))
    # Index with a length-1 array so the batch dimension is kept.
    rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
    # `predict_classes` is no longer available in current TensorFlow/Keras;
    # argmax over the softmax output yields the same per-position class ids.
    preds=model.predict(rowx,verbose=0).argmax(axis=-1)
    q=ctable.decode(rowx[0])
    correct = ctable.decode(rowy[0])
    # preds already holds class ids, so skip the argmax inside decode.
    guess = ctable.decode(preds[0], calc_argmax=False)
    print('Q', q, end=' ')
    print('T', correct, end=' ')
    if correct == guess:
      print('☑', end=' ')
    else:
      print('☒', end=' ')
    print(guess)


--------------------------------------------------
Iteration 1
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 45000 samples, validate on 5000 samples
Epoch 1/1





Q 42793-2     T 42791  ☒ 45557 
Q 4748-6      T 4742   ☒ 4444  
Q 16-42810    T -42794 ☒ -44101
Q 2-65929     T -65927 ☒ -47000
Q 31-48       T -17    ☒ -4    
Q 8767-8      T 8759   ☒ 4447  
Q 13-4717     T -4704  ☒ -4670 
Q 0-9462      T -9462  ☒ -460  
Q 24081-10059 T 14022  ☒ 15000 
Q 10-77191    T -77181 ☒ -47700

--------------------------------------------------
Iteration 2
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 43-46       T -3     ☒ -4    
Q 5936-97     T 5839   ☒ 9999  
Q 74440-357   T 74083  ☒ 44478 
Q 54-928      T -874   ☒ -518  
Q 2-3288      T -3286  ☒ -7765 
Q 5248-9      T 5239   ☒ 5148  
Q 435-191     T 244    ☒ 113   
Q 7230-24     T 7206   ☒ 3333  
Q 3647-140    T 3507   ☒ 3438  
Q 6778-18268  T -11490 ☒ -7777 

-----------