In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import json
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
actual_tags = []
with open ('/content/drive/MyDrive/Cleaned_Sentences_Task/tags_original.txt', 'r') as file:
    actual_tags = json.load(file)

In [None]:
removed = []
with open ('/content/drive/MyDrive/Cleaned_Sentences_Task/sentences_without_tags.txt', 'r') as file:
    removed = json.load(file)

In [None]:
print(len(removed))

331364


In [None]:
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index == 1 else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit()
  }

In [None]:
def transform_to_dataset(sentences):
  X, y = [], []
  for sents in sentences:
    for index in range(len(sents)):
      X.append(extract_features(sents, index))
  return X, actual_tags[0: 22000]

In [None]:
X_, y_ = transform_to_dataset(removed[0: 11000])
for i in range(0, 100):
  print(X_[i], " ----------> ", y_[i])

{'word': 'आग', 'is_first': True, 'is_last': False, 'prefix-1': 'आ', 'prefix-2': 'आग', 'prefix-3': 'आग', 'suffix-1': 'ग', 'suffix-2': 'आग', 'suffix-3': 'आग', 'prev_word': '', 'next_word': 'की', 'has_hyphen': False, 'is_numeric': False}  ---------->  nn
{'word': 'की', 'is_first': False, 'is_last': True, 'prefix-1': 'क', 'prefix-2': 'की', 'prefix-3': 'की', 'suffix-1': 'ी', 'suffix-2': 'की', 'suffix-3': 'की', 'prev_word': 'आग', 'next_word': '', 'has_hyphen': False, 'is_numeric': False}  ---------->  psp
{'word': 'लिए', 'is_first': True, 'is_last': False, 'prefix-1': 'ल', 'prefix-2': 'लि', 'prefix-3': 'लिए', 'suffix-1': 'ए', 'suffix-2': 'िए', 'suffix-3': 'लिए', 'prev_word': '', 'next_word': 'सांडिआ', 'has_hyphen': False, 'is_numeric': False}  ---------->  psp
{'word': 'सांडिआ', 'is_first': False, 'is_last': True, 'prefix-1': 'स', 'prefix-2': 'सा', 'prefix-3': 'सांड', 'suffix-1': 'आ', 'suffix-2': 'िआ', 'suffix-3': 'ंडिआ', 'prev_word': 'लिए', 'next_word': '', 'has_hyphen': False, 'is_numeric'

In [None]:
print(len(X_), len(y_))

22000 22000


In [None]:
print(y_[0: 10])
print(actual_tags[0: 10])

['nn', 'psp', 'psp', 'nnpc', 'psp', 'nnpc', 'vaux', 'vaux', 'nnp', 'psp']
['nn', 'psp', 'psp', 'nnpc', 'psp', 'nnpc', 'vaux', 'vaux', 'nnp', 'psp']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_, y_, train_size=0.75)

In [None]:
dict_vectorizer = DictVectorizer(sparse=False)
dict_vectorizer.fit(X_)
X_train = dict_vectorizer.transform(X_train)
X_test = dict_vectorizer.transform(X_test)

In [None]:
X_ = dict_vectorizer.transform(X_)

In [None]:
label_encoder = LabelEncoder()
label_encoder.fit(y_)
y_train = label_encoder.transform(y_train)
y_train_cat = to_categorical(y_train, num_classes=len(label_encoder.classes_))

In [None]:
model_deep = Sequential()
model_deep.add(Dense(1024, activation='relu'))
model_deep.add(Dropout(0.2))
model_deep.add(BatchNormalization())
model_deep.add(Dense(512, activation='relu'))
model_deep.add(Dropout(0.2))
model_deep.add(Dense(256, activation='relu'))
model_deep.add(Dropout(0.2))
model_deep.add(BatchNormalization())
model_deep.add(Dense(128, activation='relu'))
model_deep.add(Dropout(0.2))
model_deep.add(Dense(len(label_encoder.classes_), activation='softmax'))

In [None]:
model_deep.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model_deep.fit(X_train, y_train_cat, validation_split=0.2, epochs=50, batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f8893b7bb70>

In [None]:
model_deep.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              22066176  
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0

In [None]:
print("Accuracy Score = ", model_deep.evaluate(X_test, to_categorical(label_encoder.transform(y_test))))

Accuracy Score =  [1.0485262870788574, 0.8445217609405518]


In [None]:
predictions = list(model_deep.predict_classes(X_))
deep_tags = []
for i in predictions:
  deep_tags.extend(list(label_encoder.inverse_transform([i])))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [None]:
with open ('/content/drive/MyDrive/Cleaned_Sentences_Task/ann_predictions.txt', 'w+') as file:
    json.dump(deep_tags, file)