In [432]:
from keras.layers import Dense, Dropout
import numpy as np
from keras.models import Model, Sequential
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder

In [415]:
class AbbrNetwork:
    """Small dense network that labels a word as an abbreviation or not.

    A word is encoded as a binary bag of characters over the 52 ASCII letters
    (upper and lower case are distinct features). Training examples are
    synthesized from a built-in word list: each word as written is a negative
    example and its fully upper-cased form is a positive example.
    """

    def __init__(self):
        self.model = self._generate_model()
        self.vectorizer = self._generate_vectorizer()
        self.set_training_data()

    def _generate_model(self):
        """Build and compile a fresh Keras model.

        Returns:
            A compiled ``Sequential`` model taking 52 features and emitting a
            single sigmoid probability.
        """
        from keras.layers import Dense, Dropout
        from keras.models import Sequential

        # Create a new model per instance; relying on a notebook-global
        # ``model`` raises NameError on a fresh kernel and would otherwise keep
        # stacking layers onto one shared object.
        model = Sequential()
        # 52 inputs = one feature per letter A-Z and a-z (see _generate_vectorizer).
        model.add(Dense(25, activation='relu', input_shape=(52,)))
        model.add(Dropout(0.3))
        model.add(Dense(10, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def set_training_data(self):
        """Store the built-in word list used to synthesize training examples."""
        self._training_data = ['Lorem','IPSUm','DOLOr','SiT','amet','conSectetur','ADIPSiCing','elit','Nullam','sed','HENDrieT','dolor','Aliquam','iaculis','dui','ut','varius','dignissim','Suspendisse','rutrum','sem','quam','Curabitur','cursus','mi','at','nunc','interdum','rhoncus','Donec','hendrerit','sapien','ex','sit','amet','rhoncus','massa','posuere','eu','Vivamus','hendrerit','libero','vitae','risus','posuere','a','rutrum','diam','euismod','Aliquam','a','turpis','ut','neque','laoreet','lobortis','non','in','mauris','Sed','a','nisl','eget','ipsum','feugiat','tincidunt','at','rhoncus','velit','Nunc','iaculis','elit','sed','hendrerit','imperdiet','Suspendisse','auctor','augue','id','efficitur','blandit','Ut','quis','lectus','non','libero','vehicula','efficitur','Aliquam','ut','elementum','mi','vulputate','eleifend','ex','Suspendisse','ornare','finibus','turpis','sit','amet','cursus','Donec','viverra','eget','nisi','eu','sollicitudin','Aenean','posuere','metus','sed','risus','posuere','faucibus','Proin','faucibus','sem','dui','non','mollis','lacus','luctus','quis','Integer','vitae','hendrerit','massa','Pellentesque','habitant','morbi','tristique','senectus','et','netus','et','malesuada','fames','ac','turpis','egestas','Praesent','viverra','sollicitudin','nisi','et','consequat','odio','congue','in','Curabitur','urna','ipsum','elementum','at','congue','ut','faucibus','quis','sem','Morbi','porttitor','ullamcorper','sapien','tincidunt','egestas','nunc','Sed','nibh','ante','bibendum','non','lacus','ut','aliquet','mollis','sapien','Nullam','vel','posuere','ex','Nunc','eget','elementum','purus','Sed','eu','nibh','euismod','porta','ex','vitae','tristique','augue','Phasellus','fringilla','fermentum','justo','non','imperdiet','Vivamus','molestie','porttitor','sapien','sed','efficitur','lorem','ullamcorper','at','Integer','dignissim','magna','a','mauris','fringilla','dapibus','Cras','quis','porta','lacus','id','aliquam','erat','Etiam','dignissim','libero','tempus','placerat'
,'vulputate','turpis','arcu','eleifend','nibh','sagittis','elementum','massa','tellus','vitae','felis','Pellentesque','euismod','risus','vel','sem','cursus','semper','Aliquam','sit','amet','dui','a','magna','euismod','ultricies','rutrum','sed','ex','Donec','a','ligula','lorem','Nam','velit','quam','dignissim','sit','amet','magna','sit','amet','efficitur','pretium','urna','Suspendisse','pretium','aliquet','rhoncus','Aliquam','eros','nunc','pulvinar','at','ante','vel','viverra','auctor','erat']

    def _generate_vectorizer(self):
        """Build a character-presence vectorizer over the 52 ASCII letters.

        Returns:
            A fitted ``CountVectorizer`` whose vocabulary is the single-letter
            tokens ``A``-``Z`` and ``a``-``z``.
        """
        from sklearn.feature_extraction.text import CountVectorizer

        # One "document" containing every letter as its own token: code points
        # 65-90 (A-Z) and 97-122 (a-z), skipping the punctuation in between.
        vectorizer_corpus = [' '.join([chr(i) for i in range(65, 123) if i < 91 or i > 96])]

        vectorizer = CountVectorizer(binary=True,
                                     lowercase=False,  # case is the signal here
                                     analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     stop_words=None,
                                     max_features=5000,
                                     # allow one-character tokens
                                     token_pattern=r"(?u)\b\w+\b")
        vectorizer.fit(vectorizer_corpus)
        return vectorizer

    def transform_training_corpus(self):
        """Turn the stored word list into feature and label arrays.

        Returns:
            ``(X, y)`` where ``X`` stacks the vectors of every word as written
            followed by the vectors of every word upper-cased, and ``y`` is a
            column of 0s (as written) followed by 1s (upper-cased).
        """
        small_words_corpus = [self._process_word(word) for word in self._training_data]
        cap_words_corpus = [self._process_word(str(word).upper()) for word in self._training_data]

        outs = np.zeros(len(small_words_corpus))
        cap_outs = np.ones(len(cap_words_corpus))

        y = np.expand_dims(np.append(outs, cap_outs), axis=1)
        X = np.append(small_words_corpus, cap_words_corpus, axis=0)
        return (X, y)

    def _process_word(self, word):
        """Encode one word as a flat binary character-presence vector."""
        # Space-separate the characters so each one becomes its own token.
        chars = ' '.join(str(word))
        # Use this instance's vectorizer, not a notebook-global one.
        return self.vectorizer.transform([chars]).toarray().flatten()

    def predict(self, para):
        """Label every whitespace-separated word of ``para``.

        Args:
            para: Text to classify.

        Returns:
            A list of ``(word, label)`` tuples in input order; an empty list
            when ``para`` contains no words.
        """
        # Spelling is kept as-is: callers may compare against these strings.
        labels = ["Not Abbrevation", "Abbrevation"]
        # split() without arguments drops the empty tokens that repeated or
        # leading/trailing whitespace would otherwise produce.
        words = para.split()
        if not words:
            return []

        # Pass one 2-D array (n_words, n_features); a nested Python list is
        # treated by Keras as a list of separate model inputs.
        vectors = np.array([self._process_word(word) for word in words])
        rounded = np.round(self.model.predict(vectors).flatten())
        return [(word, labels[int(pred)]) for word, pred in zip(words, rounded)]

    def train(self, epochs=40, batch_size=64, verbose=0):
        """Fit the model on the synthesized training corpus.

        Args:
            epochs: Number of passes over the training data.
            batch_size: Samples per gradient update.
            verbose: Keras verbosity level passed to ``fit``.
        """
        X, y = self.transform_training_corpus()
        self.model.fit(x=X, y=y, batch_size=batch_size, verbose=verbose, epochs=epochs)
        print('Training completed.')

In [416]:
# Build the model and vectorizer, load the built-in word list, then fit.
abbr = AbbrNetwork()
abbr.train()

Training completed.


In [433]:
# Label each space-separated word of a sample sentence.
abbr.predict("pavan is new to PAN")

[('pavan', 'Not Abbrevation'),
 ('is', 'Not Abbrevation'),
 ('new', 'Not Abbrevation'),
 ('to', 'Not Abbrevation'),
 ('PAN', 'Abbrevation')]