In [1]:
## Model to check if a word is abbreviation or not

import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# dataset containing abbreviations and normal words with label as 1 and 0 respectively

data = pd.read_csv('words_data.csv')
data.drop("Unnamed: 0", axis = 1, inplace = True)

print(data.head())

       words  label
0  HCoV-229E      1
1       EtOH      1
2         GC      1
3         US      1
4        PFD      1


In [3]:
## shuffling the data

data = data.sample(frac = 1)
data = data.sample(frac = 1)
data = data.sample(frac = 1)
data = data.sample(frac = 1)

data.reset_index(drop = True, inplace = True)

In [4]:
print(data.head(10))

        words  label
0       akira      0
1         ffa      0
2    intercom      0
3     trivial      0
4         HSC      1
5  hemoglobin      0
6       RANKL      1
7  exclusions      0
8          BP      1
9     bottoms      0


In [5]:
## distributing the data as train and validation set

train_data = data.iloc[:10000, :]
val_data = data.iloc[10000:, :]

In [6]:
## removing non numeric and non alphabetical characters

train_data['words'] = train_data['words'].str.replace("[^a-zA-Z0-9]", "")
val_data['words'] = val_data['words'].str.replace("[^a-zA-Z0-9]", "")
val_data.reset_index(drop = True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
## A list of all the characters that the dataset contains

list_of_chars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
                 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4',
                 '5', '6', '7', '8', '9']

In [8]:
char2idx = {u:i+1 for i, u in enumerate(list_of_chars)}
print(char2idx)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, '0': 53, '1': 54, '2': 55, '3': 56, '4': 57, '5': 58, '6': 59, '7': 60, '8': 61, '9': 62}


In [9]:
## labeling the data using char2idx dictionary

training_words = []
for i in range(0, len(train_data)):
    current_word = []
    for j in range(0, len(train_data['words'][i])):
        current_word.append(char2idx[train_data['words'][i][j]])
    training_words.append(current_word)

val_words = []
for i in range(0, len(val_data)):
    current_word = []
    for j in range(0, len(val_data['words'][i])):
        current_word.append(char2idx[val_data['words'][i][j]])
    val_words.append(current_word)

In [10]:
print(training_words[:5])
print(val_words[:5])

[[1, 11, 9, 18, 1], [6, 6, 1], [9, 14, 20, 5, 18, 3, 15, 13], [20, 18, 9, 22, 9, 1, 12], [34, 45, 29]]
[[1, 18, 13, 1, 14, 4], [16, 18, 15, 16, 15, 14, 5, 14, 20], [21, 18, 9, 14, 5], [3, 8, 5, 18, 18, 25], [19, 20, 18, 1, 14, 7, 5, 18, 19]]


In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
## padding the data with maxlength as the maximum length words in training words

max_length = max([len(x) for x in training_words])
padded = pad_sequences(training_words, maxlen = max_length, padding = 'post', truncating = 'post')
train_labels = list(train_data['label'])
train_labels = np.array(train_labels)

In [13]:
val_padded = pad_sequences(val_words, maxlen = max_length, padding = 'post', truncating = 'post')
val_labels = list(val_data['label'])
val_labels = np.array(val_labels)

In [14]:
print(padded[:5])
print(val_padded[:5])

[[ 1 11  9 18  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 6  6  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9 14 20  5 18  3 15 13  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [20 18  9 22  9  1 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [34 45 29  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[ 1 18 13  1 14  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [16 18 15 16 15 14  5 14 20  0  0  0  0  0  0  0  0  0  0  0  0]
 [21 18  9 14  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3  8  5 18 18 25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [19 20 18  1 14  7  5 18 19  0  0  0  0  0  0  0  0  0  0  0  0]]


In [15]:
## training the model

model = tf.keras.Sequential([
        tf.keras.layers.Embedding(63, 25, input_length = max_length),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(16, activation = 'relu'),
        tf.keras.layers.Dense(1, activation = 'sigmoid')
    ])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

model.fit(padded, train_labels, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x236d39c4ac8>

In [16]:
val_pred = model.predict(val_padded)
for i in range(0, len(val_pred)):
    if(val_pred[i] > 0.5):
        val_pred[i] = 1
    else:
        val_pred[i] = 0

In [17]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(val_labels, val_pred)

In [18]:
print(cm)

[[2171    0]
 [   7  628]]


In [19]:
train_data['words'].head()

0       akira
1         ffa
2    intercom
3     trivial
4         HSC
Name: words, dtype: object

In [21]:
train_data['words'] = train_data['words'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
val_data['words'] = val_data['words'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
print(train_data.head(10))
print(val_data.head(10))

        words  label
0       akira      0
1         ffa      0
2    intercom      0
3     trivial      0
4         hsc      1
5  hemoglobin      0
6       rankl      1
7  exclusions      0
8          bp      1
9     bottoms      0
       words  label
0     armand      0
1  proponent      0
2      urine      0
3     cherry      0
4  strangers      0
5        iop      1
6        lnf      1
7        odi      1
8  observers      0
9         hs      1


In [26]:
list_of_chars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
                 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4',
                 '5', '6', '7', '8', '9']

char2idx = {u:i+1 for i, u in enumerate(list_of_chars)}
print(char2idx)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36}


In [27]:
training_words = []
for i in range(0, len(train_data)):
    current_word = []
    for j in range(0, len(train_data['words'][i])):
        current_word.append(char2idx[train_data['words'][i][j]])
    training_words.append(current_word)

val_words = []
for i in range(0, len(val_data)):
    current_word = []
    for j in range(0, len(val_data['words'][i])):
        current_word.append(char2idx[val_data['words'][i][j]])
    val_words.append(current_word)

In [28]:
max_length = max([len(x) for x in training_words])
padded = pad_sequences(training_words, maxlen = max_length, padding = 'post', truncating = 'post')
train_labels = list(train_data['label'])
train_labels = np.array(train_labels)

val_padded = pad_sequences(val_words, maxlen = max_length, padding = 'post', truncating = 'post')
val_labels = list(val_data['label'])
val_labels = np.array(val_labels)

In [30]:
model = tf.keras.Sequential([
        tf.keras.layers.Embedding(63, 25, input_length = max_length),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(16, activation = 'relu'),
        tf.keras.layers.Dense(1, activation = 'sigmoid')
    ])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

model.fit(padded, train_labels, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x236dd36ee48>

In [29]:
print(padded[:5])
print(val_padded[:5])

[[ 1 11  9 18  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 6  6  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9 14 20  5 18  3 15 13  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [20 18  9 22  9  1 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 8 19  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[ 1 18 13  1 14  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [16 18 15 16 15 14  5 14 20  0  0  0  0  0  0  0  0  0  0  0  0]
 [21 18  9 14  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3  8  5 18 18 25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [19 20 18  1 14  7  5 18 19  0  0  0  0  0  0  0  0  0  0  0  0]]


In [31]:
val_pred = model.predict(val_padded)
for i in range(0, len(val_pred)):
    if(val_pred[i] > 0.5):
        val_pred[i] = 1
    else:
        val_pred[i] = 0

In [32]:
cm = confusion_matrix(val_labels, val_pred)
print(cm)

[[1983  188]
 [ 127  508]]


In [33]:
print((1983+508)/(1983+508+127+188))

0.8877405559515325


In [None]:
## val_accuracy: 88.7%