In [165]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, BatchNormalization, Dropout, Flatten, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

drive_dir = "/content/drive/MyDrive/DSAI Project/"

In [None]:
df1 = pd.read_csv(drive_dir + "mtsamplesV1.csv")
df2 = pd.read_csv(drive_dir + "mtsamplesV2.csv")
df3 = pd.read_csv(drive_dir + "back_translated.csv")

df2 = df2.rename(columns={'medical_specialty': 'Domain', 'transcription': 'Transcriptions'})
df3 = df3.rename(columns={'medical_specialty': 'Domain', 'backTranslated' : 'Transcriptions'})
df3 = df3.drop(columns=['transcription','Unnamed: 0', 'french'])

print(df1.shape, df1.columns)
print(df2.shape, df2.columns)
print(df3.shape, df3.columns)

(75, 2) Index(['Domain', 'Transcriptions'], dtype='object')
(1239, 2) Index(['Domain', 'Transcriptions'], dtype='object')
(991, 2) Index(['Domain', 'Transcriptions'], dtype='object')


In [None]:
!pip install text-preprocessing

In [None]:
from text_preprocessing import preprocess_text
from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word
from text_preprocessing import remove_number, remove_special_character, normalize_unicode, remove_stopword

# concat 3 df
df = pd.concat([df1,df2,df3])
df['Domain'] = df['Domain'].str.strip().str.capitalize()
print(df.shape)
print(df['Domain'].value_counts())

# preprocess transcriptions
preprocess_functions = [to_lower, remove_email, remove_number, remove_special_character, remove_url, remove_punctuation, lemmatize_word, remove_stopword]
df['Transcriptions'] = df['Transcriptions'].apply(lambda x: preprocess_text(str(x), preprocess_functions))

# one hot encoding of labels
encoder = OneHotEncoder()
labels = df['Domain']
encoded_labels = encoder.fit_transform(labels.values.reshape(-1, 1))

(2305, 2)
Orthopedic          659
Radiology           501
Gastroenterology    434
Neurology           416
Urology             295
Name: Domain, dtype: int64


In [158]:
# train test split
x_train, x_val, y_train, y_val = train_test_split(df['Transcriptions'].values,encoded_labels, test_size=0.2, random_state=50)

## Embeddings try1

In [159]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
total_words = len(tokenizer.word_index) + 1

# print(tokenizer.word_index)
print(total_words)
max_seq_len = 100

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_val_seq = tokenizer.texts_to_sequences(x_val)
x_train_pad = pad_sequences(x_train_seq, maxlen=max_seq_len)
x_val_pad = pad_sequences(x_val_seq, maxlen=max_seq_len)

22171


In [174]:
embed_length = 100

model = Sequential([
    Embedding(input_dim=total_words, output_dim=embed_length, input_length=max_seq_len),
    Flatten(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    BatchNormalization(),
    Dense(5, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [175]:
history = model.fit(np.array(x_train_pad), np.array(y_train), validation_data=(np.array(x_val_pad), np.array(y_val)), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Transformers

In [None]:
x_train