# Part 6: NLP

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# Get the Final Narrative Dataset
link = 'https://drive.google.com/file/d/16yKUYFo_k7vDvIVjtgfAE_DpZemEIxsz/view'

# to get the id part of the file
id = link.split("/")[-2]

downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('Final Narrative.csv')

df = pd.read_csv('Final Narrative.csv', index_col=0)

# 8. NLP

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77202 entries, 0 to 77207
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Final Narrative      77202 non-null  object 
 1   Hospitalized_binary  77202 non-null  float64
 2   Amputation_binary    77196 non-null  float64
dtypes: float64(2), object(1)
memory usage: 2.4+ MB


In [5]:
df = df.dropna().copy()

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

# Separating the text and labels
texts = df["Final Narrative"].values
labels_hospitalized = df["Hospitalized_binary"].values
labels_amputation = df["Amputation_binary"].values

# Using Tokenizer to tokenize the text and convert it to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Padding the sequences to a fixed length (e.g., the length of the longest sequence)
max_sequence_length = max([len(sequence) for sequence in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Splitting the data into training and testing sets
X_train, X_test, y_train_hospitalized, y_test_hospitalized, y_train_amputation, y_test_amputation = train_test_split(
    sequences, labels_hospitalized, labels_amputation, test_size=0.2, random_state=42
)

# Defining the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_sequence_length))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compiling the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 386, 64)           1049920   
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,148,865
Trainable params: 1,148,865
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Training the model for the Hospitalized_binary column
print('Prediction of Hospitalization:')
model.fit(X_train, y_train_hospitalized, validation_data=(X_test, y_test_hospitalized), epochs=10, batch_size=64)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print('Evaluation:')
model.evaluate(X_test, y_test_hospitalized)

Prediction of Hospitalization:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Evaluation:


[0.2592056095600128, 0.9072538614273071]

In [9]:
text1 = "The worker fell from a height and was taken to the hospital"
text2 = "The worker's hand went under the press."

seq1 = tokenizer.texts_to_sequences([text1])
pad1 = pad_sequences(seq1, maxlen = max_sequence_length, padding = 'pre', truncating= 'pre')
seq2 = tokenizer.texts_to_sequences([text2])
pad2 = pad_sequences(seq2, maxlen = max_sequence_length, padding = 'pre', truncating= 'pre')

print(model.predict(pad1))
print(model.predict(pad2))

[[0.9939765]]
[[0.9879594]]


In [10]:
# Training the model for the Amputation_binary column
print('Prediction of Amputation:')
model.fit(X_train, y_train_amputation, validation_data=(X_test, y_test_amputation), epochs=10, batch_size=64)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print('Evaluation:')
model.evaluate(X_test, y_test_amputation)

Prediction of Amputation:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Evaluation:


[0.09593289345502853, 0.9765543937683105]

In [12]:
text1 = "The worker fell from a height and was taken to the hospital"
text2 = "The worker's hand went under the press and her fingers were cut off."

seq1 = tokenizer.texts_to_sequences([text1])
pad1 = pad_sequences(seq1, maxlen = max_sequence_length, padding = 'pre', truncating= 'pre')
seq2 = tokenizer.texts_to_sequences([text2])
pad2 = pad_sequences(seq2, maxlen = max_sequence_length, padding = 'pre', truncating= 'pre')

print(model.predict(pad1))
print(model.predict(pad2))

[[0.00108384]]
[[0.9883279]]


# END =)