In [2]:
## for data
import json
import pandas as pd
import numpy as np
## for processing
import re
import nltk
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## for explainer
from lime import lime_text

In [5]:
# Path to the folder with the original data
train_csv_path = "./train_before_aug.csv"

df = pd.read_csv(train_csv_path)
print(df.shape)
df.drop(columns=['Unnamed: 0'], inplace=True)
df  = df.rename(columns={'transcription': 'text', 'medical_specialty': 'label'})
df.head()

(991, 3)


Unnamed: 0,label,text
0,Neurology,"PROCEDURE:, A 21-channel digital electroencep..."
1,Neurology,"REASON FOR VISIT:, Postoperative visit for cr..."
2,Radiology,"EXAM:, Nuclear medicine lymphatic scan.,REASO..."
3,Neurology,"HISTORY OF PRESENT ILLNESS: , The patient is a..."
4,Radiology,"EXAM:,MRI LEFT SHOULDER,CLINICAL:,This is a 51..."


In [6]:
df.value_counts('label')

label
 Orthopedic          284
 Radiology           218
 Gastroenterology    184
 Neurology           178
 Urology             127
Name: count, dtype: int64

In [7]:
# Function to preprocess the text
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [11]:
import nltk
stopwords = nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
df["text_clean"] = df["text"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=nltk.corpus.stopwords.words("english")))
df.head()

Unnamed: 0,label,text,text_clean
0,Neurology,"PROCEDURE:, A 21-channel digital electroencep...",procedure 21channel digital electroencephalogr...
1,Neurology,"REASON FOR VISIT:, Postoperative visit for cr...",reason visit postoperative visit craniopharyng...
2,Radiology,"EXAM:, Nuclear medicine lymphatic scan.,REASO...",exam nuclear medicine lymphatic scanreason exa...
3,Neurology,"HISTORY OF PRESENT ILLNESS: , The patient is a...",history present illness patient 36yearold fema...
4,Radiology,"EXAM:,MRI LEFT SHOULDER,CLINICAL:,This is a 51...",exammri left shoulderclinicalthis 51yearold fe...
