## Basic PreProcessing for Scraped MTSamples.com Data

Data Source: mtsamples.com

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# load the scraped data from its CSV export
SCRAPED_CSV_PATH = '../scrapy/ENV/mtsamples/mt.csv'
raw_data = pd.read_csv(SCRAPED_CSV_PATH)
# preview five randomly chosen rows
raw_data.sample(n=5)

Unnamed: 0,description,medical_specialty,sample_name,transcription
1895,Repair of total anomalous pulmonary venous co...,Pediatrics - Neonatal,Septal Defect Repair,"TITLE OF OPERATION:,1. Repair of total anomal..."
3412,"Hyperglycemia, cholelithiasis, obstructive sl...",General Medicine,Discharge Summary - 11,"ADMISSION DIAGNOSES: , Hyperglycemia, cholelit..."
919,Debridement of left lateral foot ulcer with e...,Surgery,Debridement - Foot Ulcer,"PREOPERATIVE DIAGNOSES,1. Left lateral fifth ..."
3592,"Diagnostic laparotomy, exploratory laparotomy...",Gastroenterology,Diverticulectomy & Laparotomy,"PREOPERATIVE DIAGNOSIS: , Acute appendicitis.,..."
787,"Fogarty thrombectomy, left forearm arterioven...",Surgery,Fogarty Thrombectomy,"PREOPERATIVE DIAGNOSES:,1. Chronic renal fail..."


In [3]:
# (number of rows, number of columns) of the loaded frame
raw_data.shape

(4999, 4)

In [4]:
# look at the transcription text of the first record
raw_data['transcription'].iloc[0]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [5]:
# basic cleaning - removing \n \r \t in a single regex pass.
# The cleaned Series is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `raw_data.transcription`: that chained
# in-place call operates on an intermediate Series and is not guaranteed to
# update `raw_data` itself (it does not under pandas Copy-on-Write).
raw_data['transcription'] = raw_data['transcription'].replace(
    r'[\n\r\t]', '', regex=True
)
raw_data.iloc[0, 3]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [6]:
# keep raw_data as loaded; all further cleaning is done on an independent copy
df = raw_data.copy(deep=True)

In [7]:
# move the keyword list out of the transcription text into its own column
transcription_text = raw_data['transcription']
# whatever precedes the first 'Keywords' marker stays as the transcription
df['transcription'] = transcription_text.str.split('Keywords').str[0]
# the piece right after the first 'Keywords:' marker becomes the keywords;
# rows without that marker get a null here
# NOTE(review): the two splits use slightly different markers
# ('Keywords' vs 'Keywords:') - confirm this is intentional
df['keywords'] = transcription_text.str.split('Keywords:').str[1]
df.head(n=5)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...",", allergy / immunology, allergic rhini..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...",", bariatrics, laparoscopic gastric byp..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...",", bariatrics, laparoscopic gastric byp..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...",", cardiovascular / pulmonary, 2-d m-mo..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,", cardiovascular / pulmonary, 2-d, dop..."


In [8]:
# inspect the first record's transcription after the keyword split
df['transcription'].iloc[0]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [9]:
# splitting off the unneeded adsbygoogle text: keep only what precedes the
# first run of three consecutive spaces.
# The split is applied to df's transcription column (not raw_data's) so that
# the 'Keywords' section already split off into its own column is not put
# back into the transcription text.
# NOTE(review): a transcription that legitimately contains three consecutive
# spaces is truncated at that point too - confirm this is acceptable.
df['transcription'] = df['transcription'].str.split('   ').str[0]
df.iloc[0, 3]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [10]:
# inspect the raw keywords string extracted for the first record
df['keywords'].iloc[0]

' ,        allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic,                           , , , , , ,        ,      ,      ,          ,            ,            ,            '

In [11]:
# tidy the keywords by deleting, in this order: double spaces, ' ,' and ',,'
keywords_clean = df['keywords']
for junk in ('  ', ' ,', ',,'):
    keywords_clean = keywords_clean.str.replace(junk, '')
df['keywords'] = keywords_clean
# inspect the cleaned keywords of the second record
df['keywords'].iloc[1]

"bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, body weight, laparoscopic gastric, weight loss, pounds, months, weight, laparoscopic, band, loss, diets, overweight, lost"

In [12]:
# and viewing our cleaned up data (five randomly sampled rows)
df.sample(5)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
3559,Chronic abdominal pain and heme positive stoo...,Gastroenterology,Esophagogastroduodenoscopy - 8,"PREOPERATIVE DIAGNOSIS:, Chronic abdominal pai...","gastroenterology, endoscopy, gastritis, clo, h..."
1712,"Evaluate for retroperitoneal hematoma, the pa...",Radiology,CT Abdomen & Pelvis - 10,CT ABDOMEN WITHOUT CONTRAST AND CT PELVIS WITH...,"radiology, cystic lesion, superior pole, kidne..."
923,"DDDR permanent pacemaker, insertion of a ster...",Surgery,DDDR Permanent Pacemaker,"PROCEDURES PERFORMED:,1. DDDR permanent pacem...",
3912,Painful right knee status post total knee art...,Discharge Summary,Knee Arthroplasty - Discharge Summary,"ADMISSION DIAGNOSIS:, Painful right knee stat...","discharge summary, painful right knee, total k..."
947,Cystoscopy and Bladder biopsy with fulguratio...,Surgery,Cystoscopy & Bladder Biopsy,"PREOPERATIVE DIAGNOSIS:, History of bladder t...","surgery, bladder biopsy with fulguration, iv s..."


In [13]:
# The keywords column turns out to contain a lot of null values.
# Going back to the scraper: it didn't collect keywords for about 1/5th of the
# transcriptions - it looks like some pages had different formatting, with the
# keywords in a separate div.
# List every row that has a null in at least one column.
has_missing = df.isna().any(axis='columns')
df[has_missing]

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
12,Cerebral Angiogram - moyamoya disease.,Neurology,Moyamoya Disease,"CC:, Confusion and slurred speech.,HX , (prima...",
24,Blood in urine - Transitional cell cancer of ...,Urology,Urology Consut - 1,"CHIEF COMPLAINT:,",
31,This is a 66-year-old male with signs and sym...,Urology,Urinary Retention,"CHIEF COMPLAINT:, Urinary retention.,HISTORY ...",
32,Right distal ureteral calculus. The patient ...,Urology,Ureteral Calculus - Consult,"CHIEF COMPLAINT: , Right distal ureteral calcu...",
39,The patient has a possibly torsion detorsion ...,Urology,Testicular Pain,"CHIEF COMPLAINT: , Testicular pain.,HISTORY OF...",
49,A 16-month-old with history of penile swellin...,Urology,Pubic Cellulitis,"DIAGNOSIS: , Pubic cellulitis.,HISTORY OF PRES...",
52,A 65-year-old man with chronic prostatitis re...,Urology,Prostatitis - Recheck,"SUBJECTIVE:, The patient is a 65-year-old man...",
58,The patient returns for followup evaluation 2...,Urology,Prostate Fossa Irradiation - Followup,"HISTORY OF PRESENT ILLNESS: , The patient retu...",
64,Adenocarcinoma of the prostate. The patient ...,Urology,Prostate Adenocarcinoma,"ADMISSION DIAGNOSIS: ,Adenocarcinoma of the p...",
67,"Penile discharge, infected-looking glans. A ...",Urology,Penile Discharge,"CHIEF COMPLAINT: , Penile discharge, infected-...",


In [15]:
# saving our cleaned up dataframe to csv (relative path, so it lands in the
# current working directory)
# NOTE(review): no `index` argument is passed, so the row index is presumably
# written as an extra leading column - confirm downstream readers expect that
df.to_csv('mtsamples.csv')