# Medical Text Classifier

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import nltk
import string
import scipy.sparse as sp
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
import scipy as sp
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikhilagrawal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhilagrawal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read data from train and test data
with open("train.dat", "r") as fh:
    train_lines = fh.readlines()

with open("test.dat","r") as fh:
    test_lines = fh.readlines()

In [4]:
# Here we are spliting sentences in words (Tockenizing)
test_data = [l.split() for l in test_lines]
train_data = [l.split() for l in train_lines]

In [5]:
# Separating docs and labels
cls = []
docs = []
ind = 0
print(len(train_data))
for i in range(0,len(train_data)):
    cls.append(train_data[i][0])
    docs.append(train_data[i][1:])

14438


In [6]:
df = pd.DataFrame()

In [7]:
df['text']  = docs[:]
df['class'] = cls[:]

In [8]:
df.head()

Unnamed: 0,text,class
0,"[Catheterization, laboratory, events, and, hos...",4
1,"[Renal, abscess, in, children., Three, cases, ...",5
2,"[Hyperplastic, polyps, seen, at, sigmoidoscopy...",2
3,"[Subclavian, artery, to, innominate, vein, fis...",5
4,"[Effect, of, local, inhibition, of, gamma-amin...",4


In [7]:
# appending test data with train data for creating csr matrix


In [8]:
for i in range(0,len(test_data)):
    docs.append(test_data[i])

In [9]:
# removing stop words from docs
def remove_stop_words(docs):
    en_stops = set(stopwords.words('english'))
    new_docs = []
    for doc in docs:
        new_word = []  
        for word in doc:
            if word not in en_stops:
                new_word.append(word)
        new_docs.append(new_word)
            
    return new_docs

# removing punctuations from docs
import re
def remove_punctuation(docs):
    new_docs = []
    for doc in docs:
        new_words = []  
        for word in doc:
            new_word = re.sub(r'[^\w\s]', '', word)
            if new_word != '':
                new_words.append(new_word)
        new_docs.append(new_words)
            
    return new_docs

# removing words whose length is less than minLen
def filterLen(docs, minlen):
    r""" filter out terms that are too short. 
    docs is a list of lists, each inner list is a document represented as a list of words
    minlen is the minimum length of the word to keep
    """
    return [ [t for t in d if len(t) >= minlen ] for d in docs ]


# Lemmatizing
def lemmatize(docs):
    lemmatizer = WordNetLemmatizer()
    new_docs = []
    for doc in docs:
        lemmas = []  
        for word in doc:
            lemma = lemmatizer.lemmatize(word, pos='v')
            lemmas.append(lemma)
        new_docs.append(lemmas)
            
    return new_docs


# converting words to lower case and remove words which contain digits
def filterInput(documents):
    new_docs = []
    for doc in documents:
        new_word = []
        for word in doc:
            new_word.append(word.lower())
            for char in word:
                if(not char.isalpha()):
                    new_word.remove(word.lower())
                    break
        new_docs.append(new_word)
    
    return new_docs

In [10]:
# Filtering documents
docs1 = remove_stop_words(docs)
docs2 = remove_punctuation(docs1)
docs3 = filterLen(docs2,4)
docs4 = lemmatize(docs3)
docs5 = filterInput(docs4)


In [11]:
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
        
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
            
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

In [12]:
# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf. 
    Returns scaling factors as dict. If copy is True, 
    returns scaled matrix and scaling factors.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
        
    return df if copy is False else mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat

In [14]:
# Creating csr matrix
mat4 = build_matrix(docs5)
print(csr_info(mat4))
mat5 = csr_idf(mat4, copy=True)
mat6 = csr_l2normalize(mat5, copy=True)
mat4 = build_matrix(docs5)
print(csr_info(mat6))

 [nrows 28880, ncols 55021, nnz 1916480]
None
 [nrows 28880, ncols 55021, nnz 1916480]
None


In [17]:
# spliting data for test purpose
train_doc = mat6[0:14438]
test_doc  = mat6[14438:]
train_class = cls[0:14438]
test_class = cls[14438:]

In [19]:
mat6.shape

(28880, 55021)

In [21]:
def classify(x, train, clstr,k=3):
        r""" Classify vector x using kNN and majority vote rule given training data and associated classes
        """
        # find nearest neighbors for x
        dots = x.dot(train.T)
        
        sims = list(zip(dots.indices, dots.data))
        if len(sims) == 0:
            # could not find any neighbors
            return '+' if np.random.rand() > 0.5 else '-'
        sims.sort(key=lambda x: x[1], reverse=True)
        tc = Counter(clstr[s[0]] for s in sims[:k]).most_common(2)
        if len(tc) < 2 or tc[0][1] > tc[1][1]:
            # majority vote
            return tc[0][0]
        # tie break
        tc = defaultdict(float)
        for s in sims[:k]:
            tc[clstr[s[0]]] += s[1]
        return sorted(tc.items(), key=lambda x: x[1], reverse=True)[0][0]

In [22]:
pred = []
f= open("Prediction2.5.dat","w+")
for test_ins in test_doc:
    pred_class = classify(test_ins,train_doc,train_class,20)
    f.write(str(pred_class) + "\n")
f.close()