In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Display configuration: show up to 999 columns and 999 rows when printing frames
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 999)

# Suppress every warning for the rest of the session
import warnings
warnings.filterwarnings('ignore')

In [26]:
# Load the labelled procedures
CP_original = pd.read_csv('ClassifiedProcedures.csv')

# Keep the part of the group label that precedes the first '['
df = CP_original['MedicalProcedureGroup'].str.split('[', n=1, expand=True)
CP_original['MedicalProcedureGroup_Front'] = df[0]

# Split that front part once on '-': left piece -> _Simple, right piece -> _Detail
df_2 = CP_original['MedicalProcedureGroup_Front'].str.split('-', n=1, expand=True)
CP_original['MedicalProcedureGroup_Simple'] = df_2[0]
CP_original['MedicalProcedureGroup_Detail'] = df_2[1]

# Working frame restricted to the columns used below
keep_columns = [
    'MedicalProcedure',
    'MedicalProcedureGroup',
    'MedicalProcedureGroupRisk',
    'MedicalProcedureGroup_Front',
    'MedicalProcedureGroup_Simple',
    'MedicalProcedureGroup_Detail',
]
CP = CP_original.loc[:, keep_columns]

In [27]:
#shuffle dataset
# Seeded so the head/tail train/test split taken from this order (and every score
# derived from it) is the same on each run; 42 matches the seed used for SGD below.
CP = CP.sample(frac=1, random_state=42).reset_index(drop=True)

In [28]:
#Exclude Procedures that have less than two counts
# Filter the value_counts Series itself instead of selecting a column by name:
# the column label of pd.DataFrame(value_counts()) depends on the pandas version
# ('count' in pandas >= 2.0), so counts['MedicalProcedureGroup'] can raise KeyError.
group_counts = CP['MedicalProcedureGroup'].value_counts()
counts = group_counts.to_frame()  # kept as a DataFrame, as before
Exclude = group_counts[group_counts == 1].index.tolist()

In [30]:
# Drop every row whose group is in the Exclude list, then renumber the rows
in_excluded_group = CP['MedicalProcedureGroup'].isin(Exclude)
CP = CP.loc[~in_excluded_group].reset_index(drop=True)

In [31]:
# Discard rows with a missing procedure description (index is left as-is)
CP = CP.dropna(subset=['MedicalProcedure'])

In [32]:
# Positional 85/15 split: first 85% of rows train, the remainder test
train_len = int(0.85 * len(CP))
CP_train = CP.iloc[:train_len]
CP_test = CP.iloc[train_len:].reset_index(drop=True)

In [41]:
def find_best_cls(prediction_column):
    """Compare candidate text classifiers and return the SGD pipeline's CV score.

    Every candidate is wrapped in the same pipeline
    (CountVectorizer -> TfidfTransformer(use_idf=False) -> classifier) and scored
    with 5-fold cross-validation on ``CP_train``, using ``MedicalProcedure`` as
    the only feature. The mean score of each candidate is printed so the models
    can be compared.

    Parameters
    ----------
    prediction_column : str
        Name of the ``CP_train`` column used as the target.

    Returns
    -------
    float
        Mean 5-fold cross-validation score of the SGD pipeline
        (``random_state=42``) on ``CP_train``.
    """
    # 1-D arrays of raw strings / labels, which is what the text pipeline consumes
    X = CP_train['MedicalProcedure'].values
    y = CP_train[prediction_column].values

    def build_pipeline(classifier):
        # Same feature extraction for every candidate; only the final step varies
        return Pipeline([
            ('vect', CountVectorizer(stop_words='english')),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('clf', classifier),
        ])

    #look for best classifier
    candidates = {
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3),
        'RandomForest': RandomForestClassifier(),
    }
    for name, classifier in candidates.items():
        # cross_val_score fits a fresh clone per fold, so no separate fit is needed
        mean_score = np.mean(cross_val_score(build_pipeline(classifier), X, y, cv=5))
        print('{}: {:.4f}'.format(name, mean_score))

    #best classifier
    sgd_pipeline = build_pipeline(
        SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)
    )
    # Score the SGD pipeline itself rather than whichever pipeline the comparison
    # loop happened to build last.
    scores = np.mean(cross_val_score(sgd_pipeline, X, y, cv=5))

    return (scores)
    
        
    

In [47]:
# Feature and target column names for the final model
X_columns = ['MedicalProcedure']
y_columns = ['MedicalProcedureGroup']

# Features / target as arrays for the training and held-out rows
X, y = CP_train[X_columns].values, CP_train[y_columns].values
X_test, y_test = CP_test[X_columns].values, CP_test[y_columns].values

# Word counts -> term-frequency normalisation (no IDF) -> linear SGD classifier
clasifier_SGD = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)
cls_SGDC = Pipeline([
    ('vect', CountVectorizer(stop_words = 'english')),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', clasifier_SGD),
])
cls_SGDC.fit(X.ravel(), y.ravel())
predicted_SGD = cls_SGDC.predict(X_test.ravel())

# Decision-function values for the held-out rows; the row-wise maximum is stored
# in a 'max' column and exported later as the 'Confidence' value
holdout_scores = cls_SGDC.decision_function(X_test.ravel())
decision_function = pd.DataFrame(holdout_scores)
decision_function['max'] = decision_function.max(axis = 1)

In [None]:
# Assemble actual vs. predicted groups for the held-out rows and write them out
holdout_columns = {
    'MedicalProcedure': CP_test['MedicalProcedure'],
    'Actual': CP_test['MedicalProcedureGroup'],
    'Actual_Simple': CP_test['MedicalProcedureGroup_Simple'],
    'Prediction': predicted_SGD,
    'Confidence': decision_function['max'],
}
predictions_table = pd.DataFrame(holdout_columns).set_index('MedicalProcedure')
predictions_table.to_csv('predicted_2.csv')

## Test Data

In [59]:
# New dataset: both sheets come from the same workbook
cerner_workbook = 'ePreop-IRMC-Cerner-Procedures.xlsx'
to_be_classified = pd.read_excel(cerner_workbook, sheet_name='CURRENT - Cerner')
new_categories = pd.read_excel(cerner_workbook, sheet_name='Categories')

In [57]:
# Alternative input: the earlier test set (disabled; uncomment to use it instead of the workbook above)
# to_be_classified = pd.read_csv('ProceduresToBeClassified.csv')

In [60]:
# Keep only the rows that actually have a procedure description
has_description = to_be_classified['Procedure Description'].notnull()
to_be_classified = to_be_classified[has_description]

In [61]:
# Feature array: the procedure descriptions as a 1-D array
X_test_final = to_be_classified['Procedure Description'].values

In [62]:
# Predict a group for every new procedure description with the fitted SGD pipeline
y_pred_final = cls_SGDC.predict(X_test_final)

In [63]:
# Decision-function values for the new rows; the row-wise maximum goes into 'max'
final_scores = cls_SGDC.decision_function(X_test_final.ravel())
decision_function = pd.DataFrame(final_scores)
decision_function['max'] = decision_function.max(axis = 1)

In [64]:
# Predictions for the new procedures, indexed by description, written to CSV
final_columns = {
    'MedicalProcedure': X_test_final,
    'Prediction': y_pred_final,
    'Confidence': decision_function['max'],
}
predictions_table = pd.DataFrame(final_columns).set_index('MedicalProcedure')
predictions_table.to_csv('predicted_test_IRMC_Cerner.csv')

## Testing Adding Field

In [None]:
# Flag the rows where the predicted group equals the labelled group.
# Compare the two columns element-wise; comparing the string literals
# 'Prediction' == 'Actual' is always False and never looks at the data.
# Only the labelled hold-out table carries an 'Actual' column (the table built
# from the Cerner workbook does not), so fail with a clear message otherwise.
if 'Actual' not in predictions_table.columns:
    raise KeyError("predictions_table has no 'Actual' column; rebuild the labelled hold-out table before comparing")
predictions_table['same?'] = predictions_table['Prediction'] == predictions_table['Actual']

In [None]:
# Row positions where the prediction equals the labelled group
agreement_mask = predictions_table['Prediction'] == predictions_table['Actual']
np.where(agreement_mask)