In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
data= pd.read_csv('data/medical_new.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,medical_specialty,transcription,medical_label,word2vec_embed,fasttext_label
0,3,cardiovascular_pulmonary,d m mode leave atrial enlargement left atrial ...,0,[-3.9130253e-01 1.6925055e+00 -9.7060895e-01 ...,__label__cardiovascular_pulmonary d m mode lea...
1,4,cardiovascular_pulmonary,left ventricular cavity size wall thickness ap...,0,[-5.44876158e-01 1.30335271e+00 -1.53507555e+...,__label__cardiovascular_pulmonary left ventric...
2,7,cardiovascular_pulmonary,d echocardiogram multiple view heart great ves...,0,[-0.40531728 0.8317133 -1.0865903 0.296900...,__label__cardiovascular_pulmonary d echocardio...
3,9,cardiovascular_pulmonary,description normal cardiac chamber size normal...,0,[-0.3478441 0.9546592 -1.0535976 1.201930...,__label__cardiovascular_pulmonary description ...
4,11,cardiovascular_pulmonary,d study mild aortic stenosis widely calcify mi...,0,[ 0.27329454 1.1477208 -0.88355297 1.073992...,__label__cardiovascular_pulmonary d study mild...


In [3]:
def help(dataframe, vectorizer, model):
    x_train, x_test, y_train, y_test= train_test_split(dataframe.transcription, dataframe.medical_label, stratify= dataframe.medical_label, test_size= 0.2, random_state= 5354)
    
    model= Pipeline([
        ('vectorizer', vectorizer),
        ('model', model)
    ])

    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))

    y_pred= model.predict(x_test)
    print(classification_report(y_test, y_pred))

### COUNT VECTORIZER

#### Naive Bayes

In [4]:
help(data, CountVectorizer(), MultinomialNB())

0.6916666666666667
              precision    recall  f1-score   support

           0       0.78      0.84      0.81        56
           1       0.97      0.87      0.92        39
           2       0.49      0.56      0.52        34
           3       0.87      0.75      0.81        61
           4       0.38      0.40      0.39        50

    accuracy                           0.69       240
   macro avg       0.70      0.68      0.69       240
weighted avg       0.71      0.69      0.70       240



### COUNT VECTORIZER WITH BOW

##### bigram

#### Naive Bayes

In [5]:
help(data, CountVectorizer(ngram_range= (1, 2)), MultinomialNB())

0.6375
              precision    recall  f1-score   support

           0       0.74      0.80      0.77        56
           1       0.94      0.79      0.86        39
           2       0.44      0.47      0.46        34
           3       0.77      0.80      0.78        61
           4       0.26      0.24      0.25        50

    accuracy                           0.64       240
   macro avg       0.63      0.62      0.62       240
weighted avg       0.64      0.64      0.64       240



##### trigram

#### naive bayes

In [6]:
help(data, CountVectorizer(ngram_range= (1, 3)), MultinomialNB())

0.6291666666666667
              precision    recall  f1-score   support

           0       0.74      0.80      0.77        56
           1       0.94      0.79      0.86        39
           2       0.42      0.44      0.43        34
           3       0.77      0.80      0.78        61
           4       0.24      0.22      0.23        50

    accuracy                           0.63       240
   macro avg       0.62      0.61      0.61       240
weighted avg       0.63      0.63      0.63       240



## TFIDF VECTORIZER

##### Naive Bayes

In [7]:
help(data, TfidfVectorizer(), MultinomialNB())

0.6416666666666667
              precision    recall  f1-score   support

           0       0.71      0.86      0.77        56
           1       0.96      0.69      0.81        39
           2       0.43      0.26      0.33        34
           3       0.69      0.89      0.78        61
           4       0.36      0.32      0.34        50

    accuracy                           0.64       240
   macro avg       0.63      0.60      0.60       240
weighted avg       0.63      0.64      0.63       240



## TFIDF VECTORIZER WITH BOW

#### bigram

##### naive bayes

In [8]:
help(data, TfidfVectorizer(ngram_range= (1, 2)), MultinomialNB())

0.5875
              precision    recall  f1-score   support

           0       0.69      0.80      0.74        56
           1       0.96      0.62      0.75        39
           2       0.31      0.15      0.20        34
           3       0.62      0.87      0.73        61
           4       0.29      0.28      0.28        50

    accuracy                           0.59       240
   macro avg       0.57      0.54      0.54       240
weighted avg       0.58      0.59      0.57       240



#### trigram

#### naive bayes

In [9]:
help(data, TfidfVectorizer(ngram_range= (1, 3)), MultinomialNB())

0.5625
              precision    recall  f1-score   support

           0       0.70      0.79      0.74        56
           1       0.96      0.59      0.73        39
           2       0.21      0.09      0.12        34
           3       0.61      0.84      0.70        61
           4       0.25      0.28      0.27        50

    accuracy                           0.56       240
   macro avg       0.55      0.52      0.51       240
weighted avg       0.56      0.56      0.54       240



## SMOTE ANALYSIS

In [10]:
from imblearn.over_sampling import SMOTE 

In [11]:
def smote(dataframe, vectorizer, model):

    #features and target
    x= vectorizer.fit_transform(dataframe['transcription'])
    y= dataframe['medical_label']

    #smote apply
    smote = SMOTE()
    X_sm, y_sm = smote.fit_resample(x,y)
    
    #train test split
    x_train, x_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=5, stratify=y_sm)
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test)) 
    
    y_pred= model.predict(x_test)
    print(classification_report(y_test, y_pred))

In [13]:
#tfidf BOW
smote(data, TfidfVectorizer(ngram_range=(1, 2)), MultinomialNB())

0.7557755775577558
              precision    recall  f1-score   support

           0       0.73      0.87      0.79        60
           1       0.89      0.90      0.89        61
           2       0.74      0.84      0.78        61
           3       0.79      0.87      0.83        61
           4       0.53      0.30      0.38        60

    accuracy                           0.76       303
   macro avg       0.74      0.75      0.74       303
weighted avg       0.74      0.76      0.74       303



In [15]:
#tfidf
smote(data, TfidfVectorizer(), MultinomialNB())

0.7557755775577558
              precision    recall  f1-score   support

           0       0.75      0.88      0.81        60
           1       0.90      0.90      0.90        61
           2       0.73      0.75      0.74        61
           3       0.77      0.90      0.83        61
           4       0.54      0.33      0.41        60

    accuracy                           0.76       303
   macro avg       0.74      0.75      0.74       303
weighted avg       0.74      0.76      0.74       303



In [17]:
#countvectorizer
smote(data, CountVectorizer(), MultinomialNB())

0.7029702970297029
              precision    recall  f1-score   support

           0       0.70      0.87      0.78        60
           1       0.94      0.82      0.88        61
           2       0.67      0.66      0.66        61
           3       0.83      0.87      0.85        61
           4       0.35      0.30      0.32        60

    accuracy                           0.70       303
   macro avg       0.70      0.70      0.70       303
weighted avg       0.70      0.70      0.70       303



In [18]:
#countvectorizer with bow
smote(data, CountVectorizer(ngram_range=(1,2)), MultinomialNB())

0.6798679867986799
              precision    recall  f1-score   support

           0       0.70      0.80      0.74        60
           1       0.93      0.84      0.88        61
           2       0.68      0.59      0.63        61
           3       0.77      0.87      0.82        61
           4       0.32      0.30      0.31        60

    accuracy                           0.68       303
   macro avg       0.68      0.68      0.68       303
weighted avg       0.68      0.68      0.68       303

