In [1]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
# Load the labelled corpus. A forward slash is used as the separator because it
# is accepted on Windows as well as Linux/macOS, whereas a backslash only works
# on Windows.
dataset = pd.read_csv('datas/final_dataset.csv')

In [3]:
# Show the loaded frame to inspect its columns and a sample of rows.
dataset

Unnamed: 0,text,labels
0,What is (are) Glaucoma ?,MEDICAL
1,What causes Glaucoma ?,MEDICAL
2,What are the symptoms of Glaucoma ?,MEDICAL
3,What are the treatments for Glaucoma ?,MEDICAL
4,What is (are) Glaucoma ?,MEDICAL
...,...,...
82818,have a quiz in spanish class tonday .. but i ...,NON-MEDICAL
82819,@mittec no problems with it as such but I had ...,NON-MEDICAL
82820,Ow my head,NON-MEDICAL
82821,Anyway... Not particularly looking foreward to...,NON-MEDICAL


In [6]:
# Patterns are raw strings: in a normal literal '\[', '\]' and '\|' are invalid
# escape sequences (DeprecationWarning, SyntaxWarning on Python 3.12+). The
# compiled regexes are unchanged.

# Separator-like punctuation: / ( ) { } [ ] | @ , ;
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is NOT a digit, a lowercase ASCII letter, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from the NLTK corpus, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise a raw string before it is vectorised.

        Steps, in order: lowercase the input, turn every character matched by
        REPLACE_BY_SPACE_RE into a space, drop every character matched by
        BAD_SYMBOLS_RE, then discard whitespace-separated tokens that appear
        in STOPWORDS.

        text: a string

        return: the remaining tokens joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [9]:
# pandas reads empty CSV cells as NaN, and str(NaN) is the literal string 'nan',
# which would otherwise survive cleaning and enter the vocabulary as a token.
# Blank missing values first, then coerce any remaining non-string values to
# str so clean_text can call .lower() on them.
dataset['text'] = dataset['text'].fillna('').apply(str).apply(clean_text)

In [10]:
# Show the frame again to check the effect of clean_text on the 'text' column.
dataset

Unnamed: 0,text,labels
0,glaucoma,MEDICAL
1,causes glaucoma,MEDICAL
2,symptoms glaucoma,MEDICAL
3,treatments glaucoma,MEDICAL
4,glaucoma,MEDICAL
...,...,...
82818,quiz spanish class tonday think pass,NON-MEDICAL
82819,mittec problems literally spent 2 hours cleani...,NON-MEDICAL
82820,ow head,NON-MEDICAL
82821,anyway particularly looking foreward labs toda...,NON-MEDICAL


In [12]:
# Features are the cleaned text column; targets are the class labels.
X, y = dataset['text'], dataset['labels']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Sanity check: shapes of the train/test features and labels, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((66258,), (16565,), (66258,), (16565,))

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Text classifier: token counts -> TF-IDF re-weighting -> multinomial Naive Bayes.
# The step names ('vect', 'tfidf', 'clf') are kept as-is because they are how
# the individual stages are addressed on the fitted pipeline.
nb = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

# Train on the training split only; the fitted pipeline is the cell's output.
nb.fit(X_train, y_train)

In [16]:
%%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=['MEDICAL', 'NON-MEDICAL']))

accuracy 0.9917295502565651
              precision    recall  f1-score   support

     MEDICAL       0.99      1.00      0.99      8563
 NON-MEDICAL       1.00      0.99      0.99      8002

    accuracy                           0.99     16565
   macro avg       0.99      0.99      0.99     16565
weighted avg       0.99      0.99      0.99     16565

CPU times: total: 1.67 s
Wall time: 1.67 s


In [20]:
import pickle

In [21]:
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling raises; a bare open() left the handle
# dangling.
# Only the pipeline is saved: text passed to the reloaded model must first go
# through the same clean_text preprocessing that was applied before training.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(nb, model_file)