In [1]:
import pandas as pd
import re, string, nltk
from nltk.corpus import stopwords
from imblearn.under_sampling import RandomUnderSampler

## Importing the dataset

In [2]:
dataset = pd.read_csv('drugsComTrain_raw.csv')

## text preprocessing

In [4]:

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = str(text)
    text = re.sub('['+string.punctuation+']', '', text)     
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

In [5]:
dataset['review_cleaned'] = dataset['review'].apply(clean_text)

In [7]:
dataset.head(5)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_cleaned
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,side effect take combination bystolic 5 mg fis...
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,son halfway fourth week intuniv became concern...
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,used take another oral contraceptive 21 pill c...
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,first time using form birth control i039m glad...
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,suboxone completely turned life around feel he...


## Creating the Bag of Words model

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 500) #max_features = 'n' to reduce the feature matrix
X = cv.fit_transform(dataset['review_cleaned'].to_list()).toarray()
y = dataset.iloc[:, 4].values

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Using undersampling to balance the data 

In [8]:
#UNDER SAMPLING
undersample = RandomUnderSampler(sampling_strategy='majority')
X_over, y_over = undersample.fit_resample(X, y)

In [9]:
#checking balance of data
from collections import Counter
Counter(y_over)

Counter({0: 21619,
         1: 6931,
         2: 6513,
         3: 5012,
         4: 8013,
         5: 6343,
         6: 9456,
         7: 18890,
         8: 27531,
         9: 5012})

## Splitting the dataset into the Training set and Test set

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size = 0.20)


## Training the Random Forest model on the Training set

## Predicting the Test set results

In [46]:
from sklearn.neighbors import KNeighborsClassifier
# Applying k = 3, default Minkowski distance metrics
model = KNeighborsClassifier(n_neighbors=3)
# Training the classifier
model.fit(X_train,y_train)

In [57]:
y_pred = model.predict(X_test[:5000])

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test[:1000], y_pred)
print(cm)
accuracy_score(y_test[:1000], y_pred)

[[59  6  6  3  7  3  3  2  0  0]
 [ 6 92 17  2  2  2  1  0  0  0]
 [ 4  3 72  0  2  3  3  1  1  0]
 [ 4  3 19 63  2  1  1  0  0  0]
 [ 6  4 14  2 79  3  3  2  0  0]
 [ 4  5 13  3  4 59  2  0  1  1]
 [ 2  6  9  5  6  1 75  2  1  1]
 [ 4  4  8  3  8  4  7 55  2  0]
 [ 7  4  9  7  7  3  6  3 47  2]
 [15 15 10 13  7 11  5 13 10  5]]


0.606

In [58]:
cm = confusion_matrix(y_test[:5000], y_pred)
print(cm)
accuracy_score(y_test[:5000], y_pred)

[[329  30  54  20  21  17  20   7   4   1]
 [ 26 394  54   7   8  14  11   5   2   0]
 [ 23  20 408   9   9  10  12   2   3   0]
 [ 19  21  81 374   9   8   7   0   6   0]
 [ 27  22  45  10 367  11   8   3   0   1]
 [ 13  15  60  13  17 330  11   2   4   1]
 [ 22  20  47  15  29  13 329   5   7   3]
 [ 35  25  41  24  29  19  33 250  15   1]
 [ 33  47  40  33  29  25  35  28 237   5]
 [ 56  65  47  50  48  56  51  63  50  35]]


0.6106