<a href="https://colab.research.google.com/github/emondsarker/classifying-medical-misinfo-with-Machine-Learning/blob/main/SVM_works.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import time
import re

In [2]:
!pip install nltk
nltk.download('all')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [3]:
np.random.seed(500)

In [4]:
Corpus = pd.read_csv("/content/drive/MyDrive/cse445_medical_misinfo/NLP.csv")

In [5]:
Corpus['Translation'].astype(str)

0       Eat the leaves of diabetes tree / gynura tree,...
1       No morning or afternoon insulin or tablets, th...
2       Removed by Qur'anic practices and Unani and Ho...
3       Masturbation leads to premature ejaculation, w...
4                  Masturbation causes regular headaches.
                              ...                        
1490    Psoriasis, iron deficiency anemia, diabetes, h...
1491    Cut nails after shower. Always keep nails clea...
1492    When going to a salon for nail care, make sure...
1493    The corners of both sides of the nail cannot b...
1494    Shoes that cause pain or pressure on the nails...
Name: Translation, Length: 1495, dtype: object

In [6]:
Corpus.head()

Unnamed: 0,Translation,Target
0,"Eat the leaves of diabetes tree / gynura tree,...",1
1,"No morning or afternoon insulin or tablets, th...",1
2,Removed by Qur'anic practices and Unani and Ho...,1
3,"Masturbation leads to premature ejaculation, w...",1
4,Masturbation causes regular headaches.,1


In [7]:
Corpus['Translation'] = Corpus['Translation'].fillna('').apply(str)

In [8]:
cleanedData = []

lemma = WordNetLemmatizer()
swords = stopwords.words("english")
for text in Corpus["Translation"]:
    
    # Cleaning links
    text = re.sub(r'http\S+', '', text)
    
    # Cleaning everything except alphabetical and numerical characters
    text = re.sub("[^a-zA-Z0-9]"," ",text)
    
    # Tokenizing and lemmatizing
    text = nltk.word_tokenize(text.lower())
    text = [lemma.lemmatize(word) for word in text]
    
    # Removing stopwords
    text = [word for word in text if word not in swords]
    
    # Joining
    text = " ".join(text)
    
    cleanedData.append(text)

In [9]:
for i in range(0,5):
    print(cleanedData[i],end="\n\n")

eat leaf diabetes tree gynura tree keep diabetes control

morning afternoon insulin tablet time diabetes completely controlled leaf plant rich foreign medicinal property

removed qur anic practice unani homeo treatment insha allah physical problem 1 sexual impotence 2 physical weakness 3 meh promeh disease 4 diabetes disease 5 jaundice

masturbation lead premature ejaculation hinders fertility full happiness marriage

masturbation cause regular headache



In [10]:
vectorizer = CountVectorizer(max_features=10000)
BOW = vectorizer.fit_transform(cleanedData)

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(BOW,np.asarray(Corpus["Target"]))

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1121, 3741)
(374, 3741)
(1121,)
(374,)


In [12]:
from sklearn.svm import SVC
start_time = time.time()

model = SVC()
model.fit(x_train,y_train)

end_time = time.time()
process_time = round(end_time-start_time,2)
print("Fitting SVC took {} seconds".format(process_time))

Fitting SVC took 0.26 seconds


In [13]:
predictions = model.predict(x_test)

In [14]:
from sklearn.metrics import accuracy_score,confusion_matrix

print("Accuracy of model is {}%".format(accuracy_score(y_test,predictions) * 100))

Accuracy of model is 77.00534759358288%


In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [16]:
model = MultinomialNB().fit(x_train, y_train)

In [17]:
y_pred = model.predict(x_test)

In [18]:
print('NB Accuracy:', accuracy_score(y_test,predictions))


NB Accuracy: 0.7700534759358288


In [19]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.74      0.83      0.78       187
           1       0.81      0.71      0.76       187

    accuracy                           0.77       374
   macro avg       0.77      0.77      0.77       374
weighted avg       0.77      0.77      0.77       374



In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [21]:
logisticRegr = LogisticRegression()

In [22]:
logisticRegr.fit(x_train, y_train)

LogisticRegression()

In [23]:
logisticRegr.predict(x_test[0].reshape(1,-1))

array([0])

In [24]:
predictions = logisticRegr.predict(x_test)

In [25]:
score = logisticRegr.score(x_test, y_test)
print(score)

0.7941176470588235


In [33]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
DecisionTree_Class_Model= DecisionTreeClassifier()

In [38]:
DecisionTree_Class_Model.fit(x_train,y_train)

DecisionTreeClassifier()

In [39]:
predictions = DecisionTree_Class_Model.predict(x_test)

In [40]:
from sklearn.metrics import accuracy_score,confusion_matrix

print("Accuracy of model is {}%".format(accuracy_score(y_test,predictions) * 100))

Accuracy of model is 69.78609625668449%


In [60]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
#create new a knn model
knn = KNeighborsClassifier()
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(x_train, y_train)


KNeighborsClassifier(n_neighbors=2)

In [61]:
predictions = neigh.predict(x_test)

In [62]:
from sklearn.metrics import accuracy_score,confusion_matrix

print("Accuracy of model is {}%".format(accuracy_score(y_test,predictions) * 100))

Accuracy of model is 55.61497326203209%


In [76]:
from sklearn.ensemble import RandomForestClassifier
#create a new random forest classifier
clf = RandomForestClassifier(max_depth=30, random_state=0)

clf.fit(x_train, y_train)

RandomForestClassifier(max_depth=30, random_state=0)

In [77]:
predictions = clf.predict(x_test)

In [78]:
from sklearn.metrics import accuracy_score,confusion_matrix

print("Accuracy of model is {}%".format(accuracy_score(y_test,predictions) * 100))

Accuracy of model is 74.06417112299465%


In [84]:
print("knn: {}".format(neigh.score(x_test, y_test)))
print("rf: {}".format(clf.score(x_test, y_test)))
print("Log reg: {}".format(logisticRegr.score(x_test, y_test)))
print("Dec tree: {}".format(DecisionTree_Class_Model.score(x_test, y_test)))


knn: 0.5561497326203209
rf: 0.7406417112299465
Log reg: 0.7941176470588235
Dec tree: 0.6978609625668449


In [86]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [88]:
kfold = model_selection.KFold(n_splits=10, random_state=7,shuffle=True)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=7)
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold)
print(results.mean())

0.7573719974715549


In [91]:
from sklearn.ensemble import AdaBoostClassifier
seed = 7
num_trees = 70
kfold = model_selection.KFold(n_splits=10, random_state=seed,shuffle=True)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, x_train,y_train , cv=kfold)
print(results.mean())

0.7430704804045513


In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = model_selection.KFold(n_splits=10, random_state=seed,shuffle=True)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, x_train, y_train, cv=kfold)
print(results.mean())

0.7912768647281921
