In [1]:
import pandas as pd
import numpy as np
import nltk
import os
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt
import scikitplot as skplt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aom2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Print every entry of the "content" folder, one name per line.
for entry_name in os.listdir("content"):
    print(entry_name)

BinMLResultsada_tfidf_binmat.png
BinMLResultsgb_tfidf_binmat.png
BinMLResultsknn_tfidf_binmat.png
BinMLResultslr_tfidf_binmat.png
BinMLResultsnb_tfidf_binmat.png
BinMLResultsrf_tfidf_binmat.png
BinMLResultssvm_countvec_binmat.png
BinMLResultssvm_tfidf_binmat.png
final_testset.xlsx
final_trainset.xlsx


In [3]:
# Load the training and test spreadsheets from the "content" folder.
df_bin = pd.read_excel("content/final_trainset.xlsx")
df_test = pd.read_excel("content/final_testset.xlsx")

# Lower-case the abstracts with the vectorised string accessor instead of a
# per-row Python call.
df_bin['ABSTRACT'] = df_bin['ABSTRACT'].str.lower()
df_test['ABSTRACT'] = df_test['ABSTRACT'].str.lower()

# Kept as the cell's final expression so the label distribution is actually
# rendered; in the middle of the cell its result was silently discarded.
df_bin['binary2_label'].value_counts()

In [4]:
# Preview only the first few test abstracts; listing the whole column floods
# the notebook output with the full text of every row.
df_test['ABSTRACT'].head(10).tolist()

['all descriptions of the lambert-eaton syndrome are based on symptomatic patients. we incidentally encountered electromyographic evidence of this syndrome in a patient being studied for another reason. the patient had controlled hypothyroidism and smoked for 9 pack-years but never had symptoms of weakness or had evidence of cancer. detailed electrophysiological and serological evaluations were undertaken in this study. neuromuscular transmission tests were consistent with a presynaptic disorder. autoantibodies specific for p/q-type voltage-gated calcium channels (vgcc) were detected in the serum and persisted throughout 7 years of follow-up. multiple igg markers of thyrogastric autoimmunity also were detected. this demonstration of asymptomatic lambert-eaton syndrome persisting for 8 years provided novel insight into the natural history of the disorder. it is plausible that patients with a subclinical syndrome would have abrupt neuromuscular decompensation when challenged with medicat

In [28]:
def bin_model(model_type, vectortype,C=1,ngram_range=(1, 1),stop_words=None):
    """Train a bag-of-words classifier on ``df_bin`` and score it on ``df_test``.

    Both frames are read from the notebook namespace; each must provide the
    ``ABSTRACT`` (text) and ``binary2_label`` (target) columns.

    Parameters
    ----------
    model_type : str
        Classifier to fit: 'nb', 'lr', 'svm', 'knn', 'rf', 'ada' or 'gb'.
    vectortype : str
        Text vectorizer: 'tfidf' or 'countvec'.
    C : float, default 1
        Passed to ``LinearSVC``; only used when ``model_type == 'svm'``.
    ngram_range : tuple, default (1, 1)
        Forwarded to the vectorizer.
    stop_words : default None
        Forwarded to the vectorizer.

    Returns
    -------
    tuple
        ``(X_train, y_train, feature_names, metrics)``. ``metrics`` is a dict
        with keys ``accuracy``, ``f1_score``, ``precision`` and ``recall``,
        all computed on ``df_test``; the last three are macro-averaged.

    Raises
    ------
    ValueError
        If ``vectortype`` or ``model_type`` is not one of the supported names.
    """
    # Reject unknown names before doing any work. Without these checks the
    # if/elif chains below fall through and the function only fails later
    # with an UnboundLocalError on `vectorizer` / `model`.
    supported_vectorizers = ("tfidf", "countvec")
    supported_models = ("nb", "lr", "svm", "knn", "rf", "ada", "gb")
    if vectortype not in supported_vectorizers:
        raise ValueError(
            f"Unknown vectortype {vectortype!r}; expected one of {supported_vectorizers}"
        )
    if model_type not in supported_models:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_models}"
        )

    if vectortype == "tfidf":
        vectorizer = TfidfVectorizer(ngram_range=ngram_range,stop_words=stop_words)
    else:  # "countvec"
        vectorizer = CountVectorizer(ngram_range=ngram_range,stop_words=stop_words)

    # The vocabulary is learned from the training abstracts only.
    X_train = vectorizer.fit_transform(list(df_bin['ABSTRACT']))
    feature_names = vectorizer.get_feature_names_out()
    y_train = df_bin['binary2_label']

    if model_type == 'nb':
        model = MultinomialNB().fit(X_train, y_train)
    elif model_type == 'lr':
        model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    elif model_type == 'svm':
        model = LinearSVC(C=C).fit(X_train, y_train)
    elif model_type == 'knn':
        model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
    elif model_type == 'rf':
        model = RandomForestClassifier().fit(X_train, y_train)
    elif model_type == 'ada':
        model = AdaBoostClassifier(n_estimators=50).fit(X_train, y_train)
    else:  # 'gb'
        model = GradientBoostingClassifier(n_estimators=50).fit(X_train, y_train)

    print('Training Accuracy:', model.score(X_train, y_train))

    # The test abstracts are projected onto the vocabulary fitted above.
    X_test = vectorizer.transform(list(df_test['ABSTRACT']))
    y_test = df_test['binary2_label']
    y_pred = model.predict(X_test)

    acc=accuracy_score(y_test, y_pred)
    f1=f1_score(y_test, y_pred, average="macro")
    pre=precision_score(y_test, y_pred, average="macro")
    rc=recall_score(y_test, y_pred, average="macro")

    return X_train,y_train,feature_names,{"accuracy":acc,"f1_score":f1,"precision": pre,"recall":rc}

In [29]:
# Sweep the 'svm' model over C, vectorizer type and n-gram range with English
# stop words removed, collecting one row of test metrics per configuration.
c_values = [0.01,0.05,0.1,0.5,1,2]
vectorizer_kinds = ["tfidf","countvec"]
ngram_ranges = [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3)]

results=[]
for C in c_values:
    for vec in vectorizer_kinds:
        for ngram_range in ngram_ranges:
            X_train,y_train,feature_names,r=bin_model(
                'svm', vec, C=C, ngram_range=ngram_range, stop_words='english'
            )
            # f1_score appears twice in every row; this 8-value layout is kept
            # unchanged because the row length must match the column labels
            # used when the results are turned into a DataFrame.
            row = [r["accuracy"],r["f1_score"],r["f1_score"],r["precision"],r["recall"],C,vec,ngram_range]
            results.append(row)
            print(C,vec,ngram_range,r)




Training Accuracy: 0.726836657409745




Training Accuracy: 0.6586720525119919




Training Accuracy: 0.5746023731380965


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Accuracy: 0.605023983842464




Training Accuracy: 0.5746023731380965


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Accuracy: 0.5746023731380965


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Accuracy: 0.9925523857611714




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 0.82239333501641




Training Accuracy: 0.8375410249936884




Training Accuracy: 0.778969957081545




Training Accuracy: 0.8390557939914163




Training Accuracy: 0.7298661954052007




Training Accuracy: 0.5941681393587478




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 0.861777328957334




Training Accuracy: 0.8911890936632164




Training Accuracy: 0.928174703357738




Training Accuracy: 0.9072203988891694




Training Accuracy: 0.9556930068164605




Training Accuracy: 0.9916687705124968




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 0.9628881595556678




Training Accuracy: 0.9959606160060591




Training Accuracy: 0.9997475385003787




Training Accuracy: 0.9983590002524615




Training Accuracy: 0.9998737692501893




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 0.9864933097702601




Training Accuracy: 0.9997475385003787




Training Accuracy: 1.0




Training Accuracy: 0.9998737692501893




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 0.9972229235041656




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0




Training Accuracy: 1.0


In [31]:
# Every row in `results` holds f1_score twice, so the frame is first built with
# the matching eight labels. The repeated column is then dropped: with two
# identically named columns, rdf["f1_score"] would return a DataFrame instead
# of a Series and the exported table would carry redundant data.
rdf=pd.DataFrame(results,columns=["accuracy","f1_score","f1_score","precision","recall","C","vectorize","ngram_range"])
rdf=rdf.loc[:, ~rdf.columns.duplicated()]

In [34]:
# Write the results table to a CSV file in the current working directory.
rdf.to_csv(path_or_buf="remove_stopwords_multigram_testings.csv")

NameError: name 'hi' is not defined