In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import mean, std
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

In [2]:
# Collects one [fpr, tpr, legend_label] entry per evaluated model; read by the
# ROC plotting cell at the end of the notebook.
results = []


def calc_roc(confusion, name, model, x_train, y_train, x_test, y_test):
    """Compute the ROC curve and AUC of a fitted binary classifier and store them.

    Appends ``[fpr, tpr, "<name>, AUC=<auc>"]`` to the notebook-level
    ``results`` list.

    Parameters
    ----------
    confusion : array exposing ``ravel()``; must hold exactly four cells
        (a 2x2 confusion matrix), which restricts this helper to binary tasks.
    name : str, label used in the plot legend.
    model : fitted estimator exposing ``predict_proba``.
    x_train, y_train : accepted for call compatibility; not used here.
    x_test, y_test : held-out features and true labels.

    Raises
    ------
    ValueError
        If ``confusion`` does not contain exactly four cells.
    """
    # Explicit form of the original four-way unpacking, which raised the same
    # exception type for anything other than a 2x2 matrix.
    if confusion.ravel().size != 4:
        raise ValueError("calc_roc expects a 2x2 (binary) confusion matrix")

    # Score the test set once and reuse it for both the curve and the AUC.
    positive_scores = model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc = roc_auc_score(y_test, positive_scores)
    results.append([fpr, tpr, name + ", AUC={:.3f}".format(auc)])

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Tag stored in the 'feature-type' column of the results table.
# Note: it is only a label here; the cells below always vectorize the `sentence` column.
feature_type = '1'

# '1' -> just sentence
# '2' -> all text

In [5]:
# Tag stored in the 'target-type' column of the results table.
target_type = 1
# 1 -> pos (presumably: only rows with a positive ASSOCIATION are kept -- confirm)

In [6]:
# Load both provided splits from the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [7]:
# Preview the first rows to check which columns were loaded.
train.head()

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,neutral,zero
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak


In [8]:
# The classification target is the CONFIDENCE column; keep a working copy in `label`.
for frame in (train, test):
    frame['label'] = frame['CONFIDENCE']

In [9]:
# Rows without a sentence cannot be vectorized, so discard them in both splits.
train = train.dropna(subset=['sentence'])
test = test.dropna(subset=['sentence'])

In [10]:
# Class balance of the association polarity in the training split.
train['ASSOCIATION'].value_counts()

positive    701
neutral     142
negative     91
Name: ASSOCIATION, dtype: int64

In [11]:
# Distribution of the confidence levels before any filtering.
train['CONFIDENCE'].value_counts()

weak        375
strong      218
zero        142
moderate    108
-            91
Name: CONFIDENCE, dtype: int64

In [12]:
# Keep only positively associated pairs. `.copy()` gives each split its own
# data, so the later `label` re-assignment writes to a real frame instead of
# a filtered view (which pandas flags as chained assignment).
train = train.loc[train.ASSOCIATION == 'positive'].copy()
test = test.loc[test.ASSOCIATION == 'positive'].copy()

In [13]:
# Confidence distribution among the remaining (positive) training rows.
train['CONFIDENCE'].value_counts()

weak        375
strong      218
moderate    108
Name: CONFIDENCE, dtype: int64

In [14]:
# Running results table: each model cell below writes one row into it and the
# table is saved back to res.xlsx near the end of the notebook.
# The file must already exist in the working directory.
res = pd.read_excel('res.xlsx')

In [15]:
def vectorize_bow(df_X, cv):
    """Turn raw documents into a dense bag-of-words DataFrame.

    Parameters
    ----------
    df_X : iterable of str
        Documents to transform.
    cv : fitted vectorizer
        Must expose ``transform`` (returning a sparse matrix) and
        ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term, with a flat
        column index holding the term strings.
    """
    bow_X = cv.transform(df_X)
    # Pass the feature names directly: wrapping them in a list makes pandas
    # build a MultiIndex whose labels are 1-tuples instead of plain strings.
    return pd.DataFrame(data=bow_X.toarray(),
                        columns=cv.get_feature_names_out())

def define_and_run_model(model, X_train, X_test, y_train):
    """Fit ``model`` on the training split and return its predictions for ``X_test``.

    The estimator passed in is fitted in place (no copy is made).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

def show_result(y_pred, y_test):
    """Print the confusion matrix, a 3-digit classification report and the accuracy.

    Note the argument order: predictions first, ground truth second.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    report = classification_report(y_test, y_pred, digits=3)
    print(report)

    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    
def save_res(model_name, feature_type, target_type, y_pred_sr, y_test_sr, idx):
    """Store single-split scores for one model in row ``idx`` of the ``res`` table.

    Precision, recall and F1 are support-weighted averages over the classes;
    accuracy is stored as well. ``res`` is the notebook-level results frame.
    """
    weighted = {
        'precision': precision_score(y_test_sr, y_pred_sr, average='weighted'),
        'recall': recall_score(y_test_sr, y_pred_sr, average='weighted'),
        'f1-score': f1_score(y_test_sr, y_pred_sr, average='weighted'),
    }

    # Written in this order so any newly created columns appear in the same
    # sequence as before.
    row = [
        ('model-name', model_name),
        ('feature-type', feature_type),
        ('target-type', target_type),
        ('accuracy', accuracy_score(y_test_sr, y_pred_sr)),
        ('precision', weighted['precision']),
        ('recall', weighted['recall']),
        ('f1-score', weighted['f1-score']),
    ]
    for column, value in row:
        res.at[idx, column] = value

In [16]:
# Integer-encode the confidence labels in order of first appearance in `train`.
# NOTE(review): a label that occurs only in `test` is not in the mapping and
# is left as its original string.
possible_labels = train.label.unique()
label_dict = {label: code for code, label in enumerate(possible_labels)}

train['label'] = train['label'].replace(label_dict)
test['label'] = test['label'].replace(label_dict)

In [17]:
# Pool both splits into one frame: the evaluation below runs K-fold
# cross-validation over the pooled rows rather than using the original split.
df = pd.concat([train, test])

In [18]:
# Give the pooled frame a fresh 0..n-1 index (the old, overlapping row labels are dropped).
df = df.reset_index(drop=True)
df

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE,label
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak,0
1,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak,0
2,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,smaller birth length,positive,weak,0
3,OBJECTIVE: Maternal smoking during pregnancy i...,The maternal T-allele of rs1051730 was associ...,rs1051730,lower third trimester estimated fetal weight [...,positive,weak,0
4,OBJECTIVE: Maternal smoking during pregnancy i...,The maternal T-allele of rs1051730 was associ...,rs1051730,birth weight,positive,weak,0
...,...,...,...,...,...,...,...
866,Metabolic syndrome (MetS) is a common multifac...,MetS and control allelic frequencies for rs238...,rs1333049,MetS,positive,moderate,2
867,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs6152,endometrial cancer,positive,weak,0
868,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1204038,endometrial cancer,positive,weak,0
869,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1337082,endometrial cancer,positive,weak,0


In [19]:
# Case-sensitive unigram counts; terms present in more than 80% of the sentences are dropped.
# NOTE(review): the vocabulary is fitted on the pooled data, so every CV fold's
# held-out sentences have already influenced it -- confirm this is acceptable.
bow_settings = dict(lowercase=False, ngram_range=(1, 1), max_df=.80, min_df=1)
bow_vectorizer = CountVectorizer(**bow_settings)
bow_vectorizer.fit(df['sentence'])

In [20]:
# train_bow_df  = vectorize_bow(list(train['sentence']), bow_vectorizer)
# test_bow_df   = vectorize_bow(list(test['sentence']), bow_vectorizer)

In [21]:
# X_train = train_bow_df
# y_train = train['label']

In [22]:
# X_test = test_bow_df
# y_test = test['label']

In [23]:
# X_train.shape

In [24]:
# y_train.shape

In [25]:
# X_test.shape

In [26]:
# y_test.shape

In [27]:
# test['label'].value_counts()

In [28]:
# Dense bag-of-words features and integer targets for the pooled data.
X = vectorize_bow(df['sentence'].tolist(), bow_vectorizer)
y = df.label

In [29]:
# 5-fold cross-validation; shuffling is seeded so the folds are the same on every run.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=1)

In [None]:
def kfold_test(model, model_name, feature_type, target_type, idx):
    """Cross-validate ``model`` and record its averaged scores in the ``res`` table.

    Relies on notebook-level state: the feature frame ``X``, the label Series
    ``y``, the splitter ``kf`` and the results DataFrame ``res``.

    For every split produced by ``kf.split(X)`` the model is re-fitted on the
    training rows and scored on the held-out rows with support-weighted
    precision, recall and F1. Per-fold scores and their averages are printed;
    the mean and the standard deviation (numpy ``std``) of each metric are
    written to row ``idx`` of ``res``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name, feature_type, target_type : stored verbatim in the
        'model-name', 'feature-type' and 'target-type' columns.
    idx : row label of ``res`` to write to.

    Returns
    -------
    None
    """
    pre_scores = []
    rec_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        # kf.split yields positional indices, so select the labels by position
        # as well; plain y[...] is label-based and only lines up with X when y
        # happens to carry a 0..n-1 index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        pre_scores.append(precision_score(y_test, pred_values, average='weighted'))
        rec_scores.append(recall_score(y_test, pred_values, average='weighted'))
        f1_scores.append(f1_score(y_test, pred_values, average='weighted'))

    # Divide by the number of folds actually scored instead of the global `k`,
    # so the averages stay correct if `kf` is reconfigured independently.
    avg_pre_score = sum(pre_scores) / len(pre_scores)
    avg_rec_score = sum(rec_scores) / len(rec_scores)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    std_pre_score = std(pre_scores)
    std_rec_score = std(rec_scores)
    std_f1_score = std(f1_scores)

    print('precision of each fold - {}'.format(pre_scores))
    print('Avg precision : {}'.format(avg_pre_score))

    print('recall of each fold - {}'.format(rec_scores))
    print('Avg recall : {}'.format(avg_rec_score))

    print('f1 of each fold - {}'.format(f1_scores))
    print('Avg f1 : {}'.format(avg_f1_score))

    res.at[idx, 'model-name'] = model_name
    res.at[idx, 'feature-type'] = feature_type
    res.at[idx, 'target-type'] = target_type

    res.at[idx, 'precision'] = avg_pre_score
    res.at[idx, 'precision-std'] = std_pre_score

    res.at[idx, 'recall'] = avg_rec_score
    res.at[idx, 'recall-std'] = std_rec_score

    res.at[idx, 'f1-score'] = avg_f1_score
    res.at[idx, 'f1-score-std'] = std_f1_score
    
#     confusion_matrix = metrics.confusion_matrix(yy, predict)
#     calc_roc(confusion_matrix, model_name, SVC_linear_classifier, x_train, y_train, x_test, y_test)

In [None]:
%%time
# Support-vector classifier with an RBF kernel; scores are written to row len(res).
smodel = SVC(kernel='rbf')
kfold_test(model=smodel,
           model_name='svm',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

In [None]:
%%time
# L2-regularised logistic regression fitted with the newton-cg solver.
# NOTE(review): max_iter=20 is a small iteration budget and any convergence
# warning is hidden by the notebook's blanket warning filter -- confirm intended.
lr_settings = dict(random_state=0,
                   penalty='l2',
                   solver='newton-cg',
                   C=1.2,
                   max_iter=20)
lr = LogisticRegression(**lr_settings)
kfold_test(model=lr,
           model_name='LogisticRegression',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

In [None]:
%%time
# Random forest with 100 trees. Seeded like the other stochastic models in this
# notebook so the recorded scores are reproducible across runs.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
kfold_test(rf_model, 'RandomForest', feature_type, target_type, len(res))

In [None]:
%%time
# k-nearest-neighbours classifier voting over the 2 closest training sentences.
knn_model = KNeighborsClassifier(n_neighbors=2)
kfold_test(model=knn_model,
           model_name='knn',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

In [None]:
%%time
# Gaussian naive Bayes applied to the dense term-count features.
nb_model = GaussianNB()
kfold_test(model=nb_model,
           model_name='GaussianNB',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

In [None]:
%%time
# Single decision tree with a fixed seed.
dt_model = DecisionTreeClassifier(random_state=0)
kfold_test(model=dt_model,
           model_name='DecisionTree',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

In [None]:
%%time
# Gradient boosting over 100 depth-1 trees (stumps) with learning rate 1.0.
gb_settings = dict(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gb_model = GradientBoostingClassifier(**gb_settings)
kfold_test(model=gb_model,
           model_name='GradientBoosting',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

In [None]:
import os
# Write the table to a temporary workbook first and only then swap it into
# place: deleting res.xlsx up front would lose the previous results if the
# export failed part-way.
tmp_results_path = 'res.tmp.xlsx'
res.to_excel(tmp_results_path, index=False)
os.replace(tmp_results_path, 'res.xlsx')

In [None]:
# Make sure the output folder exists; savefig does not create it.
os.makedirs('res', exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 6))

# Each entry of `results` is [fpr, tpr, legend_label]. The loop deliberately
# avoids the name `res`, which would overwrite the results DataFrame.
for fpr, tpr, legend_label in results:
    ax.plot(fpr, tpr, label=legend_label)

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='orange', linestyle='--')

axis_ticks = np.arange(0.0, 1.1, step=0.1)

ax.set_xticks(axis_ticks)
ax.set_xlabel("False Positive Rate", fontsize=15)

ax.set_yticks(axis_ticks)
ax.set_ylabel("True Positive Rate", fontsize=15)

ax.set_title('ROC Curves', fontweight='bold', fontsize=15)
ax.legend(prop={'size': 13}, loc='lower right')

fig.savefig('res/roc_curves.png', dpi=500)
plt.show()