In [1]:
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from numpy import mean, std

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Tag describing which text field the features are built from; it is written
# to the 'feature-type' column of the results table.
#   '1' -> just the sentence
#   '2' -> all text
# NOTE(review): stored as a string while target_type is an int — confirm this
# is intended.
feature_type = '1'

In [4]:
# Tag describing the prediction target; it is written to the 'target-type'
# column of the results table.
#   1 -> pos
target_type = 1

In [5]:
# Read the two pre-split datasets from the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [6]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,neutral,zero
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak


In [7]:
# Fold the "neutral" class into "negative" so that two classes remain.
# (Alternative that was considered: dropping the neutral rows from both splits.)
dic = {"neutral": "negative"}

train = train.replace({"ASSOCIATION": dic})
test = test.replace({"ASSOCIATION": dic})

In [8]:
# Start the 'label' column as a copy of ASSOCIATION in both splits.
for frame in (train, test):
    frame['label'] = frame['ASSOCIATION']

In [9]:
# Remove rows without a sentence from both splits (modifies the frames in place).
for frame in (train, test):
    frame.dropna(subset=['sentence'], inplace=True)

In [10]:
# Load the running results table; the model cells below write rows into it and
# the last cell saves it back to the same file.
# NOTE(review): this fails if res.xlsx does not already exist.
res = pd.read_excel('res.xlsx')

In [11]:
def vectorize_bow(df_X, cv):
    """Turn raw documents into a dense bag-of-words DataFrame.

    Parameters
    ----------
    df_X : iterable
        Documents to vectorize; passed unchanged to ``cv.transform``.
    cv : fitted vectorizer
        Object providing ``transform`` (whose result has ``toarray``) and
        ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per feature name reported by
        ``cv``.
    """
    bow_X = cv.transform(df_X)
    # Pass the feature names directly. Wrapping them in a list makes pandas
    # build a one-level MultiIndex whose labels are 1-tuples rather than the
    # plain feature-name strings.
    bow_df = pd.DataFrame(data=bow_X.toarray(),
                          columns=cv.get_feature_names_out())
    return bow_df

def define_and_run_model(model, X_train, X_test, y_train):
    """Fit ``model`` on the training data and return its predictions for ``X_test``.

    The estimator passed in is fitted in place; no copy is made.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)

def show_result(y_pred, y_test):
    """Print the confusion matrix, the classification report and the accuracy.

    Note the argument order: predictions first, ground truth second. The
    metric functions are called with the ground truth first.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)
    report = classification_report(y_test, y_pred, digits=3)
    print(report)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    
def save_res(model_name, feature_type, target_type, y_pred_sr, y_test_sr, idx):
    """Store the metrics of one evaluation in row ``idx`` of the global ``res`` table.

    Accuracy plus weighted-average precision, recall and F1 are computed from
    ``y_test_sr`` (ground truth) and ``y_pred_sr`` (predictions) and written
    next to the model name and the feature/target tags.
    """
    weighted = {'average': 'weighted'}
    row = {
        'model-name': model_name,
        'feature-type': feature_type,
        'target-type': target_type,
        'accuracy': accuracy_score(y_test_sr, y_pred_sr),
        'precision': precision_score(y_test_sr, y_pred_sr, **weighted),
        'recall': recall_score(y_test_sr, y_pred_sr, **weighted),
        'f1-score': f1_score(y_test_sr, y_pred_sr, **weighted),
    }
    for column, value in row.items():
        res.at[idx, column] = value

In [12]:
# Encode the string labels as integers, numbered in order of first appearance
# in the training split.
possible_labels = train.label.unique()
label_dict = {possible_label: index
              for index, possible_label in enumerate(possible_labels)}

# A label that occurs only in the test split has no code. Stop here rather
# than carry a half-encoded column into the models.
unseen_labels = test.loc[~test.label.isin(list(label_dict)), 'label'].unique()
if len(unseen_labels):
    raise ValueError(
        'labels missing from the training split: {}'.format(list(unseen_labels)))

# Series.map plus an explicit cast gives an integer column without relying on
# Series.replace silently downcasting an object column, which pandas has
# deprecated.
train['label'] = train.label.map(label_dict).astype('int64')
test['label'] = test.label.map(label_dict).astype('int64')

In [13]:
# Stack train and test into a single frame; it is used below to fit the
# vocabulary and as the data for k-fold cross-validation.
df = pd.concat([train, test])

In [14]:
# Give the combined frame a fresh 0..n-1 index, discarding the old one.
df = df.reset_index(drop=True)
df

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE,label
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,negative,zero,0
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak,1
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-,0
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-,0
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak,1
...,...,...,...,...,...,...,...
1294,Metabolic syndrome (MetS) is a common multifac...,Analysis of 4 SNPs revealed a significant diff...,rs10757274,MetS,negative,-,0
1295,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs6152,endometrial cancer,positive,weak,1
1296,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1204038,endometrial cancer,positive,weak,1
1297,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1337082,endometrial cancer,positive,weak,1


In [15]:
# Case-sensitive unigram counts; terms that occur in more than 80% of the
# sentences are dropped, and no minimum document frequency is enforced.
# NOTE(review): the vocabulary is fitted on train + test combined.
bow_vectorizer = CountVectorizer(lowercase=False, 
                                 ngram_range=(1, 1),
                                 max_df=.80,
                                 min_df=1)
bow_vectorizer.fit(df['sentence'])

In [16]:
# Bag-of-words matrices for each split, built with the shared vectorizer.
train_bow_df, test_bow_df = (
    vectorize_bow(split['sentence'].tolist(), bow_vectorizer)
    for split in (train, test)
)

In [17]:
# Features and labels of the training split.
X_train, y_train = train_bow_df, train['label']

In [18]:
# Features and labels of the test split.
X_test, y_test = test_bow_df, test['label']

In [19]:
# Dimensions of the training feature matrix (rows, feature columns).
X_train.shape

(934, 2902)

In [20]:
# Number of training labels; should equal the row count of X_train.
y_train.shape

(934,)

In [21]:
# Dimensions of the test feature matrix (rows, feature columns).
X_test.shape

(365, 2902)

In [22]:
# Number of test labels; should equal the row count of X_test.
y_test.shape

(365,)

In [23]:
# Class balance of the test split.
test['label'].value_counts()

0    195
1    170
Name: label, dtype: int64

In [24]:
# Features and labels of the combined (train + test) frame, used for k-fold CV.
all_sentences = df['sentence'].tolist()
X = vectorize_bow(all_sentences, bow_vectorizer)
y = df['label']

In [25]:
# Shuffled, seeded k-fold splitter shared by all model evaluations.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=1)

In [26]:
def kfold_test(model, model_name, feature_type, target_type, idx):
    """Cross-validate ``model`` and record the averaged scores in ``res``.

    Works on the notebook-level feature frame ``X``, label series ``y`` and
    splitter ``kf``. For each fold the model is fitted on the training part
    and scored on the held-out part with weighted-average precision, recall
    and F1. The per-fold scores and their means are printed; the means and
    standard deviations are written to row ``idx`` of the global ``res`` table.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is re-fitted in place
        on every fold.
    model_name, feature_type, target_type
        Tags stored in the 'model-name', 'feature-type' and 'target-type'
        columns.
    idx
        Row label of ``res`` to write to.
    """
    pre_scores = []
    rec_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        # The splitter yields positional indices, so select by position on
        # both X and y. Label-based y[...] is only correct while y happens to
        # carry a clean 0..n-1 index.
        X_fold_train, X_fold_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_fold_train, y_fold_train)
        pred_values = model.predict(X_fold_test)

        pre_scores.append(precision_score(y_fold_test, pred_values, average='weighted'))
        rec_scores.append(recall_score(y_fold_test, pred_values, average='weighted'))
        f1_scores.append(f1_score(y_fold_test, pred_values, average='weighted'))

    # Average over the folds that were actually scored instead of dividing by
    # the global k, which can drift out of sync with the splitter.
    avg_pre_score = mean(pre_scores)
    avg_rec_score = mean(rec_scores)
    avg_f1_score = mean(f1_scores)

    std_pre_score = std(pre_scores)
    std_rec_score = std(rec_scores)
    std_f1_score = std(f1_scores)

    print('precision of each fold - {}'.format(pre_scores))
    print('Avg precision : {}'.format(avg_pre_score))

    print('recall of each fold - {}'.format(rec_scores))
    print('Avg recall : {}'.format(avg_rec_score))

    print('f1 of each fold - {}'.format(f1_scores))
    print('Avg f1 : {}'.format(avg_f1_score))

    res.at[idx, 'model-name'] = model_name
    res.at[idx, 'feature-type'] = feature_type
    res.at[idx, 'target-type'] = target_type

    res.at[idx, 'precision'] = avg_pre_score
    res.at[idx, 'precision-std'] = std_pre_score

    res.at[idx, 'recall'] = avg_rec_score
    res.at[idx, 'recall-std'] = std_rec_score

    res.at[idx, 'f1-score'] = avg_f1_score
    res.at[idx, 'f1-score-std'] = std_f1_score

In [27]:
%%time
# Support-vector classifier with an RBF kernel, cross-validated with
# kfold_test; the scores go to row len(res) of the results table.
smodel = SVC(kernel='rbf')
kfold_test(smodel, 'svm', feature_type, target_type, len(res))

precision of each fold - [0.9109919028340082, 0.8796984265734266, 0.8685925887018409, 0.8930873795235498, 0.8829367947015005]
Avg precision : 0.8870614184668654
recall of each fold - [0.9115384615384615, 0.8730769230769231, 0.8692307692307693, 0.8923076923076924, 0.8841698841698842]
Avg recall : 0.886064746064746
f1 of each fold - [0.9099397879567563, 0.8700519017739562, 0.8634967847093745, 0.8896189620575942, 0.8820002391430962]
Avg f1 : 0.8830215351281554
Wall time: 9.28 s


In [28]:
%%time
# L2-penalised logistic regression (newton-cg solver, C=1.2), seeded.
# NOTE(review): max_iter=20 is low and warnings are silenced earlier in the
# notebook — confirm that the solver actually converges.
lr = LogisticRegression(random_state=0,
                        penalty='l2',
                        solver='newton-cg',
                        C=1.2,
                        max_iter=20)
kfold_test(lr, 'LogisticRegression', feature_type, target_type, len(res))

precision of each fold - [0.9145833215937971, 0.8744873785634655, 0.8726098406098406, 0.8834615384615384, 0.8987157882185507]
Avg precision : 0.8887715734894386
recall of each fold - [0.9153846153846154, 0.8730769230769231, 0.8730769230769231, 0.8846153846153846, 0.8996138996138996]
Avg recall : 0.889153549153549
f1 of each fold - [0.9144043742298892, 0.8714432853846033, 0.8728286304768293, 0.883507956205526, 0.8989198895623476]
Avg f1 : 0.888220827171839
Wall time: 3.68 s


In [29]:
%%time
# Random forest of 100 trees. The seed makes the cross-validation scores
# reproducible from run to run, like the other seeded models in this notebook.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
kfold_test(rf_model, 'RandomForest', feature_type, target_type, len(res))

precision of each fold - [0.911551423453997, 0.8800285068765258, 0.8667138009049774, 0.8959861399861401, 0.8668058019409371]
Avg precision : 0.8842171346325156
recall of each fold - [0.9115384615384615, 0.8769230769230769, 0.8692307692307693, 0.8961538461538462, 0.8687258687258688]
Avg recall : 0.8845144045144044
f1 of each fold - [0.9095330687440991, 0.8748680324798217, 0.8669894526277506, 0.8941972645254708, 0.8668104082468723]
Avg f1 : 0.8824796453248028
Wall time: 8.18 s


In [30]:
%%time
# k-nearest-neighbours classifier using 2 neighbours.
# NOTE(review): an even neighbour count can give tied votes when there are two
# classes — confirm this choice.
knn_model = KNeighborsClassifier(n_neighbors=2)
kfold_test(knn_model, 'knn', feature_type, target_type, len(res))

precision of each fold - [0.7460587509843836, 0.8742108262108261, 0.7998223998223999, 0.7538461538461538, 0.8440231831156574]
Avg precision : 0.8035922627958841
recall of each fold - [0.7307692307692307, 0.8615384615384616, 0.7730769230769231, 0.7538461538461538, 0.8185328185328186]
Avg recall : 0.7875527175527176
f1 of each fold - [0.7363107355088111, 0.8627839926190751, 0.7804249084249084, 0.7538461538461538, 0.8236399664971095]
Avg f1 : 0.7914011513792116
Wall time: 906 ms


In [31]:
%%time
# Gaussian naive Bayes with default settings on the bag-of-words counts.
# NOTE(review): the features are word counts; check whether a Gaussian
# likelihood is the intended choice.
nb_model = GaussianNB()
kfold_test(nb_model, 'GaussianNB', feature_type, target_type, len(res))

precision of each fold - [0.836140848398913, 0.8187439229810601, 0.8373522729442313, 0.8505004498425551, 0.8254865652824838]
Avg precision : 0.8336448118898486
recall of each fold - [0.8076923076923077, 0.8076923076923077, 0.8076923076923077, 0.8307692307692308, 0.7915057915057915]
Avg recall : 0.8090703890703892
f1 of each fold - [0.8138387880235705, 0.8093561089235206, 0.8144273099920646, 0.83465454916976, 0.7979739680770609]
Avg f1 : 0.8140501448371953
Wall time: 975 ms


In [32]:
%%time
# Single decision tree with default settings, seeded with random_state=0.
dt_model = DecisionTreeClassifier(random_state=0)
kfold_test(dt_model, 'DecisionTree', feature_type, target_type, len(res))

precision of each fold - [0.8923945674747599, 0.8806089743589745, 0.833995841995842, 0.8917728105228107, 0.9024711656290604]
Avg precision : 0.8802486719962894
recall of each fold - [0.8884615384615384, 0.8807692307692307, 0.8346153846153846, 0.8923076923076924, 0.8996138996138996]
Avg recall : 0.8791535491535492
f1 of each fold - [0.8897278308545915, 0.8806756790341153, 0.8342918518334445, 0.8919819004524888, 0.9005200216153563]
Avg f1 : 0.8794394567579993
Wall time: 1.56 s


In [33]:
%%time
# Gradient boosting with 100 depth-1 trees and a learning rate of 1.0, seeded
# with random_state=0.
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
kfold_test(gb_model, 'GradientBoosting', feature_type, target_type, len(res))

precision of each fold - [0.9068052668052669, 0.862994993172508, 0.8494414414414414, 0.8914060298675683, 0.879350145805842]
Avg precision : 0.8779995754185252
recall of each fold - [0.9076923076923077, 0.8615384615384616, 0.85, 0.8923076923076924, 0.8803088803088803]
Avg recall : 0.8783694683694684
f1 of each fold - [0.9069979201126743, 0.8595848595848598, 0.8497065632907984, 0.8908910765857874, 0.8796966495176214]
Avg f1 : 0.8773754138183483
Wall time: 18.4 s


In [34]:
import os

# Write the table to a temporary file first and only then swap it into place.
# Deleting res.xlsx up front would lose every earlier result if the write
# failed, and would raise if the file were absent.
tmp_path = 'res.tmp.xlsx'
res.to_excel(tmp_path, index=False)
os.replace(tmp_path, 'res.xlsx')