In [1]:
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from numpy import mean, std

In [2]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any convergence or undefined-metric warnings raised by the models
# and metrics used below. Consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Identifier of the text source used for the features:
#   '1' -> just sentence
#   '2' -> all text
# In this notebook the value is only recorded in the results table; the
# vectorization cells always read the `sentence` column.
feature_type = '1'

In [4]:
# Identifier of the prediction target, recorded in the results table.
#   1 -> pos  (NOTE(review): presumably the association label; confirm the meaning)
target_type = 1

In [5]:
# Load the pre-split training and test sets from the working directory.
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [6]:
# Preview the first training rows.
train.head()

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,neutral,zero
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak


In [7]:
# Use the ASSOCIATION column as the prediction target; it is integer-encoded
# in a later cell.
train['label'] = train['ASSOCIATION']
test['label']  = test['ASSOCIATION']

In [8]:
# Drop rows without a sentence, since the sentence is the text that gets
# vectorized below. Both frames are modified in place.
train.dropna(subset=['sentence'], inplace=True)
test.dropna(subset=['sentence'], inplace=True)

In [9]:
# Load the running results table. Each kfold_test call below adds a row to it
# and the last cell writes it back to res.xlsx. The workbook must already
# exist, otherwise this read fails.
res = pd.read_excel('res.xlsx')

In [10]:
def vectorize_bow(df_X, cv):
    """Turn raw documents into a dense bag-of-words DataFrame.

    Parameters
    ----------
    df_X : iterable of str
        Documents to encode.
    cv : fitted vectorizer
        Object exposing ``transform`` and ``get_feature_names_out`` (the
        notebook passes a fitted CountVectorizer).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term, with the
        terms as plain column labels.
    """
    bow_X = cv.transform(df_X)
    # Pass the names directly: wrapping them in a list makes pandas build a
    # one-level MultiIndex whose labels are tuples instead of the term strings.
    return pd.DataFrame(data=bow_X.toarray(),
                        columns=cv.get_feature_names_out())

def define_and_run_model(model, X_train, X_test, y_train):
    """Fit ``model`` on the training split and return its predictions for ``X_test``."""
    model.fit(X_train, y_train)
    return model.predict(X_test)

def show_result(y_pred, y_test):
    """Print the confusion matrix, the per-class report (3 digits) and the accuracy.

    Note the argument order: predictions first, ground truth second.
    """
    reports = (
        (confusion_matrix, {}),
        (classification_report, {'digits': 3}),
        (accuracy_score, {}),
    )
    # Each metric is computed right before it is printed, in the listed order.
    for metric, options in reports:
        print(metric(y_test, y_pred, **options))
    
def save_res(model_name, feature_type, target_type, y_pred_sr, y_test_sr, idx):
    """Write one evaluation row into the notebook-level ``res`` table.

    The row at label ``idx`` receives the run metadata, the accuracy and the
    weighted precision, recall and F1 of ``y_pred_sr`` against ``y_test_sr``.
    """
    weighted = {
        column: scorer(y_test_sr, y_pred_sr, average='weighted')
        for column, scorer in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1-score', f1_score),
        )
    }
    row = (
        ('model-name', model_name),
        ('feature-type', feature_type),
        ('target-type', target_type),
        ('accuracy', accuracy_score(y_test_sr, y_pred_sr)),
        ('precision', weighted['precision']),
        ('recall', weighted['recall']),
        ('f1-score', weighted['f1-score']),
    )
    # Written one cell at a time, in this column order.
    for column, value in row:
        res.at[idx, column] = value

In [11]:
# Give every association class an integer code, numbered in order of first
# appearance in the training labels, and apply the same mapping to both splits.
possible_labels = train.label.unique()
label_dict = {label: code for code, label in enumerate(possible_labels)}

train['label'] = train['label'].replace(label_dict)
test['label'] = test['label'].replace(label_dict)

In [12]:
# Stack train and test into one frame; it feeds the vectorizer vocabulary and
# the k-fold experiments below.
df = pd.concat([train, test])

In [13]:
# Renumber the rows 0..n-1 and discard the index labels inherited from the two
# source frames, then display the combined frame.
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE,label
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,neutral,zero,0
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak,1
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-,2
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-,2
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak,1
...,...,...,...,...,...,...,...
1294,Metabolic syndrome (MetS) is a common multifac...,Analysis of 4 SNPs revealed a significant diff...,rs10757274,MetS,negative,-,2
1295,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs6152,endometrial cancer,positive,weak,1
1296,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1204038,endometrial cancer,positive,weak,1
1297,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1337082,endometrial cancer,positive,weak,1


In [14]:
# Case-sensitive unigram bag-of-words; terms that occur in more than 80% of
# the sentences are dropped, everything else is kept (min_df=1).
# NOTE(review): the vocabulary and the max_df cut-off are learned from `df`,
# i.e. train and test together, so test-set tokens shape the features used in
# the hold-out evaluation. Confirm this is intended.
bow_vectorizer = CountVectorizer(lowercase=False, 
                                 ngram_range=(1, 1),
                                 max_df=.80,
                                 min_df=1)
bow_vectorizer.fit(df['sentence'])

In [15]:
# Encode the sentences of each split with the shared, already fitted vectorizer.
train_bow_df = vectorize_bow(train['sentence'].tolist(), bow_vectorizer)
test_bow_df = vectorize_bow(test['sentence'].tolist(), bow_vectorizer)

In [16]:
# Training features (bag-of-words counts) and integer-encoded labels.
X_train = train_bow_df
y_train = train['label']

In [17]:
# Held-out test features and integer-encoded labels.
X_test = test_bow_df
y_test = test['label']

In [18]:
# Baseline: RBF-kernel SVM fitted on the training split and evaluated on the
# held-out test split. `xx` is the classifier, `xxx` its test predictions.
# NOTE(review): in the recorded output no sample is predicted as class 0.
xx = SVC(kernel='rbf')
xxx = define_and_run_model(xx,  X_train, X_test, y_train)
show_result(xxx, y_test)

[[  0 160   6]
 [  0 170   0]
 [  0  27   2]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       166
           1      0.476     1.000     0.645       170
           2      0.250     0.069     0.108        29

    accuracy                          0.471       365
   macro avg      0.242     0.356     0.251       365
weighted avg      0.242     0.471     0.309       365

0.4712328767123288


In [19]:
# Sanity check: training feature matrix is (rows, vocabulary terms).
X_train.shape

(934, 2902)

In [20]:
# Sanity check: one training label per feature row.
y_train.shape

(934,)

In [21]:
# Sanity check: test features have the same number of columns as X_train.
X_test.shape

(365, 2902)

In [22]:
# Sanity check: one test label per feature row.
y_test.shape

(365,)

In [23]:
# Class balance of the encoded test labels.
test['label'].value_counts()

1    170
0    166
2     29
Name: label, dtype: int64

In [24]:
# Features and labels for the combined (train + test) frame; these are the
# notebook-level inputs read by kfold_test.
X = vectorize_bow(list(df['sentence']), bow_vectorizer)
y = df['label']

In [25]:
# 5-fold cross-validation over individual rows, shuffled with a fixed seed.
# NOTE(review): the frame preview shows consecutive rows that share the same
# abstract and sentence, so near-duplicate rows may land on both sides of a
# split and inflate the scores. Consider grouping folds by abstract; confirm.
k = 5
kf = KFold(n_splits=k, random_state=1, shuffle=True)

In [26]:
def kfold_test(model, model_name, feature_type, target_type, idx):
    """Cross-validate ``model`` and store the averaged scores in row ``idx`` of ``res``.

    Relies on notebook-level state: the feature frame ``X``, the label Series
    ``y``, the splitter ``kf`` and the results table ``res``.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refitted on every fold.
    model_name, feature_type, target_type
        Metadata written to the 'model-name', 'feature-type' and
        'target-type' columns.
    idx
        Row label of ``res`` to write to.

    For each fold the weighted precision, recall and F1 are computed. The
    per-fold values and their means are printed, and the mean and standard
    deviation of each metric are written to ``res``.
    """
    # (printed name, results column, scoring function) for every tracked metric.
    metrics = (
        ('precision', 'precision', precision_score),
        ('recall', 'recall', recall_score),
        ('f1', 'f1-score', f1_score),
    )
    fold_scores = {label: [] for label, _, _ in metrics}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        # kf.split yields positions, so select the labels positionally too.
        # Plain y[...] is label-based and only works while y has a 0..n-1 index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        for label, _, scorer in metrics:
            fold_scores[label].append(
                scorer(y_test, pred_values, average='weighted'))

    # Average over the folds actually run rather than the notebook-level `k`,
    # which would go stale if `kf` were rebuilt with a different n_splits.
    averages = {label: sum(scores) / len(scores)
                for label, scores in fold_scores.items()}

    for label, _, _ in metrics:
        print('{} of each fold - {}'.format(label, fold_scores[label]))
        print('Avg {} : {}'.format(label, averages[label]))

    res.at[idx, 'model-name'] = model_name
    res.at[idx, 'feature-type'] = feature_type
    res.at[idx, 'target-type'] = target_type

    for label, column, _ in metrics:
        res.at[idx, column] = averages[label]
        res.at[idx, column + '-std'] = std(fold_scores[label])

In [27]:
%%time
# RBF-kernel SVM, cross-validated; scores go to row `len(res)` of `res`.
smodel = SVC(kernel='rbf')
kfold_test(smodel, 'svm', feature_type, target_type, len(res))

precision of each fold - [0.8826867576867578, 0.8661083540115798, 0.8539255189255188, 0.884989523168506, 0.842160473632555]
Avg precision : 0.8659741254849834
recall of each fold - [0.8846153846153846, 0.8461538461538461, 0.8538461538461538, 0.8846153846153846, 0.8532818532818532]
Avg recall : 0.8645025245025245
f1 of each fold - [0.8734002807982543, 0.8292292813569411, 0.8422148172504643, 0.877699796178057, 0.8419727460585182]
Avg f1 : 0.852903384328447
Wall time: 10.3 s


In [28]:
%%time
# L2-regularised logistic regression, cross-validated; scores go to row
# `len(res)` of `res`.
# NOTE(review): max_iter=20 is a low iteration cap, and warnings are silenced
# in this notebook, so a convergence warning would not be shown. Confirm the
# solver converges.
lr = LogisticRegression(random_state=0,
                        penalty='l2',
                        solver='newton-cg',
                        C=1.2,
                        max_iter=20)
kfold_test(lr, 'LogisticRegression', feature_type, target_type, len(res))

precision of each fold - [0.9124646330680812, 0.8468074111079867, 0.8546107099328546, 0.8709817737511107, 0.8862606165127174]
Avg precision : 0.8742250288745501
recall of each fold - [0.9153846153846154, 0.85, 0.8576923076923076, 0.8730769230769231, 0.888030888030888]
Avg recall : 0.8768369468369468
f1 of each fold - [0.9103220213407881, 0.8429077314240238, 0.8548702181991307, 0.8712973690359991, 0.8867690079094005]
Avg f1 : 0.8732332695818684
Wall time: 3.86 s


In [29]:
%%time
# Random forest with 100 trees, cross-validated; scores go to row `len(res)`
# of `res`. The forest is seeded like the other stochastic models in this
# notebook so that the recorded scores can be reproduced on a re-run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
kfold_test(rf_model, 'RandomForest', feature_type, target_type, len(res))

precision of each fold - [0.8999608126846326, 0.8439710578438903, 0.8654504392850257, 0.878628912071535, 0.853787026837454]
Avg precision : 0.8683596497445075
recall of each fold - [0.9038461538461539, 0.8461538461538461, 0.8692307692307693, 0.8807692307692307, 0.8648648648648649]
Avg recall : 0.8729729729729729
f1 of each fold - [0.897066872192664, 0.837807637126145, 0.8651039687129911, 0.8782746296899326, 0.8562394031931989]
Avg f1 : 0.8668985021829864
Wall time: 9.61 s


In [30]:
%%time
# k-nearest neighbours with 2 neighbours, cross-validated; scores go to row
# `len(res)` of `res`.
knn_model = KNeighborsClassifier(n_neighbors=2)
kfold_test(knn_model, 'knn', feature_type, target_type, len(res))

precision of each fold - [0.7829458476846537, 0.8236061780217414, 0.8147989066770793, 0.7649389399471234, 0.8377442672037054]
Avg precision : 0.8048068279068605
recall of each fold - [0.7923076923076923, 0.8269230769230769, 0.823076923076923, 0.7730769230769231, 0.8455598455598455]
Avg recall : 0.8121888921888922
f1 of each fold - [0.7799178270647481, 0.8194358841812964, 0.8165778878953451, 0.7572555358071451, 0.8378919850829962]
Avg f1 : 0.8022158240063062
Wall time: 845 ms


In [31]:
%%time
# Gaussian naive Bayes on the bag-of-words counts, cross-validated; scores go
# to row `len(res)` of `res`.
nb_model = GaussianNB()
kfold_test(nb_model, 'GaussianNB', feature_type, target_type, len(res))

precision of each fold - [0.8303024850768385, 0.7918669871794871, 0.8328221226319525, 0.8431219479017887, 0.814914155105404]
Avg precision : 0.822605539579094
recall of each fold - [0.8038461538461539, 0.7846153846153846, 0.8, 0.8269230769230769, 0.7799227799227799]
Avg recall : 0.7990614790614791
f1 of each fold - [0.8108735377470079, 0.7857007964830058, 0.8091406400616927, 0.8306423800080517, 0.7910507357315869]
Avg f1 : 0.805481618006269
Wall time: 990 ms


In [32]:
%%time
# Seeded decision tree, cross-validated; scores go to row `len(res)` of `res`.
dt_model = DecisionTreeClassifier(random_state=0)
kfold_test(dt_model, 'DecisionTree', feature_type, target_type, len(res))

precision of each fold - [0.875219652142729, 0.8356994990763996, 0.867071710340941, 0.8615488677988679, 0.8771777280605532]
Avg precision : 0.8633434914838981
recall of each fold - [0.8769230769230769, 0.8384615384615385, 0.8615384615384616, 0.8615384615384616, 0.8725868725868726]
Avg recall : 0.8622096822096822
f1 of each fold - [0.8751682692307692, 0.8328387662724033, 0.8636404621020006, 0.8601640695370204, 0.874445141778172]
Avg f1 : 0.861251341784073
Wall time: 2.75 s


In [33]:
%%time
# Gradient boosting with 100 depth-1 trees (stumps) and learning rate 1.0,
# cross-validated; scores go to row `len(res)` of `res`.
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
kfold_test(gb_model, 'GradientBoosting', feature_type, target_type, len(res))

precision of each fold - [0.8750895688713487, 0.8290453180096644, 0.8616046994480174, 0.8830186480186479, 0.899809053869664]
Avg precision : 0.8697134576434685
recall of each fold - [0.8769230769230769, 0.8307692307692308, 0.8615384615384616, 0.8846153846153846, 0.8918918918918919]
Avg recall : 0.8691476091476092
f1 of each fold - [0.8757735558033621, 0.8263184140880558, 0.8612037320125555, 0.8832909296609289, 0.8947009047813796]
Avg f1 : 0.8682575072692564
Wall time: 1min 4s


In [34]:
import os

# Persist the updated results table. The workbook is first written to a
# temporary file and then swapped into place, so a failed export can never
# leave the notebook without res.xlsx. Deleting the old file before writing
# would lose the accumulated results if the write failed.
tmp_results_path = 'res.tmp.xlsx'  # keeps the .xlsx suffix so pandas selects an Excel writer
res.to_excel(tmp_results_path, index=False)
os.replace(tmp_results_path, 'res.xlsx')