In [1]:
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from numpy import mean, std

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides solver-convergence and
# undefined-metric warnings raised by the models below.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Feature-set tag that is stored with every result row.
#   1 -> just sentence
#   2 -> all text
# NOTE(review): the visible cells always vectorize the 'text' column, so this
# value appears to act only as a label in the results table - confirm.
feature_type = '2'

In [4]:
# Target tag that is stored with every result row.
#   1 -> pos
# NOTE(review): unlike feature_type this tag is an int, not a string - confirm
# that the results table expects that.
target_type = 1

In [5]:
# Read the two pre-split tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [6]:
# Peek at the first rows of the training table.
train.head()

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,neutral,zero
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak


In [7]:
# Keep a working copy of the raw ASSOCIATION tag in a separate 'label' column
# for both splits; 'label' is what gets encoded and used as the target later.
for frame in (train, test):
    frame['label'] = frame['ASSOCIATION']

In [8]:
# Alternative kept for reference: drop the neutral rows instead of merging them.
# train = train[~train.ASSOCIATION.isin(['neutral'])]
# test  = test[~test.ASSOCIATION.isin(['neutral'])]

# Fold the "neutral" tag into "negative" in the ASSOCIATION column of both splits.
# NOTE(review): 'label' was copied from ASSOCIATION before this cell, so it is
# not affected and still carries the original three tags - confirm intended.
dic = {"neutral": "negative"}

for frame in (train, test):
    frame.replace({"ASSOCIATION": dic}, inplace=True)

In [9]:
# Discard rows that lack either text field; done in place on both splits.
required_columns = ['sentence', 'text']
for frame in (train, test):
    frame.dropna(subset=required_columns, inplace=True)

In [10]:
# Load the running results table. The functions below add rows to this global
# frame, and the last cell writes it back to the same file, so 'res.xlsx'
# must already exist when this cell runs.
res = pd.read_excel('res.xlsx')

In [11]:
def vectorize_bow(df_X, cv):
    """Turn raw documents into a dense bag-of-words DataFrame.

    Parameters
    ----------
    df_X : iterable
        Documents handed to ``cv.transform``.
    cv : object
        Fitted vectorizer exposing ``transform`` (whose result supports
        ``toarray``) and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per feature name.
    """
    bow_X = cv.transform(df_X)
    # Pass the feature names directly. Wrapping them in a list made pandas
    # build a one-level MultiIndex whose labels are 1-tuples, which breaks
    # plain column access such as frame['word'].
    return pd.DataFrame(data=bow_X.toarray(),
                        columns=cv.get_feature_names_out())

def define_and_run_model(model, X_train, X_test, y_train):
    """Fit ``model`` on the training data and return its predictions for ``X_test``.

    The estimator passed in is fitted in place (no copy is made).
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)

def show_result(y_pred, y_test):
    """Print the confusion matrix, the 3-digit classification report and the accuracy.

    Note the argument order: predictions first, ground truth second.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    report = classification_report(y_test, y_pred, digits=3)
    print(report)

    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    
def save_res(model_name, feature_type, target_type, y_pred_sr, y_test_sr, idx):
    """Record one evaluation in row ``idx`` of the global ``res`` table.

    Writes the three identifying tags, the accuracy, and the weighted-average
    precision, recall and F1 of ``y_pred_sr`` against ``y_test_sr``.
    """
    weighted_scores = {
        'precision': precision_score(y_test_sr, y_pred_sr, average='weighted'),
        'recall': recall_score(y_test_sr, y_pred_sr, average='weighted'),
        'f1-score': f1_score(y_test_sr, y_pred_sr, average='weighted'),
    }
    # Insertion order matters: it fixes the order in which any missing
    # columns are created in ``res``.
    row = {
        'model-name': model_name,
        'feature-type': feature_type,
        'target-type': target_type,
        'accuracy': accuracy_score(y_test_sr, y_pred_sr),
        **weighted_scores,
    }
    for column, value in row.items():
        res.at[idx, column] = value

In [12]:
# Give every distinct training label an integer code, numbered in order of
# first appearance in the training split.
possible_labels = train.label.unique()
label_dict = {possible_label: index
              for index, possible_label in enumerate(possible_labels)}

# replace() used to downcast the object column to integers silently; that
# downcast is deprecated in pandas, so infer_objects() is called explicitly
# to keep the encoded labels as an integer column on newer versions too.
# Labels that occur only in the test split are left unchanged by replace().
train['label'] = train.label.replace(label_dict).infer_objects()
test['label']  = test.label.replace(label_dict).infer_objects()

In [13]:
# Stack both splits into one frame (training rows first). The row index values
# of the two splits are carried over unchanged by this call.
df = pd.concat([train, test])

In [14]:
# Give the combined frame a fresh 0..n-1 index (the old one is discarded),
# then display it.
df = df.reset_index(drop=True)
df

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE,label
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,negative,zero,0
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak,1
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-,2
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-,2
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak,1
...,...,...,...,...,...,...,...
1294,Metabolic syndrome (MetS) is a common multifac...,Analysis of 4 SNPs revealed a significant diff...,rs10757274,MetS,negative,-,2
1295,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs6152,endometrial cancer,positive,weak,1
1296,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1204038,endometrial cancer,positive,weak,1
1297,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1337082,endometrial cancer,positive,weak,1


In [15]:
# Case-sensitive unigram counts; terms present in more than 80% of the
# documents are dropped, and no minimum document frequency is enforced.
# NOTE(review): the vocabulary is learned from df, i.e. train and test
# together, so test-split vocabulary leaks into the features - confirm this
# is acceptable.
bow_options = dict(lowercase=False,
                   ngram_range=(1, 1),
                   max_df=.80,
                   min_df=1)
bow_vectorizer = CountVectorizer(**bow_options)
bow_vectorizer.fit(df['text'])

In [16]:
# Bag-of-words matrices for each split, built with the shared fitted vectorizer.
train_bow_df, test_bow_df = (
    vectorize_bow(list(split['text']), bow_vectorizer)
    for split in (train, test)
)

In [17]:
# Training features and their encoded targets.
X_train, y_train = train_bow_df, train['label']

In [18]:
# Held-out features and their encoded targets.
X_test, y_test = test_bow_df, test['label']

In [19]:
# Shape check: (training documents, vocabulary terms).
X_train.shape

(934, 7335)

In [20]:
# Shape check: one target per training document.
y_train.shape

(934,)

In [21]:
# Shape check: (test documents, vocabulary terms).
X_test.shape

(365, 7335)

In [22]:
# Shape check: one target per test document.
y_test.shape

(365,)

In [23]:
# Class balance of the encoded labels in the test split.
test['label'].value_counts()

1    170
0    166
2     29
Name: label, dtype: int64

In [24]:
# Features and targets for the whole corpus (train + test); these globals are
# what kfold_test splits.
X, y = vectorize_bow(list(df['text']), bow_vectorizer), df['label']

In [25]:
# Shuffled 5-fold splitter with a fixed seed, shared by every model below.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=1)

In [26]:
def kfold_test(model, model_name, feature_type, target_type, idx):
    """Cross-validate ``model`` on the global ``X``/``y`` and log the summary.

    For every split produced by the global ``kf`` the model is refitted on the
    training fold and scored on the held-out fold with weighted-average
    precision, recall and F1. The per-fold scores and their averages are
    printed, and the mean and standard deviation of each metric are written
    to row ``idx`` of the global ``res`` table.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refitted in place on each training fold.
    model_name, feature_type, target_type
        Tags stored alongside the scores in ``res``.
    idx
        Row label of ``res`` that receives the values.

    Returns
    -------
    None
    """
    # (label used in the printed output, column name in ``res``, scorer)
    metrics = (
        ('precision', 'precision', precision_score),
        ('recall', 'recall', recall_score),
        ('f1', 'f1-score', f1_score),
    )
    fold_scores = {label: [] for label, _, _ in metrics}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        # kf.split yields positions, so select the targets positionally as
        # well; plain y[...] is label-based and only matched the rows of X
        # while y happened to have a 0..n-1 index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        for label, _, scorer in metrics:
            fold_scores[label].append(
                scorer(y_test, pred_values, average='weighted'))

    res.at[idx, 'model-name'] = model_name
    res.at[idx, 'feature-type'] = feature_type
    res.at[idx, 'target-type'] = target_type

    for label, column, _ in metrics:
        scores = fold_scores[label]
        # Average over the folds actually scored rather than the global ``k``,
        # so the result stays correct if ``kf`` is reconfigured on its own.
        avg_score = sum(scores) / len(scores)

        print('{} of each fold - {}'.format(label, scores))
        print('Avg {} : {}'.format(label, avg_score))

        res.at[idx, column] = avg_score
        res.at[idx, column + '-std'] = std(scores)

In [27]:
%%time
# RBF-kernel support vector classifier. idx=len(res) addresses the row right
# after the current last one in the results table.
smodel = SVC(kernel='rbf')
kfold_test(model=smodel,
           model_name='svm',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

precision of each fold - [0.8288782951307252, 0.7795506535947713, 0.7680551806867597, 0.8100207719110157, 0.7949341670271902]
Avg precision : 0.7962878136700924
recall of each fold - [0.8307692307692308, 0.7576923076923077, 0.7692307692307693, 0.8115384615384615, 0.7915057915057915]
Avg recall : 0.7921473121473122
f1 of each fold - [0.8253893065819671, 0.7295711970487897, 0.7318281233329778, 0.7980671528828954, 0.7845161476563346]
Avg f1 : 0.773874385500593
Wall time: 35.9 s


In [28]:
%%time
# L2-regularised logistic regression solved with newton-cg.
# NOTE(review): max_iter=20 is small and warnings are silenced in this
# notebook, so a non-converged fit would go unnoticed - confirm.
lr = LogisticRegression(penalty='l2',
                        C=1.2,
                        solver='newton-cg',
                        max_iter=20,
                        random_state=0)
kfold_test(model=lr,
           model_name='LogisticRegression',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

precision of each fold - [0.835283177235273, 0.7922413227775252, 0.8024121989121988, 0.8377924788924198, 0.8258709903871194]
Avg precision : 0.8187200336409072
recall of each fold - [0.823076923076923, 0.8, 0.8153846153846154, 0.8384615384615385, 0.8262548262548263]
Avg recall : 0.8206355806355807
f1 of each fold - [0.8265189295141472, 0.7859297209990143, 0.8040570246608978, 0.8357499414341896, 0.8245671286160208]
Avg f1 : 0.815364549044854
Wall time: 22.2 s


In [29]:
%%time
# Random forest with 100 trees. The seed is fixed, like the other seeded
# estimators in this notebook, so the cross-validation scores are reproducible
# from run to run; without it every execution grew a different forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
kfold_test(rf_model, 'RandomForest', feature_type, target_type, len(res))

precision of each fold - [0.842062484904803, 0.8012079378774806, 0.8032029537626552, 0.8434446483546675, 0.8258709903871194]
Avg precision : 0.8231578030573452
recall of each fold - [0.8346153846153846, 0.8038461538461539, 0.8192307692307692, 0.8461538461538461, 0.8262548262548263]
Avg recall : 0.826020196020196
f1 of each fold - [0.8372945796739668, 0.791944985712788, 0.8051891381769076, 0.8431259572304417, 0.8245671286160208]
Avg f1 : 0.8204243578820251
Wall time: 27.5 s


In [30]:
%%time
# 2-nearest-neighbours classifier on the bag-of-words counts.
knn_model = KNeighborsClassifier(n_neighbors=2)
kfold_test(model=knn_model,
           model_name='knn',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

precision of each fold - [0.718380883299421, 0.7604782521035623, 0.762948717948718, 0.7016808786889848, 0.801849333099333]
Avg precision : 0.7490676130280038
recall of each fold - [0.7038461538461539, 0.7538461538461538, 0.7615384615384615, 0.6923076923076923, 0.7915057915057915]
Avg recall : 0.7406088506088506
f1 of each fold - [0.7095693935693934, 0.7398401024938489, 0.7521068906279824, 0.690618548151088, 0.7893357221527745]
Avg f1 : 0.7362941313990173
Wall time: 2.19 s


In [31]:
%%time
# Gaussian naive Bayes with default settings.
nb_model = GaussianNB()
kfold_test(model=nb_model,
           model_name='GaussianNB',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

precision of each fold - [0.8279952443807865, 0.7865883365138131, 0.8227153421834273, 0.830719196042579, 0.8037644127869692]
Avg precision : 0.8143565063815151
recall of each fold - [0.7346153846153847, 0.7576923076923077, 0.7307692307692307, 0.7730769230769231, 0.7258687258687259]
Avg recall : 0.7444045144045144
f1 of each fold - [0.7566172429921999, 0.7631233323655868, 0.7573806662698448, 0.7839310689310689, 0.7451860127339149]
Avg f1 : 0.7612476646585231
Wall time: 2.59 s


In [32]:
%%time
# Single decision tree with a fixed seed.
dt_model = DecisionTreeClassifier(random_state=0)
kfold_test(model=dt_model,
           model_name='DecisionTree',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

precision of each fold - [0.8553516237671245, 0.8059747875225919, 0.7935421721136006, 0.8095350319032419, 0.8307815204366927]
Avg precision : 0.8190370271486505
recall of each fold - [0.8384615384615385, 0.8115384615384615, 0.8076923076923077, 0.8076923076923077, 0.8262548262548263]
Avg recall : 0.8183278883278883
f1 of each fold - [0.842063598099204, 0.8010262429276513, 0.7955537032626816, 0.8074492432631966, 0.8283909826137488]
Avg f1 : 0.8148967540332965
Wall time: 9.3 s


In [33]:
%%time
# Gradient boosting over 100 depth-1 trees with learning rate 1.0 and a fixed
# seed. This is by far the slowest cell in the recorded run.
gb_model = GradientBoostingClassifier(n_estimators=100,
                                      learning_rate=1.0,
                                      max_depth=1,
                                      random_state=0)
kfold_test(model=gb_model,
           model_name='GradientBoosting',
           feature_type=feature_type,
           target_type=target_type,
           idx=len(res))

precision of each fold - [0.8365072584862926, 0.775052186790803, 0.7498210155857213, 0.7982066192331556, 0.8163774341570651]
Avg precision : 0.7951929028506075
recall of each fold - [0.8307692307692308, 0.7769230769230769, 0.7653846153846153, 0.7961538461538461, 0.7992277992277992]
Avg recall : 0.7936917136917137
f1 of each fold - [0.8330096780244177, 0.7663392137076349, 0.749969181459566, 0.7958368123921714, 0.8054679481209299]
Avg f1 : 0.790124566740944
Wall time: 3min 44s


In [34]:
import os

# Persist the results table. The workbook is written under a temporary name
# and then swapped into place, so a failed write can no longer leave the
# notebook without its previous 'res.xlsx' (the old code deleted the file
# before writing, and raised if it was already missing).
# The temporary name keeps the .xlsx suffix because pandas picks the Excel
# writer from the file extension.
tmp_path = 'res.tmp.xlsx'
res.to_excel(tmp_path, index=False)
os.replace(tmp_path, 'res.xlsx')