In [1]:
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from numpy import mean, std


In [2]:
import warnings
# Silence every warning for the rest of the session. This is a blanket
# filter, so warnings raised by pandas or scikit-learn in the cells below
# are hidden as well.
warnings.filterwarnings("ignore")

In [3]:
# Feature-set tag stored with each result row (the 'feature-type' column).
# It is a label only: the vectorizer cells below read df['text'] regardless
# of this value.
feature_type = '2'

# 1 -> just the sentence
# 2 -> all text

In [4]:
# Target tag stored with each result row (the 'target-type' column).
target_type = 1
# 1 -> pos (presumably: rows whose ASSOCIATION is 'positive' -- confirm)

In [5]:
# Load the two CSV splits from the current working directory.
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [6]:
train.head()

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,neutral,zero
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak


In [7]:
# Use the CONFIDENCE column as the classification target ('label').
train['label'] = train['CONFIDENCE']
test['label']  = test['CONFIDENCE']

In [8]:
# Discard rows that have no 'text' value.
train = train.dropna(subset=['text'])
test = test.dropna(subset=['text'])

In [9]:
# Keep only the rows annotated with a positive association. The explicit
# .copy() makes each result an independent frame, so the later assignment to
# its 'label' column is not a write into a slice of the unfiltered frame
# (which is what triggers pandas' SettingWithCopyWarning).
train = train[train.ASSOCIATION == 'positive'].copy()
test = test[test.ASSOCIATION == 'positive'].copy()

In [10]:
# Load the running results table. The workbook must already exist; each
# model cell below writes its scores under row label len(res).
res = pd.read_excel('res.xlsx')

In [11]:
def vectorize_bow(df_X, cv):
    """Convert raw documents into a dense bag-of-words DataFrame.

    Parameters
    ----------
    df_X : iterable of str
        Documents to transform.
    cv : fitted vectorizer
        Object exposing ``transform`` (whose result has ``toarray``) and
        ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term, with a
        default integer row index.
    """
    bow_X = cv.transform(df_X)
    # Pass the feature names directly. Wrapping them in a list makes pandas
    # build a one-level MultiIndex, so every column label becomes a 1-tuple
    # such as ('gene',) instead of the plain string 'gene'.
    bow_df = pd.DataFrame(data=bow_X.toarray(),
                          columns=cv.get_feature_names_out())
    return bow_df

def define_and_run_model(model, X_train, X_test, y_train):
    """Fit ``model`` on the training data and predict the test data.

    The estimator passed in is fitted in place (no copy is made).

    Returns whatever ``model.predict(X_test)`` returns.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

def show_result(y_pred, y_test):
    """Print the confusion matrix, the per-class report and the accuracy.

    Note the argument order: predictions first, ground truth second.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)
    report = classification_report(y_test, y_pred, digits=3)
    print(report)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    
def save_res(model_name, feature_type, target_type, y_pred_sr, y_test_sr, idx):
    """Write one evaluation row into the module-level ``res`` table.

    Accuracy plus weighted precision, recall and F1 are computed from
    ``y_test_sr`` (ground truth) and ``y_pred_sr`` (predictions) and stored,
    together with the three descriptive tags, under row label ``idx``.
    """
    weighted = {'average': 'weighted'}
    row = {
        'model-name': model_name,
        'feature-type': feature_type,
        'target-type': target_type,
        'accuracy': accuracy_score(y_test_sr, y_pred_sr),
        'precision': precision_score(y_test_sr, y_pred_sr, **weighted),
        'recall': recall_score(y_test_sr, y_pred_sr, **weighted),
        'f1-score': f1_score(y_test_sr, y_pred_sr, **weighted),
    }
    # Cell-by-cell assignment, in the same column order as before.
    for column, value in row.items():
        res.at[idx, column] = value

In [12]:
# Build the label -> integer code mapping from the labels of BOTH splits.
# Codes follow first appearance with the train rows first, so every label
# present in train keeps the code it would get from train alone; a label that
# occurs only in test now gets a code too, instead of being left as a raw
# string in an otherwise numeric column.
possible_labels = pd.concat([train.label, test.label]).unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

# Every value in either column is a key of label_dict, so map() encodes the
# whole column.
train['label'] = train.label.map(label_dict)
test['label']  = test.label.map(label_dict)

In [13]:
# Stack train and test into one frame; the cross-validation below runs on it.
df = pd.concat([train, test])

In [14]:
# Renumber the rows 0..n-1 (the old index is discarded), then display.
df = df.reset_index(drop=True)
df

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE,label
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak,0
1,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak,0
2,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,smaller birth length,positive,weak,0
3,OBJECTIVE: Maternal smoking during pregnancy i...,The maternal T-allele of rs1051730 was associ...,rs1051730,lower third trimester estimated fetal weight [...,positive,weak,0
4,OBJECTIVE: Maternal smoking during pregnancy i...,The maternal T-allele of rs1051730 was associ...,rs1051730,birth weight,positive,weak,0
...,...,...,...,...,...,...,...
867,Metabolic syndrome (MetS) is a common multifac...,MetS and control allelic frequencies for rs238...,rs1333049,MetS,positive,moderate,2
868,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs6152,endometrial cancer,positive,weak,0
869,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1204038,endometrial cancer,positive,weak,0
870,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1337082,endometrial cancer,positive,weak,0


In [15]:
# Unigram count vectorizer. Case is preserved (lowercase=False) and terms
# that appear in more than 80% of the documents are left out (max_df=.80).
bow_vectorizer = CountVectorizer(lowercase=False, 
                                 ngram_range=(1, 1),
                                 max_df=.80,
                                 min_df=1)
# NOTE(review): the vocabulary is learned from the combined train + test
# frame, not from train alone -- confirm this is intended.
bow_vectorizer.fit(df['text'])

In [16]:
# Encode each split with the shared, already-fitted vectorizer.
train_bow_df = vectorize_bow(train['text'].tolist(), bow_vectorizer)
test_bow_df = vectorize_bow(test['text'].tolist(), bow_vectorizer)

In [17]:
# Training features and targets.
X_train, y_train = train_bow_df, train['label']

In [18]:
# Test features and targets.
X_test, y_test = test_bow_df, test['label']

In [19]:
X_train.shape

(702, 6622)

In [20]:
y_train.shape

(702,)

In [21]:
X_test.shape

(170, 6622)

In [22]:
y_test.shape

(170,)

In [23]:
test['label'].value_counts()

0    140
2     16
1     14
Name: label, dtype: int64

In [24]:
# Targets and bag-of-words features for the combined (train + test) frame.
y = df['label']
X = vectorize_bow(df['text'].tolist(), bow_vectorizer)

In [25]:
# 5-fold cross-validation. Rows are shuffled with a fixed seed, so the fold
# assignment is the same on every run.
k = 5
kf = KFold(n_splits=k, random_state=1, shuffle=True)

In [26]:
def kfold_test(model, model_name, feature_type, target_type, idx):
    """Cross-validate ``model`` and record the averaged scores in ``res``.

    The folds come from the module-level splitter ``kf`` applied to the
    module-level feature frame ``X`` and target series ``y``. For each fold,
    ``model.fit`` is called on the training part and the held-out part is
    scored with weighted precision, recall and F1. Per-fold scores and their
    averages are printed; averages and standard deviations are written into
    the module-level ``res`` table.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place, once per fold.
    model_name : value stored in the 'model-name' column.
    feature_type : value stored in the 'feature-type' column.
    target_type : value stored in the 'target-type' column.
    idx : row label in ``res`` that receives the results.

    Returns
    -------
    None
    """
    pre_scores = []
    rec_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        # kf.split yields integer positions, so select positionally on both
        # X and y. Plain y[train_index] would look the numbers up as index
        # labels, which is only equivalent while y has a default 0..n-1 index.
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        pre_scores.append(precision_score(y_test, pred_values, average='weighted'))
        rec_scores.append(recall_score(y_test, pred_values, average='weighted'))
        f1_scores.append(f1_score(y_test, pred_values, average='weighted'))

    # Average over the folds actually scored rather than the global ``k``,
    # which can drift out of sync with the splitter ``kf``.
    n_folds = len(pre_scores)
    avg_pre_score = sum(pre_scores) / n_folds
    avg_rec_score = sum(rec_scores) / n_folds
    avg_f1_score = sum(f1_scores) / n_folds

    # numpy.std with its defaults, i.e. the population standard deviation.
    std_pre_score = std(pre_scores)
    std_rec_score = std(rec_scores)
    std_f1_score = std(f1_scores)

    print('precision of each fold - {}'.format(pre_scores))
    print('Avg precision : {}'.format(avg_pre_score))

    print('recall of each fold - {}'.format(rec_scores))
    print('Avg recall : {}'.format(avg_rec_score))

    print('f1 of each fold - {}'.format(f1_scores))
    print('Avg f1 : {}'.format(avg_f1_score))

    res.at[idx, 'model-name'] = model_name
    res.at[idx, 'feature-type'] = feature_type
    res.at[idx, 'target-type'] = target_type

    res.at[idx, 'precision'] = avg_pre_score
    res.at[idx, 'precision-std'] = std_pre_score

    res.at[idx, 'recall'] = avg_rec_score
    res.at[idx, 'recall-std'] = std_rec_score

    res.at[idx, 'f1-score'] = avg_f1_score
    res.at[idx, 'f1-score-std'] = std_f1_score

In [27]:
%%time
# Support-vector classifier with an RBF kernel, cross-validated; the scores
# are stored in res under row label len(res).
smodel = SVC(kernel='rbf')
kfold_test(smodel, 'svm', feature_type, target_type, len(res))

precision of each fold - [0.7988278388278388, 0.7686082457084168, 0.7422355741321258, 0.7553868455843, 0.766742496807152]
Avg precision : 0.7663602002119667
recall of each fold - [0.7542857142857143, 0.7714285714285715, 0.7126436781609196, 0.7528735632183908, 0.7701149425287356]
Avg recall : 0.7522692939244663
f1 of each fold - [0.7301557378965332, 0.7557608499501267, 0.681025339796308, 0.7278551933924647, 0.7571658754774093]
Avg f1 : 0.7303925993025684
Wall time: 13.6 s


In [28]:
%%time
# L2-regularised logistic regression (newton-cg solver, C=1.2), cross-validated;
# the scores are stored in res under row label len(res).
# NOTE(review): max_iter=20 is a low iteration cap, and with warnings silenced
# a non-converged fit would go unnoticed -- confirm it converges.
lr = LogisticRegression(random_state=0,
                        penalty='l2',
                        solver='newton-cg',
                        C=1.2,
                        max_iter=20)
kfold_test(lr, 'LogisticRegression', feature_type, target_type, len(res))

precision of each fold - [0.7654639724626617, 0.8081634801288937, 0.7684248784698559, 0.7389854442264097, 0.7882978070377362]
Avg precision : 0.7738671164651114
recall of each fold - [0.7714285714285715, 0.8114285714285714, 0.764367816091954, 0.7528735632183908, 0.7873563218390804]
Avg recall : 0.7774909688013135
f1 of each fold - [0.7623030303030304, 0.808795241455792, 0.751164829301293, 0.7385065750201364, 0.7874525027028336]
Avg f1 : 0.7696444357566171
Wall time: 11.9 s


In [29]:
%%time
# Random forest of 100 trees, cross-validated; the scores are stored in res
# under row label len(res). The forest is randomised, so the seed is fixed to
# make the recorded scores reproducible from one run to the next (the other
# seeded models in this notebook use random_state=0 as well).
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
kfold_test(rf_model, 'RandomForest', feature_type, target_type, len(res))

precision of each fold - [0.7727421635320795, 0.8154682485655936, 0.7455027567517054, 0.7232144611454957, 0.8083945475928818]
Avg precision : 0.7730644355175512
recall of each fold - [0.7657142857142857, 0.8171428571428572, 0.735632183908046, 0.735632183908046, 0.7988505747126436]
Avg recall : 0.7705944170771757
f1 of each fold - [0.7519635673886649, 0.814628307851363, 0.7189947749636605, 0.7157979030198163, 0.7986933849415389]
Avg f1 : 0.7600155876330088
Wall time: 11.4 s


In [30]:
%%time
# k-nearest-neighbours classifier using the 2 nearest samples,
# cross-validated; the scores are stored in res under row label len(res).
knn_model = KNeighborsClassifier(n_neighbors=2)
kfold_test(knn_model, 'knn', feature_type, target_type, len(res))

precision of each fold - [0.7462254148801815, 0.8181684981684982, 0.7113964258989832, 0.7116767947993216, 0.7302854513956281]
Avg precision : 0.7435505170285225
recall of each fold - [0.7428571428571429, 0.8171428571428572, 0.7126436781609196, 0.7241379310344828, 0.7298850574712644]
Avg recall : 0.7453333333333334
f1 of each fold - [0.730923627574025, 0.8037446270543616, 0.6929496110530594, 0.7159738996692365, 0.7211075837992829]
Avg f1 : 0.7329398698299932
Wall time: 1.07 s


In [31]:
%%time
# Gaussian naive Bayes with default settings, cross-validated; the scores are
# stored in res under row label len(res).
nb_model = GaussianNB()
kfold_test(nb_model, 'GaussianNB', feature_type, target_type, len(res))

precision of each fold - [0.7058040894876905, 0.7858906024557273, 0.6884823576583801, 0.7200674877625876, 0.7294074689885433]
Avg precision : 0.7259304012705857
recall of each fold - [0.6514285714285715, 0.7371428571428571, 0.6666666666666666, 0.6781609195402298, 0.6551724137931034]
Avg recall : 0.6777142857142857
f1 of each fold - [0.6714567378681535, 0.7500737978584573, 0.6743278114766799, 0.6912245800176834, 0.6708366897328143]
Avg f1 : 0.6915839233907577
Wall time: 1.42 s


In [32]:
%%time
# Decision tree with a fixed seed, cross-validated; the scores are stored in
# res under row label len(res).
dt_model = DecisionTreeClassifier(random_state=0)
kfold_test(dt_model, 'DecisionTree', feature_type, target_type, len(res))

precision of each fold - [0.7850684985966334, 0.8274489795918368, 0.7691199043335475, 0.7452359900635762, 0.8075428833898285]
Avg precision : 0.7868832511950845
recall of each fold - [0.7942857142857143, 0.8285714285714286, 0.7701149425287356, 0.7586206896551724, 0.7988505747126436]
Avg recall : 0.7900886699507389
f1 of each fold - [0.7752263110327625, 0.8265026716402862, 0.758068188128158, 0.7436688435110791, 0.8009677808121646]
Avg f1 : 0.7808867590248901
Wall time: 3.5 s


In [33]:
%%time
# Gradient boosting: 100 stages of depth-1 trees, learning rate 1.0, fixed
# seed. Cross-validated; the scores are stored in res under row label len(res).
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
kfold_test(gb_model, 'GradientBoosting', feature_type, target_type, len(res))

precision of each fold - [0.759247561093403, 0.8328706248379569, 0.7240591336353053, 0.7311546106645924, 0.8121544742234397]
Avg precision : 0.7718972808909395
recall of each fold - [0.76, 0.8342857142857143, 0.7298850574712644, 0.7471264367816092, 0.8045977011494253]
Avg recall : 0.7751789819376027
f1 of each fold - [0.7479772928493307, 0.8290242674123903, 0.712313766651084, 0.7281316011341241, 0.8057455689499424]
Avg f1 : 0.7646384993993743
Wall time: 1min 29s


In [35]:
import os  # no longer used by this cell; retained in case other cells rely on it

# Persist the results table. to_excel() opens the target path for writing and
# replaces an existing workbook, so the file does not have to be deleted
# first. Deleting it up front would lose the previous results if the write
# then failed, and would raise FileNotFoundError if the file were absent.
res.to_excel('res.xlsx', index=False)