In [1]:
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
feature_type = '2'

#1 -> just sentence
#2 -> all text
#3 -> all text + other columns

In [3]:
target_type = 1

#1 -> all label
#2 -> pos + neg
#3 -> pos + neg(negative+netural)

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [6]:
train.head()

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
0,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,nicotine metabolism,neutral,zero
1,OBJECTIVE: Maternal smoking during pregnancy i...,We examined whether a common genetic variant a...,rs1051730,fetal growth characteristics,positive,weak
2,OBJECTIVE: Maternal smoking during pregnancy i...,RESULTS: Among mothers who did not smoke durin...,rs1051730,fetal growth characteristic,negative,-
3,OBJECTIVE: Maternal smoking during pregnancy i...,Among mothers who continued smoking during pre...,rs1051730,head circumference,negative,-
4,OBJECTIVE: Maternal smoking during pregnancy i...,The T-allele of maternal rs1051730 was associa...,rs1051730,second and third trimester fetal femur length,positive,weak


In [7]:
# dic = {"neutral": "negative"}

# train.replace({"ASSOCIATION": dic}, inplace=True)
# test.replace({"ASSOCIATION": dic}, inplace=True)

In [8]:
train['label'] = train['CONFIDENCE']
test['label']  = test['CONFIDENCE']

In [9]:
train.dropna(subset=['text', 'text'], inplace=True)
test.dropna(subset=['text', 'text'], inplace=True)

In [10]:
res = pd.read_excel('res.xlsx')

In [11]:
def vectorize_bow(df_X, cv):
    bow_X = cv.transform(df_X)
    bow_df = pd.DataFrame(data=bow_X.toarray(),
                          columns=[cv.get_feature_names_out()])
    return bow_df

def define_and_run_model(model, X_train, X_test, y_train):
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

def show_result(y_pred, y_test):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=3))
    print(accuracy_score(y_test, y_pred))
    
def save_res(model_name, feature_type, target_type, y_pred_sr, y_test_sr, idx):
    res_pre = precision_score(y_test_sr, y_pred_sr, average='weighted')
    res_rec = recall_score(y_test_sr, y_pred_sr, average='weighted')
    res_f1 = f1_score(y_test_sr, y_pred_sr, average='weighted')
    res.at[idx, 'model-name'] = model_name
    res.at[idx, 'feature-type'] = feature_type
    res.at[idx, 'target-type'] = target_type
    res.at[idx, 'accuracy'] = accuracy_score(y_test_sr, y_pred_sr)
    res.at[idx, 'precision'] = res_pre
    res.at[idx, 'recall'] = res_rec
    res.at[idx, 'f1-score'] = res_f1

In [12]:
possible_labels = train.label.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

train['label'] = train.label.replace(label_dict)
test['label']  = test.label.replace(label_dict)

In [13]:
df = pd.concat([train, test])

In [14]:
bow_vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 1), max_df=.80, min_df=1)
bow_vectorizer.fit(df['text'])

CountVectorizer(lowercase=False, max_df=0.8)

In [15]:
train_bow_df  = vectorize_bow(list(train['text']), bow_vectorizer)
test_bow_df   = vectorize_bow(list(test['text']), bow_vectorizer)

In [16]:
X_train = train_bow_df
y_train = train['label']

X_test = test_bow_df
y_test = test['label']

In [17]:
X_train.shape

(934, 7335)

In [18]:
y_train.shape

(934,)

In [19]:
X_test.shape

(365, 7335)

In [20]:
y_test.shape

(365,)

In [21]:
train['ASSOCIATION'].value_counts()

positive    701
neutral     142
negative     91
Name: ASSOCIATION, dtype: int64

In [22]:
test['ASSOCIATION'].value_counts()

positive    170
neutral     166
negative     29
Name: ASSOCIATION, dtype: int64

In [23]:
%%time
smodel = SVC(kernel='rbf')
smodel.fit(X_train, y_train)
pred_values_svm = smodel.predict(X_test)
show_result(pred_values_svm, y_test)
save_res('svm', feature_type, target_type, pred_values_svm, y_test, len(res))

[[  0 166   0]
 [  2 168   0]
 [  0  28   1]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       166
           1      0.464     0.988     0.632       170
           2      1.000     0.034     0.067        29

    accuracy                          0.463       365
   macro avg      0.488     0.341     0.233       365
weighted avg      0.296     0.463     0.299       365

0.46301369863013697
Wall time: 5.55 s


In [24]:
%%time
lr = LogisticRegression(random_state=0,
                        penalty='l2',
                        solver='newton-cg',
                        C=1.2,
                        max_iter=20)
lr.fit(X_train, y_train)
pred_values_lr = lr.predict(X_test)
show_result(pred_values_lr, y_test)
save_res('LogisticRegression', feature_type, target_type, pred_values_lr, y_test, len(res))

[[  4 158   4]
 [  9 156   5]
 [  0  19  10]]
              precision    recall  f1-score   support

           0      0.308     0.024     0.045       166
           1      0.468     0.918     0.620       170
           2      0.526     0.345     0.417        29

    accuracy                          0.466       365
   macro avg      0.434     0.429     0.361       365
weighted avg      0.400     0.466     0.342       365

0.4657534246575342
Wall time: 3.94 s


In [25]:
%%time
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
pred_values_rf = model.predict(X_test)
show_result(pred_values_rf, y_test)
save_res('RandomForest', feature_type, target_type, pred_values_rf, y_test, len(res))

[[  0 162   4]
 [  2 168   0]
 [  0  28   1]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       166
           1      0.469     0.988     0.636       170
           2      0.200     0.034     0.059        29

    accuracy                          0.463       365
   macro avg      0.223     0.341     0.232       365
weighted avg      0.234     0.463     0.301       365

0.46301369863013697
Wall time: 4.62 s


In [26]:
%%time
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)
pred_values_knn = model.predict(X_test)
show_result(pred_values_knn, y_test)
save_res('knn', feature_type, target_type, pred_values_knn, y_test, len(res))

[[ 29 133   4]
 [ 19 149   2]
 [  0  28   1]]
              precision    recall  f1-score   support

           0      0.604     0.175     0.271       166
           1      0.481     0.876     0.621       170
           2      0.143     0.034     0.056        29

    accuracy                          0.490       365
   macro avg      0.409     0.362     0.316       365
weighted avg      0.510     0.490     0.417       365

0.4904109589041096
Wall time: 379 ms


In [27]:
%%time
model = GaussianNB()
model.fit(X_train, y_train)
pred_values_nb = model.predict(X_test)
show_result(pred_values_nb, y_test)
save_res('GaussianNB', feature_type, target_type, pred_values_nb, y_test, len(res))

[[  0 153  13]
 [  3 167   0]
 [  0  28   1]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       166
           1      0.480     0.982     0.645       170
           2      0.071     0.034     0.047        29

    accuracy                          0.460       365
   macro avg      0.184     0.339     0.230       365
weighted avg      0.229     0.460     0.304       365

0.4602739726027397
Wall time: 407 ms


In [28]:
%%time
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
pred_values_dt = model.predict(X_test)
show_result(pred_values_dt, y_test)
save_res('DecisionTree', feature_type, target_type, pred_values_dt, y_test, len(res))

[[  1 156   9]
 [ 12 157   1]
 [  2  21   6]]
              precision    recall  f1-score   support

           0      0.067     0.006     0.011       166
           1      0.470     0.924     0.623       170
           2      0.375     0.207     0.267        29

    accuracy                          0.449       365
   macro avg      0.304     0.379     0.300       365
weighted avg      0.279     0.449     0.316       365

0.44931506849315067
Wall time: 2.19 s


In [29]:
%%time
model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
model.fit(X_train, y_train)
pred_values_gb = model.predict(X_test)
show_result(pred_values_gb, y_test)
save_res('GradientBoosting', feature_type, target_type, pred_values_gb, y_test, len(res))

[[  0 159   7]
 [ 27 136   7]
 [  2  20   7]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       166
           1      0.432     0.800     0.561       170
           2      0.333     0.241     0.280        29

    accuracy                          0.392       365
   macro avg      0.255     0.347     0.280       365
weighted avg      0.228     0.392     0.283       365

0.3917808219178082
Wall time: 41.5 s


In [30]:
import os
os.remove("res.xlsx") 

res.to_excel('res.xlsx', index=False)