In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Step 1: Load and Inspect the Data
file_path = 'rct_data.txt'
data = pd.read_csv(file_path, delimiter='\t', header=None, names=['ID', 'Label', 'Year', 'Title', 'Abstract'])

In [6]:
data.head()

Unnamed: 0,ID,Label,Year,Title,Abstract
0,18439781,0,2011,Two patients subdued with a TASER® device: cas...,"In the United States, an increasing number of ..."
1,18468833,0,2011,A case of Takayasu arteritis causing subclavia...,The American Heart Association website defines...
2,18481181,0,2012,Pathophysiology of hypopituitarism in the sett...,The complex pathophysiology of traumatic brain...
3,18728056,1,2011,"The cardiovascular risk factor, soluble CD40 l...",[BACKGROUND] Soluble CD40 ligand (sCD40L) is a...
4,18790590,0,2011,Horner syndrome due to carotid dissection.,[BACKGROUND] Internal carotid artery dissectio...


In [None]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Step 2: Preprocess the Data
data.dropna(subset=['Abstract'], inplace=True)

def preprocess_text(comment):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Lowercase
    comment = comment.lower()
    # Remove special characters and URLs
    comment = re.sub(r"http\S+|www\S+|https\S+", '', comment, flags=re.MULTILINE)
    comment = re.sub(r'\@\w+|\#','', comment)
    # Tokenization
    tokens = word_tokenize(comment)
    # Lemmatization and stop word removal
    comment = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    return comment

data['Abstract'] = data['Abstract'].apply(preprocess_text)
X = data['Abstract']
y = data['Label']

In [None]:
# Step 3: Split the Data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Step 4: Vectorize the Text Data using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Step 5: Train and Evaluate Multiple Models

# Model 1: Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)
val_predictions_log_reg = log_reg.predict(X_val_tfidf)
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, val_predictions_log_reg))
print("Logistic Regression Validation Report:")
print(classification_report(y_val, val_predictions_log_reg))

Logistic Regression Validation Accuracy: 0.9342583732057417
Logistic Regression Validation Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     33071
           1       0.89      0.78      0.83      8729

    accuracy                           0.93     41800
   macro avg       0.92      0.88      0.90     41800
weighted avg       0.93      0.93      0.93     41800



In [None]:
# Model 2: Support Vector Machine (SVM)
svm = SVC()
svm.fit(X_train_tfidf, y_train)
val_predictions_svm = svm.predict(X_val_tfidf)
print("SVM Validation Accuracy:", accuracy_score(y_val, val_predictions_svm))
print("SVM Validation Report:")
print(classification_report(y_val, val_predictions_svm))

In [None]:
# Model 3: Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
val_predictions_rf = rf.predict(X_val_tfidf)
print("Random Forest Validation Accuracy:", accuracy_score(y_val, val_predictions_rf))
print("Random Forest Validation Report:")
print(classification_report(y_val, val_predictions_rf))

Random Forest Validation Accuracy: 0.8897208985704561
Random Forest Validation Report:
              precision    recall  f1-score   support

           0       0.87      0.99      0.93      1087
           1       0.97      0.59      0.74       382

    accuracy                           0.89      1469
   macro avg       0.92      0.79      0.83      1469
weighted avg       0.90      0.89      0.88      1469



In [None]:
# Step 6: Compare the Models on the Test Set

# Logistic Regression Test Evaluation
test_predictions_log_reg = log_reg.predict(X_test_tfidf)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, test_predictions_log_reg))
print("Logistic Regression Test Report:")
print(classification_report(y_test, test_predictions_log_reg))

Logistic Regression Test Accuracy: 0.8924438393464942
Logistic Regression Test Report:
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      1126
           1       0.90      0.61      0.72       343

    accuracy                           0.89      1469
   macro avg       0.90      0.79      0.83      1469
weighted avg       0.89      0.89      0.88      1469



In [None]:
# SVM Test Evaluation
test_predictions_svm = svm.predict(X_test_tfidf)
print("SVM Test Accuracy:", accuracy_score(y_test, test_predictions_svm))
print("SVM Test Report:")
print(classification_report(y_test, test_predictions_svm))

SVM Test Accuracy: 0.9033356024506467
SVM Test Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      1126
           1       0.91      0.65      0.76       343

    accuracy                           0.90      1469
   macro avg       0.90      0.82      0.85      1469
weighted avg       0.90      0.90      0.90      1469



In [None]:
# Random Forest Test Evaluation
test_predictions_rf = rf.predict(X_test_tfidf)
print("Random Forest Test Accuracy:", accuracy_score(y_test, test_predictions_rf))
print("Random Forest Test Report:")
print(classification_report(y_test, test_predictions_rf))

Random Forest Test Accuracy: 0.9081007488087134
Random Forest Test Report:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1126
           1       0.96      0.63      0.76       343

    accuracy                           0.91      1469
   macro avg       0.93      0.81      0.85      1469
weighted avg       0.91      0.91      0.90      1469

