# Loading all abstracts as text for further processing for models

 **This notebook can be duplicated and used as a starting point for loading the relevant cell biology dataset.**

In [2]:
# Import required libraries

import pandas as pd
import csv

from transformers import pipeline
import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

2024-06-01 15:42:07.549438: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load & clean non retracted papers

**Reading the csv file with the text of non-retracted papers into a dataframe**

In [3]:
# Location of the CSV with the non-retracted abstracts, relative to the
# notebook's working directory
file_path = '../data/cellbio_abstracts/cellbio_abstracts_non-retracted_text.csv'

# Parse the file into a DataFrame; fields are separated by '|' instead of ','
abstract_nonretracted_df = pd.read_csv(file_path, sep='|')

# Uncomment to inspect the loaded frame
#abstract_nonretracted_df.info()
#abstract_nonretracted_df.head()


**Removing all rows where retract% or withdraw% appear in the abstract text**

In [4]:
# Collect, for inspection, the rows of the non-retracted set whose
# 'ret_flag' column is True
ret_flag_nonretracted_df = abstract_nonretracted_df.loc[
    abstract_nonretracted_df['ret_flag'].eq(True)
]

# Uncomment to inspect the flagged rows
#ret_flag_nonretracted_df.info()

In [5]:
# Boolean mask that is True for every row whose 'ret_flag' is not True
nonretracted_keep_mask = abstract_nonretracted_df['ret_flag'].ne(True)

# Keep only those rows; the source DataFrame itself is left untouched
cleaned_nonretracted_df = abstract_nonretracted_df[nonretracted_keep_mask]

# Uncomment to verify that the flagged rows are gone
#cleaned_nonretracted_df.info()
#cleaned_nonretracted_df

# Load & clean retracted papers 

**Reading the CSV file with the text of retracted papers into a dataframe**

In [6]:
# Location of the CSV with the retracted abstracts, relative to the
# notebook's working directory
file_path = '../data/cellbio_abstracts/cellbio_abstracts_retracted_text.csv'

# Parse the file into a DataFrame; fields are separated by '|' instead of ','
abstract_retracted_df = pd.read_csv(file_path, sep='|')

# Uncomment to inspect the loaded frame
#abstract_retracted_df.info()
#abstract_retracted_df.head()


**Removing all rows where retract% or withdraw% appear in the abstract text**

In [7]:
# Collect, for inspection, the rows of the retracted set whose
# 'ret_flag' column is True
ret_flag_retracted_df = abstract_retracted_df.loc[
    abstract_retracted_df['ret_flag'].eq(True)
]

# Uncomment to inspect the flagged rows
#ret_flag_retracted_df.info()

In [8]:
# Boolean mask that is True for every row whose 'ret_flag' is not True
retracted_keep_mask = abstract_retracted_df['ret_flag'].ne(True)

# Keep only those rows; the source DataFrame itself is left untouched
cleaned_retracted_df = abstract_retracted_df[retracted_keep_mask]

# Uncomment to verify that the flagged rows are gone
#cleaned_retracted_df.info()
#cleaned_retracted_df

# Dataframe with just abstract_text and target for both retracted and non-retracted papers


Create one DataFrame for each target group

In [9]:

# Columns carried forward: the abstract text and its target label
model_columns = ['abstract_text', 'target']

# Retracted abstracts: narrow to those columns, then wrap in a new DataFrame
retracted_subset = cleaned_retracted_df[model_columns]
retracted_df = pd.DataFrame(retracted_subset)

# Non-retracted abstracts: same treatment
nonretracted_subset = cleaned_nonretracted_df[model_columns]
nonretracted_df = pd.DataFrame(nonretracted_subset)

# Uncomment to inspect either frame
#retracted_df
#nonretracted_df

# Combine both target groups into one dataframe for dataset model processing

In [10]:

# Fixed seed so the shuffled row order is identical on every
# Restart Kernel -> Run All (an unseeded sample() reorders rows differently
# each run)
SHUFFLE_SEED = 42

# Build the combined dataset in one chain, without mutating any intermediate:
#   1. stack the retracted and non-retracted frames, renumbering rows from 0
#   2. shuffle all rows (frac=1 draws every row exactly once)
#   3. drop the pre-shuffle row labels
#   4. make sure 'target' is an integer column
abstracts_df = (
    pd.concat([retracted_df, nonretracted_df], ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
    .astype({'target': int})
)

# Display the combined dataframe
abstracts_df


Unnamed: 0,abstract_text,target
0,"""Recent studies have demonstrated that the cel...",1
1,"""Pulmonary senescence and fibrosis occur with ...",1
2,"""Therapeutic effects of quercetin-loaded phyto...",1
3,"""Zinc oxide nanoparticles (ZnO NP) may be pres...",1
4,"""This study aims to elucidate the mechanisms b...",0
...,...,...
10467,"""Interleukin-1 (IL-1) is important for the pat...",1
10468,"""Abstract: This study aimed to investigate the...",0
10469,"""Telomerase reverse transcriptase (TERT) is ti...",1
10470,"""The aim of this study was to explore the effe...",0


In [11]:
# Report how many rows the combined dataset contains
row_count = len(abstracts_df)
print(f'There are {row_count} rows in the dataset')

There are 10472 rows in the dataset
