In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install ftfy
# Pinned to the same release as the en_core_sci_sm model archive installed below,
# so the library and the model agree on their spaCy requirement.
%pip install scispacy==0.5.1
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from ftfy import fix_encoding
# Fetch the NLTK resources needed at runtime: the stop-word corpus read into
# `stop_words` below, and the Punkt tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')     # tokenizer data; original note: toolkit for textblob.TextBlob.words
import re
from tqdm import tqdm
from transformers import AutoTokenizer
import spacy

from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker
from textblob import TextBlob
import regex
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer     # stemmer: reduces inflected forms such as attacker, attacked, attacking -> attack
# English stop-word list, used to filter tokens during text preprocessing.
stop_words = stopwords.words('english')

In [None]:
# Output location prefix: it is concatenated with a file name when the
# preprocessed train/valid/test CSVs are written.
work_path = "./"

In [None]:
# Patterns used by preprocess_text, listed in the order they are applied.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r"http\S+")
_HTML_NUMERIC_ENTITY_RE = re.compile(r'&#.*?;')
# ASCII punctuation except the backslash (handled separately with str.replace).
_PUNCTUATION_RE = re.compile(r'([!"#$%&\'()*+,\-./:;<=>?@\[\]^_`{|}~])')
_MULTI_SPACE_RE = re.compile(r'\s{2,}')
_LINE_BREAK_RE = re.compile(r'[\r\n]+')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_DETACHED_POSSESSIVE_RE = re.compile(r"\s's\b")


def preprocess_text(text):
    """Normalise a free-text field (title or abstract) into a cleaned, space-joined token string.

    Steps, in order:
      1. convert to ``str``, strip surrounding whitespace and lower-case;
      2. remove ``<...>`` tags and ``http...`` URLs, replace ``&#...;`` entities by a space;
      3. surround every ASCII punctuation character (and backslash) with spaces;
      4. collapse whitespace runs, turn line breaks into spaces, and replace
         non-ASCII runs by a space;
      5. tokenize with ``word_tokenize`` and drop tokens found in the module-level
         ``stop_words`` list, except tokens containing a digit, which are always kept;
      6. join the tokens with single spaces.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN (pandas' missing-value marker) are treated
            as missing text.

    Returns:
        str: The cleaned text. Missing input yields ``""`` rather than the
        literal token ``"none"``/``"nan"``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    cleaned = str(text).strip().lower()
    cleaned = _HTML_TAG_RE.sub('', cleaned)
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = _HTML_NUMERIC_ENTITY_RE.sub(' ', cleaned)
    cleaned = _PUNCTUATION_RE.sub(r' \1 ', cleaned)
    cleaned = cleaned.replace('\\', ' \\ ')
    cleaned = _MULTI_SPACE_RE.sub(' ', cleaned)
    cleaned = _LINE_BREAK_RE.sub(' ', cleaned)
    cleaned = _NON_ASCII_RE.sub(' ', cleaned)

    # Read the global list at call time (so later edits to it are honoured),
    # but test membership against a set instead of scanning the list per token.
    stop_set = set(stop_words)
    kept_tokens = [
        token
        for token in word_tokenize(cleaned)
        if any(char.isdigit() for char in token) or token not in stop_set
    ]

    # Re-attach a possessive 's that is separated from its noun by whitespace.
    # NOTE(review): apostrophes were already space-padded by the punctuation
    # step above, so this pattern may seldom match -- confirm the intent.
    return _DETACHED_POSSESSIVE_RE.sub("'s", " ".join(kept_tokens))

# Patterns applied to every individual keyword by preprocess_keywords.
_KEYWORD_LINE_BREAK_RE = re.compile(r'[\r\n]+')
_KEYWORD_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_KEYWORD_DETACHED_POSSESSIVE_RE = re.compile(r"\s's\b")


def preprocess_keywords(keywords, min_length=4):
    """Clean a comma-separated keyword string and return it comma-separated again.

    The input is passed through ``fix_encoding``, split on ``","``, and each
    keyword is cleaned on its own: line breaks and non-ASCII runs become a
    space, a whitespace-detached ``'s`` is re-attached, and surrounding
    whitespace is stripped. Keywords shorter than ``min_length`` characters
    (including empty ones) are dropped.

    The previous version ran the regex substitutions on the *list* returned by
    ``split``, which raises ``TypeError`` for every input.

    Args:
        keywords: Comma-separated keywords. Non-string values are converted
            with ``str()``; ``None`` and float NaN are treated as missing.
        min_length: Minimum number of characters a cleaned keyword must have
            to be kept. NOTE(review): the original filter compared the length
            of the whole keyword list with 3; a per-keyword length check is
            assumed to be what was intended -- confirm. Pass ``1`` to keep
            every non-empty keyword.

    Returns:
        str: The kept keywords joined with ``","`` (``""`` when the input is
        missing or nothing is kept). NOTE(review): a string return is assumed
        because the result is stored in a DataFrame column that is written to
        CSV -- confirm against downstream consumers.
    """
    # NaN is the only float that is not equal to itself.
    if keywords is None or (isinstance(keywords, float) and keywords != keywords):
        return ""

    cleaned_keywords = []
    for raw_keyword in fix_encoding(str(keywords)).split(","):
        keyword = _KEYWORD_LINE_BREAK_RE.sub(' ', raw_keyword)
        keyword = _KEYWORD_NON_ASCII_RE.sub(' ', keyword)
        keyword = _KEYWORD_DETACHED_POSSESSIVE_RE.sub("'s", keyword)
        keyword = keyword.strip()
        if len(keyword) >= min_length:
            cleaned_keywords.append(keyword)
    return ",".join(cleaned_keywords)

# Read the three splits (train, then test, then validation) as ISO-8859-1 text.
data_train, data_test, data_valid = (
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in (
        "/kaggle/input/originnal-dataset/data_splited_train.csv",
        "/kaggle/input/originnal-dataset/data_origin_test.csv",
        "/kaggle/input/originnal-dataset/data_splited_validate.csv",
    )
)

# Only the training split is filtered: rows without a title or abstract go first.
data_train = data_train.dropna(subset=['title', 'abstract'])
# Then drop rows whose abstract, title or keywords field is purely numeric.
# The comparison with False is kept on purpose: a missing value yields NaN from
# isnumeric(), NaN == False is False, so such rows are dropped as well.
for text_column in ('abstract', 'title', 'keywords'):
    data_train = data_train[data_train[text_column].str.isnumeric() == False]
data_train = data_train.reset_index(drop=True)

In [None]:
# Register `progress_apply` on pandas objects so every pass shows a progress bar.
tqdm.pandas()

# Run the same five passes on each split, in the order train -> valid -> test.
for split_frame in (data_train, data_valid, data_test):
    for text_column in ('title', 'abstract'):
        split_frame[text_column] = split_frame[text_column].progress_apply(preprocess_text)
    split_frame['keywords'] = split_frame['keywords'].progress_apply(preprocess_keywords)
    # NOTE(review): preprocess_text has already replaced non-ASCII runs with a
    # space, so this encoding repair may have little left to fix -- confirm
    # whether it was meant to run before the cleaning step.
    for text_column in ('title', 'abstract'):
        split_frame[text_column] = split_frame[text_column].progress_apply(fix_encoding)

In [None]:
# Write each preprocessed split under `work_path`, without the index column.
for split_frame, file_name in (
    (data_train, "news_preprocessed_train.csv"),
    (data_valid, "news_preprocessed_valid.csv"),
    (data_test, "news_preprocessed_test.csv"),
):
    split_frame.to_csv(work_path + file_name, index=False)