In [1]:
%%capture
!pip install nlpretext
!pip install keytotext --upgrade
!sudo apt-get install git-lfs
!pip install keybert

In [2]:
# Mount Google Drive at /content/drive (Colab only); the project paths in `config`
# live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
from pathlib import Path
import pandas as pd
import zipfile
import os
from tqdm.notebook import tqdm
tqdm.pandas()

from nlpretext import Preprocessor
from nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji
from nlpretext.basic.preprocess import (normalize_whitespace, 
                                        remove_eol_characters, 
                                        replace_urls, 
                                        replace_emails, 
                                        remove_punct, 
                                        remove_accents,
                                        fix_bad_unicode)

# from keybert import KeyBERT
import torch
from keytotext import trainer, pipeline

INFO:lightning_fabric.utilities.seed:Global seed set to 42


In [3]:
class config:
    """Central settings for the notebook: Drive locations and training hyper-parameters."""

    # --- locations on the mounted Drive ---
    BASE_DIR = Path('/content/drive/MyDrive/KeyLab_shared')
    DATA_DIR = BASE_DIR.joinpath('data')
    MODEL_DIR = BASE_DIR.joinpath('Models')

    # --- model / data ---
    CHECK_POINT = 't5-base'  # checkpoint name handed to trainer.from_pretrained
    STYLE = 'LOTR'           # data sub-folder name and style tag put in front of each text

    # --- split / training ---
    TEST_SIZE = 0.1          # fraction of rows sampled into the test set
    SEED = 200               # random_state of the train/test split
    BATCH_SIZE = 2
    EPOCHS = 3
    GPU = True

In [4]:
# NOTE(review): the recorded run of this cell ended in a KeyError. Presumably
# keytotext's pipeline() expects the name of a published model rather than a local
# directory — confirm. The trainer()/from_pretrained cells at the end of the notebook
# load the saved model from this same path.
pipeline(config.MODEL_DIR / 'keytotext')

KeyError: ignored

In [None]:
# Unpack books.zip into the style-specific data folder, creating the folder if needed.
(config.DATA_DIR / config.STYLE).mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(config.DATA_DIR / 'books.zip', 'r') as archive:
    archive.extractall(str(config.DATA_DIR / config.STYLE))

In [None]:
def get_text(path, style, n=15):
    """Read a text file and return style-tagged passages built from groups of ``n`` lines.

    Every group of ``n`` consecutive lines (each stripped of surrounding whitespace) is
    joined with single spaces. From each group only the part between its first and its
    last '.' is kept, so a group containing fewer than two periods yields an empty
    passage. ``<|style|>`` is then put in front of every passage.

    Args:
        path: Location of the text file; it is read as UTF-8.
        style: Style name embedded in the ``<|style|>`` prefix.
        n: Number of file lines merged into one passage.

    Returns:
        pandas.DataFrame with a single ``text`` column, one row per group of lines.
    """
    # Explicit encoding so the result does not depend on the platform's default locale.
    with open(path, encoding='utf-8') as f:
        lines = [line.strip() for line in f]

    passages = []
    for start in range(0, len(lines), n):
        chunk = ' '.join(lines[start:start + n])
        # Drop everything up to the first period and from the last period onward.
        body = '.'.join(chunk.split('.')[1:-1])
        passages.append(f'<|{style}|>' + body)

    # Built in one step; avoids DataFrame.applymap, which newer pandas deprecates.
    return pd.DataFrame({'text': passages})

In [None]:
# read all data
# List the folder once and sort it: directory listing order is arbitrary, and the row
# order of `df` feeds the seeded train/test split further down.
style_dir = config.DATA_DIR / config.STYLE
book_paths = sorted(style_dir.iterdir())

# Collect the per-file frames and concatenate once instead of growing `df` inside the loop.
frames = [get_text(book_path, config.STYLE) for book_path in tqdm(book_paths)]
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=['text'])

  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Sanity check: overall shape plus two randomly chosen rows (unseeded, so the rows
# shown differ between runs).
print(df.shape)
df.sample(2)

(6332, 1)


Unnamed: 0,text
5259,<|LOTR|> `Be back soon.' 'Come back now!' shou...
2302,<|LOTR|> ‘And it would seem like wisdom but fo...


In [None]:
class DataCleaning():
    """Filter and normalise the ``text`` column of a DataFrame of passages.

    Typical use: ``DataCleaning(df).preprocess_df()``, which drops short texts and runs
    the remaining ones through an nlpretext preprocessing pipeline.
    """

    # Number of leading characters that preprocess_text leaves untouched.
    # NOTE(review): presumably chosen to keep the '<|STYLE|>' tag away from punctuation
    # removal; the tag is shorter than 20 characters for 'LOTR' — confirm the value.
    PROTECTED_PREFIX_LEN = 20

    def __init__(self, df):
        """Store the DataFrame to clean; it must contain a ``text`` column of strings."""
        self.df = df

    def remove_short_texts(self, min_length=10):
        """Return the rows whose text has more than ``min_length`` space-separated tokens."""
        # 1-D boolean mask built without DataFrame.applymap (deprecated in newer pandas).
        keep = pd.Series(
            [len(text.split(' ')) > min_length for text in self.df['text']],
            dtype=bool,
        ).to_numpy()
        return self.df[keep]

    def _build_preprocessor(self):
        """Assemble the nlpretext pipeline applied to the part after the protected prefix."""
        # NOTE(review): built anew for every text, as before; it could likely be created
        # once and reused if Preprocessor keeps no per-run state — confirm before hoisting.
        preps = Preprocessor()
        preps.pipe(normalize_whitespace)
        preps.pipe(remove_eol_characters)
        preps.pipe(replace_urls, args={'replace_with': ''})
        preps.pipe(replace_emails, args={'replace_with': ''})
        # Same characters as before (- _ / | backslash *); the backslash is now escaped
        # explicitly because '\*' is an invalid escape sequence.
        preps.pipe(remove_punct, args={'marks': '-_/|\\*'})
        preps.pipe(remove_accents)
        preps.pipe(fix_bad_unicode)
        preps.pipe(remove_mentions)
        preps.pipe(remove_hashtag)
        preps.pipe(remove_emoji)
        return preps

    def preprocess_text(self, text):
        """Clean ``text`` while keeping its first ``PROTECTED_PREFIX_LEN`` characters as-is.

        Texts no longer than the protected prefix are returned unchanged (previously they
        raised IndexError). If the remainder starts with a space, a single space is put
        back between the prefix and the cleaned remainder.
        """
        cut = self.PROTECTED_PREFIX_LEN
        head, tail = text[:cut], text[cut:]
        if not tail:
            return text

        cleaned = self._build_preprocessor().run(tail)
        if tail[0] == ' ':
            return head + ' ' + cleaned
        return head + cleaned

    def preprocess_df(self):
        """Drop short texts, clean the remaining ones and return the result as a new frame."""
        cleaned = self.remove_short_texts().copy()
        # progress_apply is registered by tqdm.pandas(), which must have been called already.
        cleaned['text'] = cleaned['text'].progress_apply(self.preprocess_text)
        return cleaned

In [None]:
%%time
# Clean the passages. `df` is rebound to the filtered, preprocessed frame, so the
# unfiltered frame is no longer available under this name afterwards.
dc = DataCleaning(df)
df = dc.preprocess_df()

  0%|          | 0/6081 [00:00<?, ?it/s]

CPU times: user 44.6 s, sys: 329 ms, total: 44.9 s
Wall time: 45.1 s


In [None]:
# The KeyBERT import in the imports cell is commented out, so import it here;
# without it this cell raises NameError on a fresh kernel.
from keybert import KeyBERT

kbert = KeyBERT()
def extract_keywords(text):
    """Return one keyphrase per sentence of ``text``, joined by single spaces.

    ``text`` is split on '.', and every piece except the first is passed to the
    module-level ``kbert`` model, asking for its single best keyphrase of one to four
    words. Pieces for which the model returns no candidate contribute nothing.

    Args:
        text: The passage to extract keyphrases from.

    Returns:
        str: The selected keyphrases in sentence order, separated by spaces
        (an empty string when nothing was found).
    """
    keyphrases = []
    # The first piece is skipped, as in the original index check.
    # NOTE(review): presumably because it carries the '<|STYLE|>' tag — confirm.
    for sent in text.split('.')[1:]:
        result = kbert.extract_keywords(sent,
                                        keyphrase_ngram_range=(1, 4),
                                        use_mmr=True,
                                        diversity=0.2,
                                        top_n=1)
        # Explicit empty-result check instead of a bare except that hid every error.
        if result:
            keyphrases.append(result[0][0])

    return ' '.join(keyphrases)

In [None]:
# Add a `keywords` column holding the space-joined keyphrases extracted from each text.
df['keywords'] = df['text'].apply(extract_keywords)

In [None]:
# Hold out a seeded random fraction of the rows for testing; all other rows are for training.
test_df = df.sample(frac=config.TEST_SIZE, random_state=config.SEED)
train_df = df.loc[~df.index.isin(test_df.index)]
print(f'Train Size: {train_df.shape}')
print(f'Test Size: {test_df.shape}')

Train Size: (5473, 2)
Test Size: (608, 2)


In [None]:
# Fine-tune the configured checkpoint on train_df/test_df and save the result to Drive.
torch.cuda.empty_cache()

model = trainer()
model.from_pretrained(model_name=config.CHECK_POINT)

# Training options gathered in one place, all taken from `config`.
train_options = dict(
    train_df=train_df,
    test_df=test_df,
    batch_size=config.BATCH_SIZE,
    max_epochs=config.EPOCHS,
    use_gpu=config.GPU,
)
model.train(**train_options)

model.save_model(config.MODEL_DIR / 'keytotext')

Validating: 0it [00:00, ?it/s]

In [8]:
# Example keywords passed to model.predict in the next cell.
x=['sad', 'hero', 'world', 'time']

In [9]:
# Generate text from the example keywords, with use_gpu=False for this call.
model.predict(x, use_gpu=False)

"|LOTR|> ‘It is sad,' said Aragorn. 'But I am not the hero of the world in this time, and I do not wish to be forgotten"

In [6]:
# Create a fresh trainer object; this rebinds any `model` already present in the kernel.
model = trainer()

In [7]:
# Load from the same Drive directory that the training cell passes to save_model.
model.from_pretrained(config.MODEL_DIR / 'keytotext')