In [1]:
%load_ext autoreload
%autoreload 2

import bz2
import pickle
import sys
from os import pipe
from pathlib import Path
from typing import Tuple

import numpy as np  # noqa: F401
import pandas as pd  # noqa: F401
import stanza
from stanza_batch import batch
from tqdm.notebook import tqdm 
import toma

from hnlp_proj.delta import DeltaTransformer, create_feature_matrix  # noqa: F401, F403
from hnlp_proj.loader import (
    BEN_YEHUDA_STANZA_PICKLE,  # noqa: F401, F403
    YNET_STANZA_PICKLE,
    load_ben_yehuda,
    load_debug,
    load_eng_test,
    load_ynet,
)
from hnlp_proj.processing import (
    Processing,  # noqa: F401, F403
    get_stanza_pipeline,
    process_data,
)
from hnlp_proj.utils import *  # noqa: F401, F403
from YAP_Wrapper.yap_wrapper.hebtokenizer import num

In [2]:
def load_dataset(ds_type: str) -> Tuple[pd.DataFrame, Path]:
    """Load one of the supported corpora together with its stanza pickle path.

    Args:
        ds_type: Name of the corpus, either ``"ynet"`` or ``"ben_yehuda"``.

    Returns:
        A pair made of the value returned by the corpus loader
        (``load_ynet`` / ``load_ben_yehuda``) and the matching
        ``*_STANZA_PICKLE`` path constant.

    Raises:
        ValueError: If ``ds_type`` is not one of the supported names.
    """
    if ds_type == "ynet":
        dataset, stanza_pickle = load_ynet(), YNET_STANZA_PICKLE
    elif ds_type == "ben_yehuda":
        dataset, stanza_pickle = load_ben_yehuda(), BEN_YEHUDA_STANZA_PICKLE
    else:
        raise ValueError(f"Invalid ds_type '{ds_type}'")
    return dataset, stanza_pickle


# Load the corpus and find out where its stanza output is to be pickled.
df, pickle_path = load_dataset("ben_yehuda")

# Never clobber an existing pickle: only prepare the output directory when the
# target file is absent, otherwise stop and ask for the old file to be renamed.
if not pickle_path.exists():
    pickle_path.parent.mkdir(parents=True, exist_ok=True)
else:
    raise ValueError(
        f"There is already a pickle file at {pickle_path}, please rename it to proceed"
    )


In [3]:
pipeline = get_stanza_pipeline(Processing.StanzaLemma, use_gpu=True)
docs = []


def tomaFun(batch_size: int, nlp: stanza.Pipeline, data: pd.Series, out=None):
    """Run ``nlp`` over every text in ``data``, ``batch_size`` texts at a time.

    This is deliberately a regular function rather than a generator: the work
    is done eagerly, so an exception raised while processing surfaces inside
    the call made by ``toma.simple.batch``.  A generator would be handed back
    to the caller unstarted and would only fail later, during iteration, where
    toma can no longer react by retrying with a smaller batch size
    (see the toma documentation for its retry behaviour).

    Args:
        batch_size: Number of texts handed to stanza per batch.
        nlp: The stanza pipeline used to process the texts.
        data: The texts to process.
        out: Optional list that receives the processed documents.  It is
            emptied at the start of every call because each attempt starts
            again from the first text; passing it in lets the caller keep the
            partial results of the last attempt if processing fails.

    Returns:
        The list of processed documents (``out`` itself when it was given).
    """
    if out is None:
        out = []
    # Each (re)try processes ``data`` from the beginning, so drop whatever a
    # previous attempt collected to keep the output aligned with the input.
    out.clear()
    print(f"Current batch size is {batch_size}")
    processed = batch(data, nlp, batch_size=batch_size, clear_cache=True)
    for doc in tqdm(processed, desc="Processing texts via stanza", total=len(data)):
        if len(out) % 100 == 0:
            print(f"Processed {len(out) + 1} documents out of {len(data)}")
        out.append(doc)
    return out


try:
    # ``docs`` is passed through to ``tomaFun`` as ``out`` so that partial
    # results survive an error that toma cannot recover from.
    toma.simple.batch(tomaFun, 32, pipeline, df["text"], docs)
except Exception as e:
    # Best effort: report the failure and keep the documents collected so far.
    print(
        f"Got an exception after processing {len(docs)} out of {len(df)}: {e}",
        file=sys.stderr,
    )

# Kept for any later cell that reads the counter.
numDocs = len(docs)

# try:
#     for doc in tqdm(batch(list(df["text"]), pipeline, batch_size=1, clear_cache=True), desc="Processing texts via stanza", total=len(df)):
#         if numDocs % 100 == 0:
#             print(f"Processed {numDocs + 1} documents out of {len(df)}")
#         numDocs += 1
#         docs.append(doc)
# except Exception as e:
#     print(
#         f"Got an exception after processing {len(docs)} out of {len(df)}: {e}",
#         file=sys.stderr,
#     )




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 6.03MB/s]                    
2021-06-07 23:21:12 INFO: Downloading default packages for language: he (Hebrew)...
2021-06-07 23:21:14 INFO: File exists: e:\heb_nlp\hnlp_proj\hnlp_proj\stanza_resources\he\default.zip.
2021-06-07 23:21:17 INFO: Finished downloading models and saved to e:\heb_nlp\hnlp_proj\hnlp_proj\stanza_resources.
2021-06-07 23:21:17 INFO: Loading these models for language: he (Hebrew):
| Processor | Package |
-----------------------
| tokenize  | htb     |
| mwt       | htb     |
| pos       | htb     |
| lemma     | htb     |

2021-06-07 23:21:17 INFO: Use device: gpu
2021-06-07 23:21:17 INFO: Loading: tokenize
2021-06-07 23:21:20 INFO: Loading: mwt
2021-06-07 23:21:20 INFO: Loading: pos
2021-06-07 23:21:21 INFO: Loading: lemma
2021-06-07 23:21:21 INFO: Done loading processors!


Processing texts via stanza:   0%|          | 0/13408 [00:00<?, ?it/s]

Current batch size is 32
Processed 1 documents out of 13408


In [None]:
# Persist the collected stanza documents as a bz2-compressed pickle
# (pickle protocol 4) at ``pickle_path``.
with bz2.open(pickle_path, mode="wb") as compressed_out:
    pickle.Pickler(compressed_out, protocol=4).dump(docs)