Skip to content
This repository has been archived by the owner on Jan 25, 2023. It is now read-only.

Commit

Permalink
improve testing coverage
Browse files (browse the repository at this point in the history)
  • Loading branch information
andrea-sottana-MoJ committed Jun 12, 2020
1 parent 2237be6 commit b39bbc9
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import logging
import numpy as np
import pandas as pd
from pdf2embeddings.scraper import DocumentScraper
Expand All @@ -12,6 +13,15 @@ def test_class_instantiation(self, text_cleaning_json):
assert scraper.pdf_folder == os.getenv("FIXTURES_DIR")
assert scraper.open_json == text_cleaning_json

def test_class_instantiation_when_no_text_cleaning_json_provided(self, caplog):
    """Check instantiation when no text-cleaning JSON is supplied.

    The scraper is built with only the fixtures folder. The test expects
    the folder to be stored on ``pdf_folder``, a message about the missing
    .json file to appear in the captured log output, and ``open_json`` to
    be an empty dict.
    """
    fixtures_dir = os.getenv("FIXTURES_DIR")
    missing_json_message = (
        'No .json file for text cleaning was provided. '
        'Ad-hoc text cleaning will not be performed.'
    )

    # Capture at WARNING level while the scraper is being constructed.
    with caplog.at_level(logging.WARNING):
        scraper = DocumentScraper(fixtures_dir)

    assert scraper.pdf_folder == fixtures_dir
    assert missing_json_message in caplog.text
    assert scraper.open_json == {}

def test_document_corpus_to_pandas_df(self):
expected_scraped_df = pd.DataFrame(
{'test_pdf_1': ['Mr Michael went to the store to buy some eggs. Joel rolled down the street on his '
Expand Down

0 comments on commit b39bbc9

Please sign in to comment.