[BUG] Upgrade dependencies and add dateparser in dependency list (#252)
lalitpagaria committed Jul 19, 2022
1 parent fecb1c9 commit 982ea9a
Showing 8 changed files with 86 additions and 107 deletions.
1 change: 1 addition & 0 deletions ATTRIBUTION.md
@@ -16,3 +16,4 @@ This could not have been possible without following open source software -
 - [gnews](https://github.com/ranahaani/GNews): For Google News integration
 - [python-facebook-api](https://github.com/sns-sdks/python-facebook): For facebook integration
 - [youtube-comment-downloader](https://github.com/egbertbouman/youtube-comment-downloader): For Youtube video comments extraction code
+- [dateparser](https://github.com/scrapinghub/dateparser): To parse date properly (where format is ambiguous)
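
The new attribution entry explains why dateparser joins the dependency list: it infers a date string's format rather than requiring a fixed strptime pattern. A minimal sketch of its parse API (output values illustrative):

```python
import dateparser

# dateparser guesses the format, including relative phrases:
print(dateparser.parse("19 Jul 2022 10:30"))  # absolute timestamp
print(dateparser.parse("2 days ago"))         # relative expression

# Where day/month order is ambiguous, it can be pinned explicitly:
print(dateparser.parse("12.07.2022", settings={"DATE_ORDER": "DMY"}))
```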
1 change: 0 additions & 1 deletion example/pandas_sink_example.py
@@ -7,7 +7,6 @@
     ClassificationAnalyzerConfig,
     ZeroShotClassificationAnalyzer,
 )
-from obsei.misc.utils import obj_to_json
 from obsei.sink.pandas_sink import PandasSink, PandasSinkConfig
 from obsei.source.playstore_scrapper import (
     PlayStoreScrapperConfig,
29 changes: 4 additions & 25 deletions example/reddit_scrapper_example.py
@@ -1,14 +1,11 @@
 import logging
 import sys
-import time
 from datetime import datetime, timedelta
 
 import pytz
 
 from obsei.misc.utils import DATETIME_STRING_PATTERN
 from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource
-from obsei.workflow.store import WorkflowStore
-from obsei.workflow.workflow import Workflow, WorkflowConfig
 
 
 def print_state(id: str):
@@ -26,26 +23,8 @@ def print_state(id: str):
     lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
 )
 
-source = RedditScrapperSource(store=WorkflowStore())
+source = RedditScrapperSource()
 
-workflow = Workflow(
-    config=WorkflowConfig(
-        source_config=source_config,
-    ),
-)
-source.store.add_workflow(workflow)
-
-
-for i in range(1, 4):
-    print_state(workflow.id)
-    source_response_list = source.lookup(source_config, id=workflow.id)
-
-    if source_response_list is None or len(source_response_list) == 0:
-        break
-
-    for source_response in source_response_list:
-        logger.info(source_response.__dict__)
-
-    time.sleep(30)
-
-print_state(workflow.id)
+source_response_list = source.lookup(source_config)
+for source_response in source_response_list:
+    logger.info(source_response.__dict__)
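
Assembled from the two hunks, the simplified example boils down to a single lookup with no workflow store. A runnable sketch under stated assumptions: the RedditScrapperConfig `url` field and its value are assumptions modeled on obsei's other examples, and the logging setup mirrors the untouched parts of the file:

```python
import logging
import sys
from datetime import datetime, timedelta

import pytz

from obsei.misc.utils import DATETIME_STRING_PATTERN
from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Look back one day, formatted the way obsei expects lookup periods.
since_time = datetime.now(pytz.utc) - timedelta(days=1)

source_config = RedditScrapperConfig(
    url="https://www.reddit.com/r/machinelearning/.rss?sort=new",  # assumed example URL
    lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
)

source = RedditScrapperSource()

source_response_list = source.lookup(source_config)
for source_response in source_response_list:
    logger.info(source_response.__dict__)
```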
5 changes: 4 additions & 1 deletion obsei/misc/youtube_reviews_scrapper.py
@@ -114,7 +114,9 @@ def _fetch_comments(self, until_datetime: Optional[datetime] = None):
                     continuations = [sort_menu[self.sort_by]['serviceEndpoint']]
                     needs_sorting = False
                     continue
-                raise RuntimeError('Failed to set sorting')
+                # TODO: Fix it. Causing observer to fail silently
+                logger.warning("Unable to set sorting")
+                # raise RuntimeError('Failed to set sorting')
 
             actions = list(self._search_dict(response, 'reloadContinuationItemsCommand')) + \
                 list(self._search_dict(response, 'appendContinuationItemsAction'))
@@ -125,6 +127,7 @@ def _fetch_comments(self, until_datetime: Optional[datetime] = None):
                 # Process continuations for comments and replies.
                 continuations[:0] = [ep for ep in self._search_dict(item, 'continuationEndpoint')]
                 if self.fetch_replies:
+                    # TODO: Fix it. This functionality is broken
                     if action['targetId'].startswith('comment-replies-item') and 'continuationItemRenderer' in item:
                         # Process the 'Show more replies' button
                         continuations.append(next(self._search_dict(item, 'buttonRenderer'))['command'])
2 changes: 1 addition & 1 deletion obsei/sink/zendesk_sink.py
@@ -105,7 +105,7 @@ def __init__(self, **data: Any):
         self.cred_info = self.cred_info or ZendeskCredInfo()
 
     def get_endpoint(self) -> str:
-        sub_prefix = "" if self.subdomain is None or self.subdomain is '' else f"/{self.subdomain}."
+        sub_prefix = "" if self.subdomain is None or self.subdomain == '' else f"/{self.subdomain}."
         return f'{self.scheme}://{sub_prefix}{self.domain}{self.ticket_api}'
 
 
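The one-character change above is a genuine bug fix: `is` tests object identity while `==` tests value equality, and two equal strings are not guaranteed to be the same object (CPython 3.8+ even emits a SyntaxWarning for `is` against a literal). A quick demonstration:

```python
a = "sub"
b = "".join(["s", "u", "b"])  # equal value, constructed as a distinct object

print(a == b)  # True  -> value equality, what get_endpoint needs
print(a is b)  # False in CPython -> identity check, the bug being fixed
```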
5 changes: 1 addition & 4 deletions obsei/source/reddit_scrapper.py
@@ -63,10 +63,7 @@ def lookup(self, config: RedditScrapperConfig, **kwargs) -> List[TextPayload]:
         )
         lookup_period: str = scrapper_stat.get("since_time", config.lookup_period)
         lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
-        if len(lookup_period) <= 5:
-            since_time = convert_utc_time(lookup_period)
-        else:
-            since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)
+        since_time = convert_utc_time(lookup_period)
 
         last_since_time: datetime = since_time
 
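With the branch removed, every lookup_period string goes through convert_utc_time; the old `len(lookup_period) <= 5` heuristic handed any longer string straight to strptime, which crashes on anything not matching DATETIME_STRING_PATTERN exactly. convert_utc_time itself is not shown in this diff, so the following is only a hypothetical sketch of a unified parser in that spirit (the regex and fallback logic are assumptions):

```python
import re
from datetime import datetime, timedelta, timezone

import dateparser


def convert_utc_time(period: str) -> datetime:
    # Hypothetical: short relative periods like "5d" / "12h" / "30m" are
    # handled directly; everything else is delegated to dateparser.
    match = re.fullmatch(r"(\d+)([mhd])", period)
    if match:
        value, unit = int(match.group(1)), match.group(2)
        kwarg = {"m": "minutes", "h": "hours", "d": "days"}[unit]
        return datetime.now(timezone.utc) - timedelta(**{kwarg: value})
    parsed = dateparser.parse(
        period,
        settings={"TIMEZONE": "UTC", "RETURN_AS_TIMEZONE_AWARE": True},
    )
    if parsed is None:
        raise ValueError(f"Unable to parse lookup period: {period!r}")
    return parsed
```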
1 change: 1 addition & 0 deletions requirements.in
@@ -25,6 +25,7 @@ pandas
 nltk
 gnews
 python-facebook-api
+dateparser
 
 ## GPL dependencies (these are optional)
 # trafilatura
