[BUG] Upgrade dependencies and add dateparser to dependency list #252

Merged · 3 commits · Jul 19, 2022
1 change: 1 addition & 0 deletions ATTRIBUTION.md
@@ -16,3 +16,4 @@ This could not have been possible without following open source software -
 - [gnews](https://github.com/ranahaani/GNews): For Google News integration
 - [python-facebook-api](https://github.com/sns-sdks/python-facebook): For facebook integration
 - [youtube-comment-downloader](https://github.com/egbertbouman/youtube-comment-downloader): For Youtube video comments extraction code
+- [dateparser](https://github.com/scrapinghub/dateparser): To parse dates properly (where the format is ambiguous)
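
For context, a minimal sketch of what dateparser offers over `datetime.strptime` — illustrative usage, not part of this diff; the `settings` keys shown are from dateparser's documented API:

```python
# Illustrative dateparser usage -- not from this PR's diff.
import dateparser

# Absolute timestamps parse without a format string.
print(dateparser.parse("2022-07-19 10:30:00"))

# Relative periods parse too, which strptime cannot handle.
print(dateparser.parse("5 days ago"))

# Ambiguous day/month order can be pinned down explicitly.
print(dateparser.parse("03/04/2022", settings={"DATE_ORDER": "DMY"}))
```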
1 change: 0 additions & 1 deletion example/pandas_sink_example.py
@@ -7,7 +7,6 @@
     ClassificationAnalyzerConfig,
     ZeroShotClassificationAnalyzer,
 )
-from obsei.misc.utils import obj_to_json
 from obsei.sink.pandas_sink import PandasSink, PandasSinkConfig
 from obsei.source.playstore_scrapper import (
     PlayStoreScrapperConfig,
29 changes: 4 additions & 25 deletions example/reddit_scrapper_example.py
@@ -1,14 +1,11 @@
 import logging
 import sys
-import time
 from datetime import datetime, timedelta

 import pytz

 from obsei.misc.utils import DATETIME_STRING_PATTERN
 from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource
-from obsei.workflow.store import WorkflowStore
-from obsei.workflow.workflow import Workflow, WorkflowConfig


 def print_state(id: str):
@@ -26,26 +23,8 @@ def print_state(id: str):
     lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
 )

-source = RedditScrapperSource(store=WorkflowStore())
+source = RedditScrapperSource()

-workflow = Workflow(
-    config=WorkflowConfig(
-        source_config=source_config,
-    ),
-)
-source.store.add_workflow(workflow)
-
-
-for i in range(1, 4):
-    print_state(workflow.id)
-    source_response_list = source.lookup(source_config, id=workflow.id)
-
-    if source_response_list is None or len(source_response_list) == 0:
-        break
-
-    for source_response in source_response_list:
-        logger.info(source_response.__dict__)
-
-    time.sleep(30)
-
-print_state(workflow.id)
+source_response_list = source.lookup(source_config)
+for source_response in source_response_list:
+    logger.info(source_response.__dict__)
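
Putting the two hunks together, the simplified example no longer needs a workflow store or a polling loop. A sketch of the post-change script — the config's URL, the lookback window, and the logger setup are elided in the diff and assumed here:

```python
import logging
import sys
from datetime import datetime, timedelta

import pytz

from obsei.misc.utils import DATETIME_STRING_PATTERN
from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Assumed one-day lookback; the diff only shows that lookup_period is
# rendered from a datetime via DATETIME_STRING_PATTERN.
since_time = datetime.now(pytz.utc) - timedelta(days=1)

source_config = RedditScrapperConfig(
    url="...",  # subreddit RSS URL, elided in the diff
    lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
)

# No WorkflowStore/Workflow plumbing needed any more.
source = RedditScrapperSource()

source_response_list = source.lookup(source_config)
for source_response in source_response_list:
    logger.info(source_response.__dict__)
```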
5 changes: 4 additions & 1 deletion obsei/misc/youtube_reviews_scrapper.py
@@ -114,7 +114,9 @@ def _fetch_comments(self, until_datetime: Optional[datetime] = None):
                 continuations = [sort_menu[self.sort_by]['serviceEndpoint']]
                 needs_sorting = False
                 continue
-            raise RuntimeError('Failed to set sorting')
+            # TODO: Fix it. Causing observer to fail silently
+            logger.warning("Unable to set sorting")
+            # raise RuntimeError('Failed to set sorting')

         actions = list(self._search_dict(response, 'reloadContinuationItemsCommand')) + \
             list(self._search_dict(response, 'appendContinuationItemsAction'))
@@ -125,6 +127,7 @@ def _fetch_comments(self, until_datetime: Optional[datetime] = None):
             # Process continuations for comments and replies.
             continuations[:0] = [ep for ep in self._search_dict(item, 'continuationEndpoint')]
             if self.fetch_replies:
+                # TODO: Fix it. This functionality is broken
                 if action['targetId'].startswith('comment-replies-item') and 'continuationItemRenderer' in item:
                     # Process the 'Show more replies' button
                     continuations.append(next(self._search_dict(item, 'buttonRenderer'))['command'])
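
For the sorting hunk above, the trade-off is worth spelling out: raising aborted the whole observer run, while the warning lets extraction continue with YouTube's default sort order (hence the TODO about failing silently). A generic sketch of the warn-and-continue pattern, with illustrative names only:

```python
import logging

logger = logging.getLogger(__name__)


def fetch_items(sort_menu: list, sort_by: int) -> list:
    """Fetch items, degrading gracefully if the sort order cannot be set."""
    if not (0 <= sort_by < len(sort_menu)):
        # Pre-PR behavior: raise RuntimeError('Failed to set sorting'),
        # which aborted the observer run on any transient page change.
        # Post-PR behavior: warn and continue with the default order.
        logger.warning("Unable to set sorting")
    return ["item-1", "item-2"]  # placeholder payload for the sketch


fetch_items(sort_menu=[], sort_by=0)  # logs a warning instead of raising
```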
2 changes: 1 addition & 1 deletion obsei/sink/zendesk_sink.py
@@ -105,7 +105,7 @@ def __init__(self, **data: Any):
         self.cred_info = self.cred_info or ZendeskCredInfo()

     def get_endpoint(self) -> str:
-        sub_prefix = "" if self.subdomain is None or self.subdomain is '' else f"/{self.subdomain}."
+        sub_prefix = "" if self.subdomain is None or self.subdomain == '' else f"/{self.subdomain}."
         return f'{self.scheme}://{sub_prefix}{self.domain}{self.ticket_api}'
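
The fix above matters because `is` tests object identity, not equality: whether `self.subdomain is ''` holds depends on CPython string interning, and Python 3.8+ flags it with `SyntaxWarning: "is" with a literal`. A falsiness check would cover `None` and `''` in one test; a suggested tightening (not part of this PR, `build_sub_prefix` is a hypothetical name):

```python
from typing import Optional


def build_sub_prefix(subdomain: Optional[str]) -> str:
    # `not subdomain` is True for both None and "" in a single check.
    return "" if not subdomain else f"/{subdomain}."


assert build_sub_prefix(None) == ""
assert build_sub_prefix("") == ""
assert build_sub_prefix("support") == "/support."
```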


5 changes: 1 addition & 4 deletions obsei/source/reddit_scrapper.py
@@ -63,10 +63,7 @@ def lookup(self, config: RedditScrapperConfig, **kwargs) -> List[TextPayload]:
         )
         lookup_period: str = scrapper_stat.get("since_time", config.lookup_period)
         lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
-        if len(lookup_period) <= 5:
-            since_time = convert_utc_time(lookup_period)
-        else:
-            since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)
+        since_time = convert_utc_time(lookup_period)

         last_since_time: datetime = since_time
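
The body of `convert_utc_time` is not part of this diff, but with dateparser now a dependency it can presumably absorb both branches removed above: short relative periods and full `DATETIME_STRING_PATTERN` timestamps. A sketch of a dateparser-backed version, assuming only the settings keys from dateparser's documented API:

```python
# A sketch only; the real convert_utc_time lives in obsei.misc.utils
# and is not shown in this diff.
from datetime import datetime

import dateparser


def convert_utc_time(lookup_period: str) -> datetime:
    # dateparser handles absolute timestamps ("2022-07-19 10:30:00") and
    # relative phrases ("5 days ago") alike, so the caller no longer needs
    # the len(lookup_period) <= 5 branch that this hunk removes.
    parsed = dateparser.parse(
        lookup_period,
        settings={"TO_TIMEZONE": "UTC", "RETURN_AS_TIMEZONE_AWARE": True},
    )
    if parsed is None:
        raise ValueError(f"Unable to parse lookup period: {lookup_period!r}")
    return parsed
```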

1 change: 1 addition & 0 deletions requirements.in
@@ -25,6 +25,7 @@ pandas
 nltk
 gnews
 python-facebook-api
+dateparser

 ## GPL dependencies (these are optional)
 # trafilatura