Skip to content

Commit d445092

Browse files
authored
Merge pull request #7 from MichaelC1999/return-type-refactor
Return type refactor
2 parents 0b154de + b8d7cc4 commit d445092

File tree

2 files changed

+63
-66
lines changed

2 files changed

+63
-66
lines changed

README.md

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,33 +39,27 @@ In order to poll the substream, you will need to call the `poll()` function on t
3939
print(sb.output_modules)
4040

4141
# Poll the module and return a list of SubstreamOutput objects in the order of the specified modules
42-
result = sb.poll(["store_swap_events"], start_block=10000835, end_block=10000835+20000)
42+
result = sb.poll("map_swap_events", start_block=10000835, end_block=10000835+20000)
4343
```
4444

4545
With the default inputs, this function outputs pandas DataFrames after streaming all blocks between the start_block and end_block. However, depending on how this function is called, a dict object is returned instead. The `poll()` function has a number of inputs:
4646

4747
- output_modules
48-
- List of strings of output modules to stream
48+
- String of the output module to stream
4949
- start_block
5050
- Integer block number to start the polling
5151
- end_block
5252
	- Integer block number to end the polling. In theory, there is no max block number, as any block number past the chain head will stream the blocks in real time. It's recommended to use an end_block far off into the future if building a data app that will be streaming data in real time as blocks finalize, such as block 20,000,000
53-
- stream_callback
54-
- An optional callback function to be passed into the polling function to execute when valid streamed data is received
5553
- return_first_result
5654
- Boolean value that if True will return data on the first block after the start block to have an applicable TX/Event.
5755
	- Can be called recursively on the front end while incrementing the start_block to return data as it's streamed, rather than all data at once after streaming is completed
5856
- Defaults to False
59-
- If True, the data is returned in the format {"data": [], "module_name": String, "data_block": int}
6057
- initial_snapshot
6158
- Boolean value, defaults to False
62-
- highest_processed_block
63-
- Integer block number that is used in measuring indexing and processing progress, in cases where return_progress is True
64-
- Defaults to 0
65-
- return_progress: bool = False,
66-
- Boolean value that if True returns progress in back processing
67-
- Defaults to False
68-
59+
- return_type
60+
- Specifies the type of value to return
61+
- Passing "df" returns the data in a pandas DataFrame
62+
	- Passing "dict" returns in the format {"data": [], "module_name": String, "data_block": int, "error": str | None}
6963

7064
The result here is the default `SubstreamOutput` object, you can access both the `data` and `snapshots` dataframes by doing:
7165

substreams/substream.py

Lines changed: 57 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
DEFAULT_ENDPOINT = "api.streamingfast.io:443"
1818

19-
2019
def retrieve_class(module_name: str, class_name: str):
2120
module = import_module(module_name)
2221
return getattr(module, class_name)
@@ -126,11 +125,10 @@ def _parse_snapshot_deltas(self, snapshot: dict) -> list[dict]:
126125
for x in snapshot["deltas"].get("deltas", list())
127126
]
128127

129-
def _parse_data_outputs(self, data: dict, module_names: list[str]) -> list[dict]:
128+
def _parse_data_outputs(self, data: dict, module_name: str) -> list[dict]:
130129
outputs = list()
131-
module_set = set(module_names)
132130
for output in data["outputs"]:
133-
if "mapOutput" not in output or output["name"] not in module_set:
131+
if "mapOutput" not in output or output["name"] != module_name:
134132
continue
135133
map_output = output["mapOutput"]
136134
for key, items in map_output.items():
@@ -164,85 +162,90 @@ def proto_file_map(self) -> dict[str, DescriptorProto]:
164162

165163
def poll(
166164
self,
167-
output_modules: list[str],
165+
output_module: str,
168166
start_block: int,
169167
end_block: int,
170-
stream_callback: Optional[callable] = None,
171168
return_first_result: bool = False,
172169
initial_snapshot: bool = False,
173-
highest_processed_block: int = 0,
174-
return_progress: bool = False,
170+
return_type: str = "df"
175171
):
176-
from sf.substreams.v1.substreams_pb2 import STEP_IRREVERSIBLE, Request
177-
for module in output_modules:
178-
if module not in self.output_modules:
179-
raise Exception(f"module '{module}' is not supported for {self.name}")
180-
if self.output_modules[module].get('is_map') is False:
181-
raise Exception(f"module '{module}' is not a map module")
182-
self._class_from_module(module)
183-
184-
stream = self.service.Blocks(
185-
Request(
186-
start_block_num=start_block,
187-
stop_block_num=end_block,
188-
fork_steps=[STEP_IRREVERSIBLE],
189-
modules=self.pkg.modules,
190-
output_modules=output_modules,
191-
initial_store_snapshot_for_modules=output_modules
192-
if initial_snapshot
193-
else None,
194-
)
195-
)
196-
raw_results = defaultdict(lambda: {"data": list(), "snapshots": list()})
172+
173+
return_dict_interface = {"data": [], "module_name": output_module, "data_block": str(start_block), "error": None}
174+
valid_return_types = ["dict", "df"]
197175
results = []
198-
data_block = 0
199-
module_name = ""
176+
raw_results = defaultdict(lambda: {"data": list(), "snapshots": list()})
200177

201178
try:
179+
if isinstance(output_module, str) is False:
180+
raise Exception("The 'output_module' parameter passed into the poll() function is not a string.")
181+
return_type = return_type.lower()
182+
if return_type not in valid_return_types:
183+
return_type = "df"
184+
185+
from sf.substreams.v1.substreams_pb2 import STEP_IRREVERSIBLE, Request
186+
if output_module not in self.output_modules:
187+
raise Exception(f"module '{output_module}' is not supported for {self.name}")
188+
if self.output_modules[output_module].get('is_map') is False:
189+
raise Exception(f"module '{output_module}' is not a map module")
190+
self._class_from_module(output_module)
191+
192+
stream = self.service.Blocks(
193+
Request(
194+
start_block_num=start_block,
195+
stop_block_num=end_block,
196+
fork_steps=[STEP_IRREVERSIBLE],
197+
modules=self.pkg.modules,
198+
output_modules=[output_module],
199+
initial_store_snapshot_for_modules=[output_module]
200+
if initial_snapshot
201+
else None,
202+
)
203+
)
204+
202205
for response in stream:
203206
snapshot = MessageToDict(response.snapshot_data)
204207
data = MessageToDict(response.data)
205-
progress = MessageToDict(response.progress)
206208
session = MessageToDict(response.session)
207209

208210
if session:
209211
continue
210212

211213
if snapshot:
212-
module_name = snapshot["moduleName"]
213214
snapshot_deltas = self._parse_snapshot_deltas(snapshot)
214-
raw_results[module_name]["snapshots"].extend(snapshot_deltas)
215+
raw_results[output_module]["snapshots"].extend(snapshot_deltas)
215216

216217
if data:
217-
parsed = self._parse_data_outputs(data, output_modules)
218-
module_name = data["outputs"][0]["name"]
219-
raw_results[module_name]["data"].extend(parsed)
220-
data_block = data["clock"]["number"]
218+
parsed = self._parse_data_outputs(data, output_module)
219+
raw_results[output_module]["data"].extend(parsed)
220+
return_dict_interface["data_block"] = data["clock"]["number"]
221221
if len(parsed) > 0:
222-
parsed = [dict(item, **{'block':data_block}) for item in parsed]
222+
parsed = [dict(item, **{'block':data["clock"]["number"]}) for item in parsed]
223223
if return_first_result is True:
224224
break
225-
if callable(stream_callback):
226-
stream_callback(module_name, parsed)
227-
else:
228-
continue
229-
elif progress and return_progress is True:
230-
if 'processedBytes' in progress["modules"][0] or 'processedRanges' not in progress["modules"][0]:
231-
continue
232-
endBlock = int(progress["modules"][0]['processedRanges']['processedRanges'][0]['endBlock'])
233-
data_block = endBlock
234-
if endBlock > highest_processed_block + 100 and progress["modules"][0]['name'] == output_modules[0]:
235-
return {"block": int(endBlock)}
236-
if return_first_result is True:
237-
return {"data": parsed, "module_name": module_name, "data_block": data_block}
238-
for output_module in output_modules:
225+
elif int(return_dict_interface["data_block"]) + 1 == end_block:
226+
results = return_dict_interface
227+
228+
if return_first_result is True and parsed:
229+
return_dict_interface["data"] = parsed
230+
if return_type == "dict":
231+
results = return_dict_interface
232+
if return_type == "df":
233+
results = pd.DataFrame(parsed)
234+
if return_first_result is False and raw_results:
239235
result = SubstreamOutput(module_name=output_module)
240236
data_dict: dict = raw_results.get(output_module)
241237
for k, v in data_dict.items():
242238
df = pd.DataFrame(v)
243239
df["output_module"] = output_module
244240
setattr(result, k, df)
245241
results.append(result)
242+
if return_type == "dict":
243+
return_dict_interface["data"] = results.to_dict()
244+
results = return_dict_interface
246245
except Exception as err:
247-
results = {"error": err}
246+
error_to_pass = err
247+
if isinstance(err, Exception):
248+
error_to_pass = str(err)
249+
return_dict_interface["error"] = error_to_pass
250+
results = return_dict_interface
248251
return results

0 commit comments

Comments
 (0)