In [None]:
import pandas as pd
import os
from src.parallel_score.parallel_score import process_batch

def assign_partition_key(df, n_partitions):
    """Tag each row of ``df`` with a round-robin partition number.

    A ``partition_key`` column holding ``df.index % n_partitions`` is written
    onto ``df`` itself; the frame is modified in place, not copied.

    Args:
        df: DataFrame whose index supports the ``%`` operator (for a default
            integer index the keys cycle through ``0 .. n_partitions - 1``).
        n_partitions: Number of partitions; must be at least 1.

    Returns:
        The same ``df`` object, now carrying the ``partition_key`` column.

    Raises:
        ValueError: If ``n_partitions`` is less than 1.
    """
    # A zero or negative modulus cannot produce valid 0..n-1 keys, so reject
    # it up front instead of writing unusable values into the frame.
    if n_partitions < 1:
        raise ValueError(f'n_partitions must be at least 1, got {n_partitions}')
    df['partition_key'] = df.index % n_partitions
    return df

def print_statistics(df, key):
    """Print summary counts for ``df`` grouped by the ``key`` column.

    Prints the frame's shape, then the total number of rows counted across
    all distinct ``key`` values together with how many distinct values exist.

    Args:
        df: DataFrame to summarise.
        key: Name of the column whose values are counted.

    Returns:
        The ``value_counts()`` Series for ``df[key]``: one entry per distinct
        value, holding the number of rows with that value.
    """
    print(f'Source size: {df.shape}')

    # Number of rows for each distinct value of `key`.
    data_by_key_df = df[key].value_counts()

    # Sum the per-value counts in one vectorised call rather than a Python loop.
    total = data_by_key_df.sum()

    print(f'Total: {total} records, {len(data_by_key_df)} {key}s')
    return data_by_key_df

In [None]:
# Load the raw comments, tag every row with a partition key, and save the
# tagged frame so later cells can read it back from disk.
# Paths are built with os.path.join: the previous 'data\source_...' literals
# relied on the invalid escape sequence '\s' and only resolved on Windows.
N_PARTITIONS = 20

src_df = pd.read_csv(os.path.join('data', 'source_comments.csv'))
# assign_partition_key writes the column onto src_df itself and returns it.
df_partitioned = assign_partition_key(src_df, N_PARTITIONS)
df_partitioned.to_csv(os.path.join('data', 'source_comments_partitioned.csv'), index=False)
print('====Partition statistics====')
# Last expression of the cell: the per-partition counts are also displayed.
print_statistics(df_partitioned, 'partition_key')

In [None]:
# Run the partitioned frame through process_batch (imported from
# src.parallel_score.parallel_score).
out_data = process_batch(df_partitioned)

# Write the output under <current working directory>/results.
# os.path.join replaces the hand-built '\\results\l...' string, which mixed an
# escaped backslash with the invalid escape '\l' and only worked on Windows.
# NOTE(review): 'local_resulys.csv' looks like a typo for 'local_results.csv';
# the name is left unchanged in case other tooling reads this file - confirm.
out_data.to_csv(os.path.join(os.getcwd(), 'results', 'local_resulys.csv'), index=False)

In [None]:
import os

import pandas as pd

# Read the partitioned source back from disk and keep only the rows whose
# FEEDBACK value is not missing.
# os.path.join replaces the 'data\s...' / 'results\c...' literals, which
# relied on invalid escape sequences and only resolved on Windows.
source_df = pd.read_csv(os.path.join('data', 'source_comments_partitioned.csv'))
source_non_empty_df = source_df.dropna(subset=['FEEDBACK'])

# Results file analysed by the cells that follow.
r1_df = pd.read_csv(os.path.join('results', 'consolidated_results_3.csv'))
r1_df.head(1)


In [None]:
# # Add 1-based partition keys to the input data, assigned round-robin by row index so partitions are roughly equal in size (the modulus below yields keys 1-15)
# source_non_empty_df['partition_key'] = source_non_empty_df.index % 15 + 1
# source_non_empty_df.to_csv('data\source_comments_partitioned.csv', index=False)

In [None]:
# Row counts per partition_key for the source rows kept in source_non_empty_df.
print('====Source statistics====')
print_statistics(source_non_empty_df, 'partition_key')

# Row counts per distinct COMPLETION value in the results frame.
print('====Results statistics====')
x1 = print_statistics(r1_df, 'COMPLETION')

# Display the five most frequent COMPLETION values.
x1_descending = x1.sort_values(ascending=False)
x1_descending.head(5)

In [None]:
# Overall span of the run: smallest START_TIME to largest END_TIME.
# NOTE(review): the arithmetic below treats both columns as numeric seconds
# (the span is divided by 60 to get minutes) - confirm against the results schema.
earliest = r1_df['START_TIME'].min()
latest = r1_df['END_TIME'].max()
total_time = latest - earliest
total_time_minutes = total_time / 60

# Throughput over the whole span.
size = len(r1_df)
processing_rate = size / total_time

# Per-record duration, stored on r1_df as a new PROCESSING_TIME column.
r1_df['PROCESSING_TIME'] = r1_df['END_TIME'].sub(r1_df['START_TIME'])
average_processing_time = r1_df['PROCESSING_TIME'].mean()

# Number of records for each distinct RETRY value.
retry_count = r1_df['RETRY'].value_counts()

summary_lines = (
    f'Records Processed: {size}',
    f'Processing time: {total_time_minutes} minutes',
    f'Processing rate: {processing_rate} records per second',
    f'Average processing time: {average_processing_time} seconds',
)
for summary_line in summary_lines:
    print(summary_line)

print('=====Retries===')
for retry, occurrences in retry_count.items():
    print(f'{retry}: {occurrences}')


In [None]:
# Count the records whose COMPLETION is exactly 'Ratelimit Error'.
ratelimit_mask = r1_df['COMPLETION'] == 'Ratelimit Error'
print(f'Ratelimit: {ratelimit_mask.sum()}')

# Keep the records whose COMPLETION is exactly 'Error' and report how many.
errors_df = r1_df.loc[r1_df['COMPLETION'] == 'Error']
print(f'Errors: {len(errors_df)}')

In [None]:
# Count rows whose COMPLETION holds a real value.
# The previous filter `r1_df['COMPLETION'] != None` is evaluated element-wise
# and is True for every row, so it never removed anything. Missing values are
# now excluded with notna(), and the literal string 'None' is excluded as well
# in case it survives CSV parsing.
non_none_df = r1_df[r1_df['COMPLETION'].notna() & (r1_df['COMPLETION'] != 'None')]
print(f'Non-None: {non_none_df.shape[0]}')