# Collect Training Data 

Thanks for helping me. Run this notebook; it might take several hours.

When it's done, send me the results file: `testing_results_<timestamp>.csv` (the exact name is printed by the first code cell). Thanks!

In [1]:
import os
import platform
import random
import time
from datetime import datetime

import pandas as pd
import psutil

# Name the results file after the moment this cell ran (date, then 12-hour
# clock time with an AM/PM marker), and show the name to the user.
date = format(datetime.now(), "%Y%m%d__%I%M_%p")
result_file = f'testing_results_{date}.csv'
print(result_file)

testing_results_20210426__1012_AM.csv


##### Helper Functions

In [2]:
def get_cpu_stats():
    """Snapshot host-level facts that are stored next to every timing.

    Returns:
        dict with the keys, in this order: ``cpu_name`` (the machine's
        network/host name), ``mem_total``, ``mem_avail``, ``mem_free`` (as
        reported by ``psutil.virtual_memory()``), ``mem_pctavail`` (available
        memory as a percentage of total), ``cpu_virt_count`` and
        ``cpu_core_count`` (logical and physical CPU counts) and
        ``sys_boot_time`` (``psutil.boot_time()``).
    """
    # Query memory once so that all mem_* values describe the same instant;
    # separate calls could each observe a different amount of free memory.
    mem = psutil.virtual_memory()

    stats={}
    # platform.node() also works on Windows, where os.uname() does not exist
    stats['cpu_name'] = platform.node()
    stats['mem_total'] = mem.total
    stats['mem_avail'] = mem.available
    stats['mem_free'] = mem.free
    stats['mem_pctavail'] = mem.available * 100 / mem.total
    stats['cpu_virt_count'] = psutil.cpu_count(logical=True)
    stats['cpu_core_count'] = psutil.cpu_count(logical=False)
    stats['sys_boot_time'] = psutil.boot_time()
    return stats

def get_df_stats(df):
    """Describe the size of a DataFrame.

    Args:
        df: the pandas DataFrame to measure.

    Returns:
        dict with ``df_size`` (number of cells), ``df_rows``, ``df_cols``,
        ``df_mem_used`` (sum of ``df.memory_usage(deep=True)``) and
        ``df_row_size`` (``df_mem_used`` divided by the row count; NaN for a
        frame without rows).
    """
    n_rows, n_cols = df.shape
    # deep=True inspects every object value, which is slow on large frames,
    # so measure once and reuse the result
    mem_used = df.memory_usage(deep=True).sum()

    stats={}
    stats['df_size'] = df.size
    stats['df_rows'] = n_rows
    stats['df_cols'] = n_cols
    stats['df_mem_used'] = mem_used
    # an average per row is undefined when there are no rows
    stats['df_row_size'] = mem_used / n_rows if n_rows else float('nan')
    return stats

def get_stats(df):
    """Return one flat dict holding both the DataFrame stats and the host stats.

    The host snapshot is taken first, then the frame is measured. In the
    result the ``df_*`` entries come ahead of the host entries.
    """
    host_info = get_cpu_stats()
    combined = get_df_stats(df)
    for key, value in host_info.items():
        combined[key] = value
    return combined


#### List of files and directories (of csvs) to test

In [3]:
# Remote CSV datasets to benchmark. Every URL appears exactly once, so no
# dataset is downloaded and tested twice.
files = [
'https://s3.amazonaws.com/datarobot_public_datasets/weekly_Federer_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/emr_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/hosp_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/opnp_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/opp_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/ofnp_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/ofp_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/spot-prices_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Heller_Motor_sev_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/PSID_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/TexasClosedClaims_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/NCAAB2009_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/spotprices_train_OTP.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/New_York_Mets_Ian_11_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/New_York_Mets_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/All_Players_All_Years_wconf_b_t_ht_wt_pos_predict_role_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/MedExp_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Time_Series_Sales.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/tmathssk_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/treadssk_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/TexasClosedClaims.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/FrenchMotor1_sev_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/nfl_stats_reg_80.csv',
'http://s3.amazonaws.com/datarobot_data_science/test_data/New_York_Mets_Ian_11.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/AutoClaims_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/mlcomp1452_AlphaProteins_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/RecessionData_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/mlcomp701_Sales_7756-2._train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/pumadyn_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/PregnancyOutcomes_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/wunderground_Chicago_actual_min_temp_calendar_effect_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/wunderground_Chicago_actual_min_temp_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/wunderground_Chicago_actual_precipitation_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/amis_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/blakhol_rg_test_5_train_converted_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/time_series/multiseries_calendars/sales_multiseries_training.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/add10_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/sarcos-robot-10000_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/fastiron-train-sample_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/amazon_fr_reviews_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/amazon_de_reviews_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/retail_japanese_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/Facebook-TotalClicks_calendar_effect_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/kdd_1998_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/crowd_text_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/msd_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/ABProteins_train_converted_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/Google-TotalClicks_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/Google-TotalSpend_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/weather_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/OnCampusArrests_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/LMx_training_sub_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/census_2012h_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/dr-commit-activity_calendar_effect_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/mlcomp1220_User-Event-Features_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_NBA_2017-2018.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Pred_Main_Reg.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/amazon_de_reviews_small.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/de_reviews_small_multiline.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/radon-activity_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/AmericanTime2010Edited_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/TimeSeriesAnomalyDetection/device_failure/device_failure_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/census_1990_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_AWS_Job_Optimization.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/BostonPublicRaises_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DoctorContacts_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/allegan_weather_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_THEFT_claim_frequency_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_THEFT_claim_count_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/SP500_with_technical_multi_nogaps_10M.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/dr-git-activity-tsmulti-daily.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/dr-git-activity-tsmulti-daily-weights.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/cross_series/dr-git-activity-tsmulti-daily-weights-cs.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/BudgetFood_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/mlcomp551_etf300-fractal-f20-126_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/300p_PA_All_Players_All_Info_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Statistical_Case_Estimates.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Temporal_Nature_Of_Claims.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/rwm_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/rollingsales_brooklyn_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/VietNamI_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/el-nino-friendly-30000_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/accident_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/AirlineFlights2008-reduced-35000_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/FrenchMotor1_claim_count_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/FrenchMotor1_cost_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/FrenchMotor1_cost_80_as_logged_offset.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/FrenchMotor1_claim_frequency_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/FrenchMotor1_pure_premium_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/fpp_aus_departures_permanent_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/LoanStats3a-friendly_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/text_data_only/ms_text_data_4000x10.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/sarcos-robot_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Stock_Dispersion.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/allstate-nonzero_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_DAMAGE_claim_count_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_TP_claim_count_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_WINDSCREEN_claim_count_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_THEFT_cost_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_WINDSCREEN_cost_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_DAMAGE_cost_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_TP_cost_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_THEFT_pure_premium_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_DAMAGE_claim_frequency_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_TP_claim_frequency_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_WINDSCREEN_claim_frequency_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_DAMAGE_pure_premium_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_TP_pure_premium_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/French_WINDSCREEN_pure_premium_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/missing_synthetic_number_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/lib_numeric_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/gdelt_wti_daily_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Depot_text_cosine_sim_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/movies_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/ufo_awesome_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_LifetimeCustomerValue.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/foreign_datasets/LCData_JP.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_InsuranceExposure.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/syph.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Heller_Motor_claim_count_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Heller_Motor_cost_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Heller_Motor_pure_premium_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Heller_Motor_claim_frequency_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/AirlineFlights2008-reduced-70000_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Insurer3_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Insurer4_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Insurer2_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Insurer1_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/leakage/Motorcycle_insurance_claims_leak.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/kdd_1998_full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/UML-bus-times_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/bio_wide_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/weather_large_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/GES12_no_unknown_cases_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/rotten_tomatoes_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/burncpu_train_OTP.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/el-nino-friendly_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/BurnCPUBurn_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/stateDataCategorical_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/PredictFix_train_comments_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_LossGivenDefault.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/StarLightCurves_10M.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/safer/yelp_business_rating.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/SP500_multi_nogaps_10M.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/fastiron-train-full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/fastiron_train_OTP.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/AirFuel_mix_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/msd_full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/ClickPrediction80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/SP500_multi_nogaps_10M_weight.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/msd_full_CV5.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/northwind_product_orders_amounts_by_day_calendar_effect_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/kiva_loans_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/ny_toll_vehicle_count_by_day_2006-2013.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/store_sales_mix_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/sales_0.25gr_with_gap.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/sales_0gr_with_gap.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/rossman_kaggle_train_store_10M_series.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/rossman_kaggle_train_store_10M_series_mixed_dtypes.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/allstate_reg_subset_1095605_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/rossmann_kaggle_train_long_store_935_w0.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/mer_text_combo_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/time_series/multiseries_calendars/dr-git-activity-tsmulti-hourly_calendar_effect_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/yoga_10M.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/census_1990_full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/allstate-sample-50.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/cross_series/japanese_euro_date_cs.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/time_series/unsupervised_mode/excess_return_correct/excess-return-correct_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Google_AdWords_WithLags.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/time_series/iso_ne_millis_2500_with_classification.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/Tools_and_Stats_for_college_players_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/otv_monotonic_constraints/InsuranceDemoWithPostcode50k_OTV.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/InsuranceDemoWithPostcode50k.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/leakage/phone_gender_narrow_leak.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/grockit_train_small_no_outcome_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/train_swing_and_miss_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/r_recommend_full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/subreddit_text_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/subreddit_text_cosine_sim_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/F_6Features_train_converted_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/HIGGS-110000_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/KDD14_text_features_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/mini_boone_full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/bloggers_small_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/credit-train-full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/28_Features_split_train_converted_train80_CVTVH3.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/28_Features_split_train_converted_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/rottentomatoes_sentiment_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/recipe_text_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/dr-git-activity-tsmulti-hourly-classification.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/william-trainingNA_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/home_ins_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/bloggers_text_cosine_sim_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/repere-fusion-spk_train_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/quora_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exploratory_testing/cross_series/LCData_JP_cs.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/dave-drTestSet_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/donors_choose_train.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/covtype_full_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/safer/donors_choose.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/chargeback_clean_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/chargeback_train_OTP.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/mortDefault2009_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/large_xor_text_japanese.csv_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/allstate_classif_subset_1095605_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exported_products_80.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/exported_products_CVTVH3.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/criteo_20pct_in_mem_5GB.csv',
'https://s3.amazonaws.com/datarobot_public_datasets/avazu_train_in_mem_10GB.csv']

# Local directories whose *.csv files are added to the list of files to test.
# NOTE(review): this is an absolute path from one specific machine -- point it
# at a directory that exists on yours, or use an empty list to test only the URLs.
directories = ['/Users/josh.berry/Downloads']


In [5]:
from os import listdir
def find_csv_filenames(path_to_dir, suffix=".csv" ):
    """List the entries of ``path_to_dir`` whose names end with ``suffix``.

    Args:
        path_to_dir: directory to scan; sub-directories are not descended into.
        suffix: file-name ending to keep (the comparison is case-sensitive).

    Returns:
        Sorted list of paths, each one ``path_to_dir`` joined with a matching
        entry name.
    """
    # os.listdir returns entries in arbitrary order; sort so the result is
    # reproducible. os.path.join avoids a doubled separator when path_to_dir
    # already ends with one.
    return [
        os.path.join(path_to_dir, filename)
        for filename in sorted(os.listdir(path_to_dir))
        if filename.endswith(suffix)
    ]

# Add the CSVs found in each configured local directory to the test list.
for directory in directories:
    # a path that only exists on someone else's machine should not stop the run
    if not os.path.isdir(directory):
        print(f'Skipping {directory}: not a directory on this machine')
        continue
    dir_files = find_csv_filenames(directory)
    files = files + dir_files

# Drop repeated entries while keeping first-seen order, so the same file is
# never tested twice and the order is the same on every run. This also makes
# the cell safe to re-run.
files = list(dict.fromkeys(files))

#### Execute

It is totally okay (and realistic) to do other things on your computer while this runs. The notebook records system stats such as available memory alongside each result, so this shouldn't be a problem.

In [None]:
# Benchmark loop: for every file, load it, then time DataFrame.to_csv() at many
# different chunk sizes and append the timings (plus the file/host stats) to
# result_file.

# configs (do not need to change)
numbers = range(50,150)   # pool from which the number of tests per file is drawn
temp_file = 'temp.csv'    # scratch file that every timed export overwrites
counter=0                 # how many files have had their results written so far

for file in files:

    # randomize how many tests per file
    tests_per_file = random.choice(numbers)

    # read in data to dataframe
    print(f'\rRunning {tests_per_file} tests on {str(file)} >> importing...', end="")
    try:
        indata = pd.read_csv(file, low_memory=False)
    except Exception as exc:
        # A dead URL or an unparsable/oversized file must not abort a run that
        # takes hours: report it on its own line and carry on with the next file.
        print(f'\rSkipping {str(file)} >> could not be loaded ({exc})')
        continue

    # Too few rows for the forced chunk sizes (which go up to 500): skip this
    # file only. `continue`, not `break` -- the remaining files still need testing.
    if indata.shape[0] <= 500:
        del indata
        continue

    tests = []

    # only get stats after data is loaded
    stats = get_stats(indata)

    # calculate increment for chunksize based on number of tests
    max_chunk = min(300000,indata.shape[0])
    min_chunk = 100
    # never let the step reach 0, which range() rejects
    increment = max(1, (max_chunk - min_chunk) // tests_per_file)

    # force test specific sizes which end up being popular
    forcesizes = [100,150,200,250,300,350,400,500]
    randsizes = list(range(min_chunk, max_chunk, increment))

    # randomize order of tests
    sizes = list(set(forcesizes + randsizes))
    random.shuffle(sizes)

    # run all tests
    for chnksz in sizes:
        print(f'\rRunning {tests_per_file} tests on {str(file)} >> Chunksize: {chnksz}', end="")
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # durations; the wall clock can jump while a test is running
        start = time.perf_counter()
        indata.to_csv(temp_file, index=False, chunksize=chnksz)
        elapsed = time.perf_counter() - start
        tests.append({'chunksize': chnksz, 'duration':elapsed})

    # per file - merge all test-results, write to file
    a = pd.DataFrame(tests)
    b = pd.DataFrame.from_records([stats])
    # cross join: the single stats row is repeated on every timing row
    temp = a.merge(b, how='cross')
    if counter==0:
        # first file of this run: create the results file with a header row
        temp.to_csv(result_file, header=True, index=False)
    else:
        temp.to_csv(result_file, mode='a', header=False, index=False)
    counter+=1

    # cleanup in case python cheats on subsequent runs
    del a
    del b
    del temp
    del indata

    print("\r" + " "*300, end="", flush=True)

# the scratch file holds a full copy of the last dataset exported; remove it
if os.path.exists(temp_file):
    os.remove(temp_file)
print("\rDONE!" + " "*300, end="", flush=True)

Running 118 tests on https://s3.amazonaws.com/datarobot_public_datasets/AirFuel_mix_train_80.csv >> importing...                                                                                                                                                                                            