In [1]:
import pandas as pd
import numpy as np
import random
import os, glob

In [2]:
data_dir = '../data/'

# Extract information from filenames and save as table

In [3]:
# read the files
files = glob.glob(data_dir + '/**/*.txt', recursive=True)
files[:5]

['../data/data_states/massratio10/1032.txt',
 '../data/data_states/massratio10/54.txt',
 '../data/data_states/massratio10/856.txt',
 '../data/data_states/massratio10/377.txt',
 '../data/data_states/massratio10/64.txt']

In [4]:
# get mass ratios (divide by 10 to get true ratio)
massratios=[file.split('/')[-2][9:] for file in files]
massratios=[mr if mr=='inf' else mr[:-1] for mr in massratios]
massratios[:5]

['1', '1', '1', '1', '1']

In [5]:
# get state numbers
state_numbers = [int(file.split('/')[-1].split('.')[0]) for file in files]
state_numbers[:5]

[1032, 54, 856, 377, 64]

In [6]:
# assign an integrability label
def check_integrability(mass_ratio):
    if mass_ratio=='10' or mass_ratio == 'inf':
        return True
    else: 
        return False
is_integrable = [check_integrability(rat) for rat in massratios]
is_integrable[:5]

[False, False, False, False, False]

In [7]:
# coerce to data frame
df = pd.DataFrame({'path': files,
                   'mass_ratio': massratios,
                   'state_number': state_numbers,
                   'is_integrable': is_integrable}).\
sort_values(['mass_ratio', 'state_number']).\
reset_index(drop=True)

In [8]:
df

Unnamed: 0,path,mass_ratio,state_number,is_integrable
0,../data/data_states/massratio10/0.txt,1,0,False
1,../data/data_states/massratio10/1.txt,1,1,False
2,../data/data_states/massratio10/2.txt,1,2,False
3,../data/data_states/massratio10/3.txt,1,3,False
4,../data/data_states/massratio10/4.txt,1,4,False
...,...,...,...,...
4195,../data/data_states/massratioinf/1045.txt,inf,1045,True
4196,../data/data_states/massratioinf/1046.txt,inf,1046,True
4197,../data/data_states/massratioinf/1047.txt,inf,1047,True
4198,../data/data_states/massratioinf/1048.txt,inf,1048,True


# Get train/test split

In [9]:
test_ratio=0.15
test_num = int(test_ratio*df.shape[0])
print("Number of test samples:", test_num)
test_ix = random.sample(range(df.shape[0]), test_num)

Number of test samples: 630


In [10]:
# set the split variable
df["is_test"] = False
df.iloc[test_ix, -1]=True

In [11]:
df.sample(10)

Unnamed: 0,path,mass_ratio,state_number,is_integrable,is_test
1467,../data/data_states/massratio20/417.txt,2.0,417,False,True
2490,../data/data_states/massratio50/390.txt,5.0,390,False,False
43,../data/data_states/massratio10/43.txt,1.0,43,False,False
3665,../data/data_states/massratioinf/515.txt,inf,515,True,False
2785,../data/data_states/massratio50/685.txt,5.0,685,False,False
100,../data/data_states/massratio10/100.txt,1.0,100,False,False
193,../data/data_states/massratio10/193.txt,1.0,193,False,False
559,../data/data_states/massratio10/559.txt,1.0,559,False,True
2434,../data/data_states/massratio50/334.txt,5.0,334,False,False
2792,../data/data_states/massratio50/692.txt,5.0,692,False,False


In [12]:
df.value_counts('is_test')

is_test
False    3570
True      630
dtype: int64

In [13]:
df.to_csv(data_dir+'states.csv')

# Load the images

In [14]:
# this function 
def load_image_array(path):
    a = np.loadtxt(path, delimiter=' ')
    return a

In [15]:
images = df.path.apply(lambda x: np.loadtxt(x, delimiter=' '))

In [16]:
df['image'] = images

In [17]:
df.head()

Unnamed: 0,path,mass_ratio,state_number,is_integrable,is_test,image
0,../data/data_states/massratio10/0.txt,1,0,False,False,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,../data/data_states/massratio10/1.txt,1,1,False,False,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,../data/data_states/massratio10/2.txt,1,2,False,True,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,../data/data_states/massratio10/3.txt,1,3,False,False,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,../data/data_states/massratio10/4.txt,1,4,False,True,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


# Pickle it

In [18]:
df.to_pickle(data_dir+'states.pkl')