# Create the input for the model

In [20]:
import pandas as pd
import json
import itertools
import math
import numpy as np

## 1. Load and prepare the actionshistory json

In [14]:
# Load the actionshistory
data = pd.read_json('/nfshome/llayer/data/actionshistory.json', orient='index')

# Rest index
data_index_reset = data.reset_index()
data_index_reset = data_index_reset.rename(columns={'index': 'task_name'})

### Add informations about sites and error codes

In [15]:
# Add column with list of exit code for good/bad sites
def exit_codes(row, att):
     sites = row[att]
     return [ site for site in sites ] if len(sites.keys()) != 0 else ['0']

data_index_reset['errors_good_sites_exit_codes'] = data_index_reset['errors'].apply(lambda x: exit_codes(x, 'good_sites'))
data_index_reset['errors_bad_sites_exit_codes'] = data_index_reset['errors'].apply(lambda x: exit_codes(x, 'bad_sites')) 

def list_of_sites(row, att):
    sites = row[att].values()
    sites_list = []
    if len(sites) != 0:
        sites_list = [item.keys() for item in sites]
    else:
        sites_list = ['NA']
    return sites

# Add column of list of good/bad sites
data_index_reset['errors_good_sites_list'] = data_index_reset['errors'].apply(lambda x:  list_of_sites(x, 'good_sites'))

data_index_reset['errors_bad_sites_list'] = data_index_reset['errors'].apply(lambda x:  list_of_sites(x, 'bad_sites'))

In [16]:
# Exit code based on good_sites
list2d = data_index_reset['errors_good_sites_exit_codes'].tolist()
good_sites_exit_codes = sorted(set(list(itertools.chain.from_iterable(list2d))),key=int)
good_sites_exit_codes = [str(x) for x in good_sites_exit_codes]

# Extract exit code based on bad_sites
list2d = data_index_reset['errors_bad_sites_exit_codes'].tolist()
bad_sites_exit_codes = sorted(set(list(itertools.chain.from_iterable(list2d))),key=int)
bad_sites_exit_codes = [str(x) for x in bad_sites_exit_codes]

# Extract site names from good sites 
list2d_step1 = data_index_reset['errors_good_sites_list'].tolist()
list2d_step2 = list(itertools.chain.from_iterable(list2d_step1))
good_site_names = sorted(set(list(itertools.chain.from_iterable(list2d_step2))))
good_site_names = [x.encode('utf-8') for x in good_site_names]

# Extract site names from bad sites
list2d_step1 = data_index_reset['errors_bad_sites_list'].tolist()
list2d_step2 = list(itertools.chain.from_iterable(list2d_step1))
bad_site_names = sorted(set(list(itertools.chain.from_iterable(list2d_step2))))
bad_site_names = [x.encode('utf-8') for x in bad_site_names]

### Add the labels

In [17]:
def xrootd_fnc(x, column):
    # if isinstance(x.keys(), dict): 
    if column in x.keys():
        return str(x[column])

    else:
        return str('NaN')


def splitting_fnc(x, column):
    if column in x.keys():
        return str(x[column])

    else:
        return str('1x')


def merge_labels(x, features):
    merged_label = '_'.join(x[features])

    return merged_label

# Add column with splitting categorical levels
data_index_reset['splitting'] = data_index_reset['parameters'].apply(lambda x:
                                                                     splitting_fnc(x,
                                                                                'splitting'))

splitting_categories = sorted(list(set(data_index_reset['splitting'])))
print(splitting_categories)

# Encode splitting categorical levels
data_index_reset['splitting_encoded'] = data_index_reset['splitting'].astype(pd.api.types.CategoricalDtype(categories =
                                                                                                           splitting_categories)).cat.codes

# Set 'action' as the target
data_index_reset['action'] = data_index_reset['parameters'].apply(lambda x: x['action'])

action_categories = sorted(list(set(data_index_reset['action'])))
print(set(data_index_reset['action']))
print(set(data_index_reset['splitting']))

# Encode action categorical levels
data_index_reset['action_encoded'] =  data_index_reset['action'].astype(pd.api.types.CategoricalDtype(categories =
                                                                                                      action_categories)).cat.codes

data_index_reset['action_encoded'].value_counts()



# Target categorical levels
data_index_reset['target_label'] = data_index_reset.apply(lambda x:
                                                          merge_labels(x,
                                                                       ['action',
                                                                       'splitting']),
                                                          axis=1)

print(set(data_index_reset['target_label']))
print(len(set(data_index_reset['target_label'])))
print(data_index_reset['target_label'].value_counts())
target_categories = sorted(list(set(data_index_reset['target_label'])))
print(target_categories)

# Encode target categorical levels
data_index_reset['target_encoded'] = data_index_reset['target_label'].astype(pd.api.types.CategoricalDtype(categories =
                                                                                                           target_categories)).cat.codes

data_index_reset['target_encoded'].value_counts()

labels = list(set(data_index_reset['target_encoded'].tolist()))
print(labels)

# create a binary classificaiton column
data_index_reset['action_label'] = data_index_reset['action'].apply(lambda x: 'acdc' if x == 'acdc' else 'non_acdc')

['100x', '10x', '1x', '20x', '2x', '3x', '50x', 'max']
set([u'by-pass', u'clone', u'acdc', u'on-hold'])
set(['max', '3x', '20x', '100x', '2x', '10x', '50x', '1x'])
set([u'acdc_max', u'clone_max', u'acdc_20x', u'clone_1x', u'acdc_1x', u'by-pass_1x', u'acdc_100x', u'acdc_3x', u'acdc_2x', u'clone_3x', u'on-hold_1x', u'acdc_50x', u'clone_2x', u'clone_10x', u'acdc_10x'])
15
acdc_1x       23293
clone_1x       1326
acdc_10x        222
on-hold_1x      134
acdc_2x          29
acdc_20x         20
acdc_max         16
clone_10x        12
acdc_100x        12
by-pass_1x        9
clone_max         6
clone_3x          5
acdc_50x          3
acdc_3x           2
clone_2x          1
Name: target_label, dtype: int64
[u'acdc_100x', u'acdc_10x', u'acdc_1x', u'acdc_20x', u'acdc_2x', u'acdc_3x', u'acdc_50x', u'acdc_max', u'by-pass_1x', u'clone_10x', u'clone_1x', u'clone_2x', u'clone_3x', u'clone_max', u'on-hold_1x']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


## 2. Load and prepare the w2v file

In [41]:
w2v = pd.read_csv('/nfshome/llayer/data/df_word2vec_exitcodes.csv')

In [42]:
# Convert the word vectors from string back to float
def str_to_float(row):
    log_msg = row['w2v']
    msg = list(np.float_(log_msg.replace('[','').replace(']', '').split(',')))
    return msg
w2v['w2v'] = w2v.apply(str_to_float, axis=1)

In [46]:
# Create lists with the error, site, message per taskname
w2v_list = w2v.groupby(['task_name'], 
                  as_index=False)['error', 'site', 'w2v'].agg(lambda x: list(x))

In [47]:
w2v_list.head()

Unnamed: 0,task_name,error,site,w2v
0,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,[85],[T1_UK_RAL],"[[-0.838090360204, 0.457981392001, 0.254296811..."
1,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,[50664],[T2_DE_RWTH],"[[-0.182677972648, 0.899555946735, 0.567435338..."
2,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,[50664],[T2_DE_RWTH],"[[-0.182677972648, 0.899555946735, 0.567435338..."
3,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,"[99400, 50664]","[NoReportedSite, T2_DE_RWTH]","[[-0.484312460839, -0.207195970489, 0.34691437..."
4,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,[50664],[T2_DE_RWTH],"[[-0.182677972648, 0.899555946735, 0.567435338..."


## 3. Merge the two frames and build the matrices

In [48]:
df = pd.merge( data_index_reset, w2v_list, on = ['task_name'], how='inner')

In [49]:
test = df[:100]

In [50]:
unique_sites = list(set(good_site_names + bad_site_names)) 
unique_codes = list(set(good_sites_exit_codes + bad_sites_exit_codes))
dim_w2v = len(w2v['w2v'][0])

In [68]:
# Build the site-error matrix table
def build_table(row):
    
    errors = row['errors']
    sites_good = errors['good_sites'] 
    sites_bad = errors['bad_sites']
    log_sites = row['site']
    log_errors = row['error']
    log_msg = row['w2v']
    
    # Create df
    sparse_df = pd.DataFrame(columns=unique_sites, index=unique_codes).fillna(value=0).sort_index()
    sparse_df = sparse_df.astype(object)

    # Add exit code
    # Good sites
    for exit_code, site_dict in zip(sites_good.keys(), sites_good.values()):
        for site, count in site_dict.items():
            #print site, exit_code, count
            #print site, count
            sparse_df.at[str(exit_code), str(site)] = 0 if math.isnan(count) else count
    # Bad sites
    for exit_code, site_dict in zip(sites_bad.keys(), sites_bad.values()):
        for site, count in site_dict.items():
            #print site, exit_code, count
            #print site, count
            sparse_df.at[str(exit_code), str(site)] = 0 if math.isnan(count) else count
    
    # Add word vectors
    if isinstance(log_sites, (list,)):
    #if not math.isnan(log_sites):

        for i in range(len(log_sites)):
            #print i, log_errors[i], log_sites[i]
            #msg = list(np.float_(log_msg[i].replace('[','').replace(']', '').split(',')))
            #msg = [1,2,3]
            #print msg
            if log_sites[i] == 'NoReportedSite':
                continue
            count = sparse_df.at[str(log_errors[i]), str(log_sites[i])] 
            if count != 0:
                print log_msg[i]
                count_msg = [count] + log_msg[i]
                sparse_df.at[str(log_errors[i]), str(log_sites[i])] = count_msg
    
    return sparse_df

In [69]:
test['table'] = test.apply(build_table, axis=1)

[-0.83809036020360972, 0.45798139200082133, 0.25429681182067421, 0.12761653222876237, -0.013735717061174463, -0.35113876559245533, -0.24157938736424658, 0.16293029183186744, 0.063340570351447925, -0.14094460037805792]
[-0.83809036020360972, 0.45798139200082133, 0.25429681182067421, 0.12761653222876237, -0.013735717061174463, -0.35113876559245533, -0.24157938736424658, 0.16293029183186744, 0.063340570351447925, -0.14094460037805792]
[-0.18267797264787886, 0.89955594673476835, 0.5674353383688463, -0.93891632915646939, -1.426904464761416, -0.54681482110862378, -1.1454847518492628, 0.58223614900338427, -1.2735912077887741, -0.79780025014446831]
[-0.18267797264787886, 0.89955594673476835, 0.5674353383688463, -0.93891632915646939, -1.426904464761416, -0.54681482110862378, -1.1454847518492628, 0.58223614900338427, -1.2735912077887741, -0.79780025014446831]
[-0.18267797264787886, 0.89955594673476835, 0.5674353383688463, -0.93891632915646939, -1.426904464761416, -0.54681482110862378, -1.1454847

[-0.18267797264787886, 0.89955594673476835, 0.5674353383688463, -0.93891632915646939, -1.426904464761416, -0.54681482110862378, -1.1454847518492628, 0.58223614900338427, -1.2735912077887741, -0.79780025014446831]
[-0.65463318832803008, 0.56452703389515679, -0.21408508736274992, -0.45835343649111737, -0.2883716815857375, 0.16659419345699883, -0.30114867220164027, 0.097236801090070463, 0.16536376571619055, 0.078063654734637417]
[-0.63028578635337007, 0.2315636542904326, -0.16617512184014793, -0.25931931851946477, -0.32076792354347911, 0.031992767072492344, -0.47973407342488694, 0.17044331432446874, 0.16711570585625476, -0.32217275945899582]
[-0.67737252071700593, 0.20907077368928823, -0.1749288940162281, -0.23071637994492661, -0.26704122661055252, 0.053233127039996629, -0.42927052955349498, 0.17437741590414929, 0.24801052319587205, -0.27421809686347842]
[-0.68926234289205501, 0.37622475731112653, -0.28247135718545269, -0.44793038350189962, -0.34691809760258785, 0.11748277204377311, -0.31

[-0.67341275258457067, 0.38967273840450356, 0.26262239778871804, 0.011979243412034671, 0.12680785513282244, -0.28102903848007166, -0.12704076612361645, 0.15863316181007206, 0.039499861607410328, -0.13707881170413835]
[-0.66917897635398227, 0.37125290933504906, 0.30580207561110401, 0.013768099513577504, 0.19985741240412203, -0.22789065493270755, -0.10417748914915137, 0.13942951031236184, -0.012078449161991658, -0.11439931264289448]
[-0.67484100075024689, 0.37811078020604327, 0.31130360066729029, 0.027972546619518347, 0.18386494149065885, -0.23505112825013283, -0.096129227271174178, 0.1450916952259528, 0.002985654470345859, -0.10190258522763744]
[-0.64325494602972633, 0.36599931564032029, 0.30815870530322043, 0.0060296177976583475, 0.20776478531176612, -0.2277378441528386, -0.10407683350483343, 0.1437229728013677, -0.010283303662709542, -0.11038487848681991]
[-0.68713482167128914, 0.38808390688984751, 0.31140285107580673, 0.032383130570235369, 0.18519061611691764, -0.23902541893970716, -

[-0.71998652287004095, 0.34676738075449381, -0.28841775443003165, -0.44222898073372696, -0.30690593404232774, 0.1190070289314962, -0.25114868997803164, 0.13687053354349737, 0.29814597490331879, -0.047735598598746889]
[0.22469063763316996, 0.64128938439319394, 0.7502051536645955, -0.54030276593079585, -2.4182247585461782, 0.55982992971419454, 0.73598074617998288, 0.55406582370805724, 0.068486221827764954, -0.56610900914960505]
[-0.45697700578236472, 0.22791828006781914, 0.1928044660586036, -0.097640041275708772, 0.19489439750711124, 0.033715042471885684, -0.017630073654085952, 0.11642268276600927, 0.031270118491913849, -0.06569773830544566]
[-0.42363835422107982, 0.24017070212080191, 0.23934305425188163, -0.093066001622467087, 0.23774127591440278, 0.04479065086150652, 0.029961982516422608, 0.10150252780082022, -0.0056249401138654103, -0.022052144626875891]
[-0.690650380996362, 0.49597293165295497, -0.2543219537793161, -0.42372331932212592, -0.25860823851959036, 0.14358609042143167, -0.2

[-0.69175365998026594, 0.48672605270902947, -0.33923495866041231, -0.42307416696418471, -0.31326410388706105, 0.12417502400749329, -0.30095952235043366, 0.1504230851392965, 0.27319969662633586, 0.007975216528915979]
[-0.60100858858335882, 0.54364625347718121, -0.31222467733727349, -0.47602650629536758, -0.23517137412883102, 0.14576161923788247, -0.30421840995249727, 0.043212354147709124, 0.24422986782569017, -0.0033002952864814188]
[-0.72248236941194788, 0.15026269753387056, -0.30370216914083253, -0.45292585195904922, -0.29177928860259716, 0.098125987045108096, -0.30192597013193767, 0.15661889804964463, 0.33362373995209488, -0.26000274288555131]
[-0.5921013089813123, 0.56657869333829314, -0.32242682090324493, -0.49401588941107483, -0.22318437174987352, 0.14742698178032843, -0.32238876135736511, 0.048774651460263663, 0.25756857902207941, -0.011116542521876232]
[-0.60099407702600549, 0.54543557957150157, -0.32532837952358401, -0.48119545055379631, -0.22662985543166422, 0.1445044874126792

[0.043849116398228534, 0.54528665211465621, -0.71334980428218842, -0.85085823138554884, -0.35693239669005072, 0.0080451731466584728, -0.06560330672396554, -0.032848467041427888, 0.19904841524031425, -0.4116169942749871]
[0.061748326889106205, 0.46290815940925051, -0.45653192379644936, -0.76696251119886116, -0.5000431218317577, -0.29993116855621338, -0.132806444806712, 0.14074699101703506, 0.27048619517258232, -0.56433262569563725]
[0.061748326889106205, 0.46290815940925051, -0.45653192379644936, -0.76696251119886116, -0.5000431218317577, -0.29993116855621338, -0.132806444806712, 0.14074699101703506, 0.27048619517258232, -0.56433262569563725]
[-0.13244140731251758, 0.92819768554814486, 0.55737354812900652, -1.0857878298215244, -1.5456242068954136, -0.49252902003734006, -1.2420262623092402, 0.69969310611486435, -1.3670472774816596, -0.82342020576090913]
[-0.51596337073350407, 0.3847518145105196, -0.34596136857358978, -0.1679067624911125, 0.22087283576139413, 0.095617211822623227, -0.0021

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [70]:
pd.set_option('display.max_columns', 500)
test['table'].iloc[5]

Unnamed: 0,Unknown,T3_US_PuertoRico,T2_FI_HIP,T2_UK_SGrid_RALPP,T2_FR_GRIF_LLR,T1_US_FNAL_MSS,T2_UK_London_IC,T3_UK_London_QMUL,T3_US_PSC,T0_CH_CERN_Disk,T2_KR_KNU,T2_RU_SINP,T1_UK_RAL_ECHO_Disk,T3_US_UMD,T3_US_Colorado,T2_CH_CSCS_HPC,T1_UK_RAL_Disk,T3_IT_Napoli,T2_TH_CUNSTDA,T3_US_Kansas,T1_IT_CNAF_Disk,T2_IT_Bari,T3_US_HEPCloud,T2_US_UCSD,T2_RU_IHEP,T3_US_Vanderbilt_EC2,T1_RU_JINR,T2_CH_CERN,T3_BY_NCPHEP,T1_US_FNAL_Disk,T3_US_UCR,T3_TW_NCU,T1_IT_CNAF_MSS,T2_CH_CSCS,T2_UA_KIPT,T2_PK_NCP,T2_RU_PNPI,T3_US_Cornell,T3_UK_London_UCL,T3_US_UCD,T3_CO_Uniandes,T3_KR_KNU,T2_FR_IPHC,T3_US_OSU,T3_GR_IASA_GR,T3_US_TAMU,T1_US_FNAL,T3_IT_Trieste,T2_IT_Rome,T2_UK_London_Brunel,T3_IN_PUHEP,T3_US_OSG,T2_US_Vanderbilt,T2_EE_Estonia,T2_IN_TIFR,T2_CN_Beijing,T1_RU_JINR_MSS,T2_US_Florida,T3_CH_CERN_HelixNebula,T3_GR_IASA_HG,T3_US_TACC,T3_TW_NTU_HEP,T3_US_Rutgers,T1_DE_KIT,T2_US_Wisconsin,T2_HU_Budapest,T2_DE_RWTH,T3_US_Omaha,T3_IT_Perugia,T3_UK_SGrid_Oxford,T3_US_Princeton_ICSE,T3_US_NU,T3_US_UMiss,T2_BR_UERJ,T3_MX_Cinvestav,T3_US_FNALLPC,T1_ES_PIC_MSS,T3_RU_FIAN,T0_CH_CERN_MSS,T3_CH_CERN_DOMA,T2_ES_IFCA,T3_US_UCSB,T3_US_NERSC,T3_ES_Oviedo,T3_US_NotreDame,T2_DE_DESY,T1_UK_RAL,T3_HU_Debrecen,T2_US_Caltech,T3_FR_IPNL,T0_CH_CSCS_HPC,T3_BG_UNI_SOFIA,T3_UK_London_RHUL,T1_UK_RAL_MSS,T3_IN_TIFRCloud,T0_CH_CERN,T1_RU_JINR_Disk,T3_CN_PKU,T3_US_Baylor,T2_US_Nebraska,T2_ES_CIEMAT,T1_FR_CCIN2P3_Disk,T2_KR_KISTI,T3_US_FSU,T3_KR_UOS,T2_BR_SPRACE,T1_ES_PIC_Disk,T1_IT_CNAF,T2_CH_CERNBOX,T1_FR_CCIN2P3_MSS,T2_TR_METU,T2_AT_Vienna,T2_US_Purdue,T3_US_Rice,T2_TW_NCHC,T2_US_MIT,T2_BE_UCL,NoReportedSite,null,T2_UK_SGrid_Bristol,T2_PT_NCG_Lisbon,T1_ES_PIC,T3_US_JHU,T2_IT_Legnaro,T3_CH_CERN_HelixNebula_REHA,T1_DE_KIT_Disk,T2_RU_INR,T3_US_SDSC,T3_US_FIU,T2_RU_JINR,T3_US_FIT,T2_IT_Pisa,T2_GR_Ioannina,T3_UK_ScotGrid_GLA,T3_US_MIT,T2_CH_CERN_HLT,T2_MY_UPM_BIRUNI,T0_CH_CERN_Export,T1_FR_CCIN2P3,T2_FR_GRIF_IRFU,T3_US_NERSC.1,T2_FR_CCIN2P3,T2_PL_Warsaw,T3_CH_Volunteer,T3_US_TTU,T1_DE_KIT_MSS,T2_BE_IIHE,T2_RU_ITEP,T2_PL_Swierk,T3_CH_PSI,T3_IT_Bologna
-1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
132,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
134,0,0,0,0,"[2, -0.743079271049, -0.229273128132, -0.33313...",0,"[4, -0.778805975849, -0.229366872954, -0.34884...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[1, -0.813795983991, 0.398696258056, 0.2636869...",0,0,0,0,"[3, -0.75917549747, 0.385182945295, 0.20744927...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[2, -0.739731039565, 0.472250420695, 0.1748415...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
137,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 4. Flatten the matrices

In [71]:
def flatten(x):
    flat_matrix = []

    for column in x: # 60 columns (i.e. sites)
        flat_site = []
        for item in x[column]:

            if isinstance(item, (list,)) == False:
                dummy = [item] + [0] * (dim_w2v)
                #if item != 0:
                #    print dummy
            flat_site = flat_site + dummy
        flat_matrix.append(flat_site)
    return flat_matrix

In [72]:
# Flatten good/bad site features
test['table_flattened'] = test['table'].apply(lambda x: flatten(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [81]:
input_ml = test[['task_name', 'table_flattened', 'splitting', 'splitting_encoded', 'action', 'action_encoded', 'target_label', 'target_encoded', 'action_label']]

In [82]:
input_ml.head()

Unnamed: 0,task_name,table_flattened,splitting,splitting_encoded,action,action_encoded,target_label,target_encoded,action_label
0,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1x,2,acdc,0,acdc_1x,2,acdc
1,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1x,2,acdc,0,acdc_1x,2,acdc
2,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1x,2,acdc,0,acdc_1x,2,acdc
3,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1x,2,acdc,0,acdc_1x,2,acdc
4,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1x,2,acdc,0,acdc_1x,2,acdc


In [83]:
input_ml.to_hdf('/bigdata/shared/AIErrorHandling/test.h5', 'test')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['task_name', 'table_flattened', 'splitting', 'action', 'target_label', 'action_label']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
