In [18]:
import pandas as pd
import random
import numpy as np

### ipm_provinsi_indonesia

In [62]:
# Generate meta
def meta_gen_ipm_provinsi_indonesia(ds, l0_prob = 0.99, l1_prob = 0.5, annotator = 'Zaki', id = 'ds0', tablename = 'default', verbose = 0):
    questions = []
    answers = []
    positions = []
    
    pos = 0
    for prov in ds.index:
      if random.uniform(0,1) > l0_prob:
         if verbose > 0:
            print('Not sampled:', prov)
         continue
      #=========== Generate level 0 question-answer set ===========#
      # generate the IPM value since 2010 as a list
      res = []
      for i in range(13):
        res.append(ds.loc[prov,str(i+2010)])

      # randomly select one of these question sentences
      rand = random.uniform(0,1)
      if rand < 0.3:
        qu = 'what is the IPM of '+prov+' since 2010?'
      elif rand < 0.7:
        qu = 'from 2010 what is the IPM of '+prov+'?'
      else:
        qu = 'what is the IPM of '+prov+'?'
      
      # Save result, append 
      questions.append(qu)
      answers.append(res)
      positions.append(pos)

      pos = 1
      for year in ds.columns:
        if random.uniform(0,1) > l1_prob:
           if verbose > 1:
              print('Not sampled:', prov, 'on', year)
           continue
        #=========== Generate level 1 question-answer set ===========#
        
        # randomly select one of these question sentences
        rand = random.uniform(0,1)
        if rand < 0.3:
            qu = 'what is the IPM in '+year+'?'
            res = [str(ds.loc[prov,str(year)])]
        elif rand < 0.5:
            qu = 'what is the IPM of '+prov+' in '+year+'?'
            res = [str(ds.loc[prov,str(year)])]
        elif rand < 0.8:
            qu = 'the IPM in '+year+'?'
            res = [str(ds.loc[prov,str(year)])]
        elif rand < 0.9:
           # Special case, highest value
           qu = 'the highest IPM in '+prov+'?'
           res = [np.max(ds.loc[prov,:])]
        else:
          # Special case, lowest value
           qu = 'the lowest IPM in '+prov+'?'
           res = [np.min(ds.loc[prov,:])]
           
        # Save result, append
        questions.append(qu)
        answers.append(res)
        positions.append(pos)
    result = pd.DataFrame({
       'id' : id,
       'annotator' : annotator,
       'tablename' : tablename,
       'position' : positions,
       'question' : questions,
       'answer' : answers
    })
    return(result)

tablename = 'ipm_provinsi_indonesia'
ds = pd.read_csv('table_csv/ipm_provinsi_indonesia.csv', sep = ';')
ds.set_index('Provinsi', inplace = True)
ds = ds.astype('str')

result = meta_gen_ipm_provinsi_indonesia(ds = ds, l0_prob = 0.5, verbose = 2, tablename=tablename, id = 'ds0001')
fileloc = 'meta_csv/meta_'+tablename+'.csv'
result.to_csv(fileloc, sep = ';', index = False)
print('Total question-answer:', result.shape[0])

Not sampled: ACEH
Not sampled: SUMATERA UTARA on 2022
Not sampled: SUMATERA UTARA on 2021
Not sampled: SUMATERA UTARA on 2020
Not sampled: SUMATERA UTARA on 2018
Not sampled: SUMATERA UTARA on 2015
Not sampled: SUMATERA UTARA on 2012
Not sampled: SUMATERA UTARA on 2011
Not sampled: SUMATERA UTARA on 2010
Not sampled: SUMATERA BARAT on 2019
Not sampled: SUMATERA BARAT on 2018
Not sampled: SUMATERA BARAT on 2017
Not sampled: SUMATERA BARAT on 2012
Not sampled: SUMATERA BARAT on 2010
Not sampled: RIAU on 2022
Not sampled: RIAU on 2020
Not sampled: RIAU on 2017
Not sampled: RIAU on 2015
Not sampled: RIAU on 2014
Not sampled: RIAU on 2013
Not sampled: RIAU on 2012
Not sampled: RIAU on 2011
Not sampled: JAMBI on 2022
Not sampled: JAMBI on 2021
Not sampled: JAMBI on 2020
Not sampled: JAMBI on 2019
Not sampled: JAMBI on 2017
Not sampled: JAMBI on 2015
Not sampled: JAMBI on 2014
Not sampled: JAMBI on 2013
Not sampled: JAMBI on 2011
Not sampled: JAMBI on 2010
Not sampled: SUMATERA SELATAN on 202

### inflasi_kota_indonesia

In [63]:
# Generate meta
def meta_gen_inflasi_kota_indonesia(ds, l0_prob = 0.99, l1_prob = 0.5, annotator = 'Zaki', id = 'ds0', tablename = 'default', verbose = 0):
    questions = []
    answers = []
    positions = []
    
    pos = 0
    for idx in ds.index:
      if random.uniform(0,1) > l0_prob:
         if verbose > 0:
            print('Not sampled:', idx)
         continue
      #=========== Generate level 0 question-answer set ===========#
      # generate the inflation rate value since 2020 as a list
      res = []
      for i in range(ds.shape[1]):
        res.append(ds.loc[idx,str(i+2020)])

      # randomly select one of these question sentences
      rand = random.uniform(0,1)
      if rand < 0.3:
        qu = 'what is the inflation rate of '+idx+' since 2020?'
      elif rand < 0.5:
        qu = 'from 2020 what is the inflation number of '+idx+'?'
      elif rand < 0.7:
        qu = 'since 2020 what is the inflation rate of '+idx+'?'
      else:
        qu = 'what is the inflation rate of '+idx+'?'
      
      # Save result, append 
      questions.append(qu)
      answers.append(res)
      positions.append(pos)

      pos = 1
      for col in ds.columns:
        if random.uniform(0,1) > l1_prob:
           if verbose > 1:
              print('Not sampled:', idx, 'on', col)
           continue
        #=========== Generate level 1 question-answer set ===========#
        res = [str(ds.loc[idx,str(col)])]
        
        # randomly select one of these question sentences
        rand = random.uniform(0,1)
        if rand < 0.3:
            qu = 'what is the inflation rate in '+col+'?'
        elif rand < 0.7:
            qu = 'what is the inflation of '+idx+' in '+col+'?'
        else:
            qu = 'the inflation rate in '+col+'?'
        
        # Save result, append
        questions.append(qu)
        answers.append(res)
        positions.append(pos)
    result = pd.DataFrame({
       'id' : id,
       'annotator' : annotator,
       'tablename' : tablename,
       'position' : positions,
       'question' : questions,
       'answer' : answers
    })
    return(result)

tablename = 'inflasi_kota_indonesia'
ds = pd.read_csv('table_csv/inflasi_kota_indonesia.csv', sep = ';')
ds.set_index('kota', inplace = True)
ds = ds.astype('str')

result = meta_gen_inflasi_kota_indonesia(ds = ds, l0_prob = 0.90, verbose = 2, tablename=tablename, id = 'ds0002')
fileloc = 'meta_csv/meta_'+tablename+'.csv'
result.to_csv(fileloc, sep = ';', index = False)
print('Total question-answer:', result.shape[0])

Not sampled: KOTA MEULABOH on 2021
Not sampled: KOTA LHOKSEUMAWE on 2022
Not sampled: KOTA LHOKSEUMAWE on 2021
Not sampled: KOTA SIBOLGA on 2021
Not sampled: KOTA SIBOLGA on 2020
Not sampled: KOTA PEMATANG SIANTAR on 2022
Not sampled: KOTA MEDAN on 2022
Not sampled: KOTA PADANGSIDIMPUAN on 2021
Not sampled: KOTA PADANGSIDIMPUAN on 2020
Not sampled: KOTA GUNUNGSITOLI on 2020
Not sampled: KOTA PADANG on 2022
Not sampled: KOTA BUKITTINGGI on 2021
Not sampled: TEMBILAHAN
Not sampled: KOTA PEKANBARU on 2022
Not sampled: KOTA PEKANBARU on 2021
Not sampled: KOTA PEKANBARU on 2020
Not sampled: KOTA DUMAI on 2022
Not sampled: BUNGO on 2022
Not sampled: BUNGO on 2020
Not sampled: KOTA JAMBI on 2022
Not sampled: KOTA JAMBI on 2021
Not sampled: KOTA PALEMBANG on 2022
Not sampled: KOTA PALEMBANG on 2020
Not sampled: KOTA LUBUKLINGGAU on 2021
Not sampled: KOTA LUBUKLINGGAU on 2020
Not sampled: KOTA BENGKULU on 2022
Not sampled: KOTA BENGKULU on 2021
Not sampled: KOTA BENGKULU on 2020
Not sampled: KO

### angka_harapan_hidup_provinsi_indonesia

In [59]:
# Generate meta
def meta_gen_ahh_provinsi_indonesia(ds, l0_prob = 0.99, l1_prob = 0.5, annotator = 'Zaki', id = 'ds0', tablename = 'default', verbose = 0):
    questions = []
    answers = []
    positions = []
    
    pos = 0
    for it, _ in enumerate(ds.index):
      idx = ds.loc[it,'Provinsi']
      if random.uniform(0,1) > l0_prob:
         if verbose > 0:
            print('Not sampled:', idx)
         continue
      #=========== Generate level 0 question-answer set ===========#
      # generate the AHH value since 2010 as a list
      res = []
      for i in range((ds.shape[1]-2)):
        res.append(ds.loc[it,str(i+2010)])

      # randomly select one of these question sentences
      rand = random.uniform(0,1)

      jk = ds.loc[it,'Jenis Kelamin']

      if rand < 0.3:
        qu = 'what is the life expectancy of '+jk+' living in '+idx+' since 2020?'
      elif rand < 0.5:
        qu = 'from 2020 what is the expected life of '+jk+' in '+idx+'?'
      elif rand < 0.7:
        qu = 'since 2020 what is the life expectancy for '+jk+' in '+idx+'?'
      else:
        qu = 'what is the '+jk+' life expectancy in '+idx+'?'
      
      # Save result, append 
      questions.append(qu)
      answers.append(res)
      positions.append(pos)

      pos = 1
      for col in ds.columns:
        if col == 'Jenis Kelamin':
           continue
        if random.uniform(0,1) > l1_prob:
           if verbose > 1:
              print('Not sampled:', idx, 'on', col)
           continue
        #=========== Generate level 1 question-answer set ===========#
        res = [str(ds.loc[it,str(col)])]
        
        # randomly select one of these question sentences
        rand = random.uniform(0,1)
        if rand < 0.3:
            qu = 'what is the life expectancy in '+col+' for '+jk+'?'
        elif rand < 0.7:
            qu = 'what is the life expectancy of '+jk+ ' in '+idx+' in '+col+'?'
        else:
            qu = jk+' life expectancy in '+col+'?'
        
        # Save result, append
        questions.append(qu)
        answers.append(res)
        positions.append(pos)
    result = pd.DataFrame({
       'id' : id,
       'annotator' : annotator,
       'tablename' : tablename,
       'position' : positions,
       'question' : questions,
       'answer' : answers
    })
    return(result)

tablename = 'angka_harapan_hidup_provinsi_indonesia'
ds = pd.read_csv('table_csv/angka_harapan_hidup_provinsi_indonesia.csv', sep = ';')
ds = ds.astype('str')

result = meta_gen_ahh_provinsi_indonesia(ds = ds, l0_prob = 0.90, verbose = 2, tablename=tablename, id = 'ds0003')
fileloc = 'meta_csv/meta_'+tablename+'.csv'
result.to_csv(fileloc, sep = ';', index = False)
print('Total question-answer:', result.shape[0])

Not sampled: ACEH on Provinsi
Not sampled: ACEH on 2022
Not sampled: ACEH on 2021
Not sampled: ACEH on 2019
Not sampled: ACEH on 2015
Not sampled: ACEH on 2014
Not sampled: ACEH on 2013
Not sampled: ACEH on 2011
Not sampled: SUMATERA UTARA
Not sampled: SUMATERA BARAT on 2020
Not sampled: SUMATERA BARAT on 2019
Not sampled: SUMATERA BARAT on 2018
Not sampled: SUMATERA BARAT on 2017
Not sampled: SUMATERA BARAT on 2016
Not sampled: SUMATERA BARAT on 2014
Not sampled: SUMATERA BARAT on 2010
Not sampled: RIAU
Not sampled: JAMBI on 2022
Not sampled: JAMBI on 2019
Not sampled: JAMBI on 2018
Not sampled: JAMBI on 2016
Not sampled: JAMBI on 2015
Not sampled: JAMBI on 2013
Not sampled: JAMBI on 2012
Not sampled: JAMBI on 2011
Not sampled: JAMBI on 2010
Not sampled: SUMATERA SELATAN on 2018
Not sampled: SUMATERA SELATAN on 2017
Not sampled: SUMATERA SELATAN on 2016
Not sampled: SUMATERA SELATAN on 2015
Not sampled: SUMATERA SELATAN on 2014
Not sampled: SUMATERA SELATAN on 2010
Not sampled: BENGKU

### check for dupes

In [56]:
ds = pd.read_csv('table_csv/angka_harapan_hidup_provinsi_indonesia.csv', sep = ';')
ds = ds.astype('str')

In [57]:
for id in ds.index:
    lis = []
    for col in ds.columns:
        if col == "Provinsi" or col =='Jenis Kelamin':
            continue
        
        
        lis.append(ds.loc[id,col])
    if len(set(lis)) < 13:
        print(ds.loc[id,'Provinsi'])