```C++
struct TAMessage {
    NasdaqSipHeader sipHeader;
    uint64_t        timestamp2;
    char            symbol[5];
    uint64_t        tradeId;
    uint16_t        price;
    uint16_t        volume;
    char            cond[4];
    char            tradeThrExempt;
    char            consPriceChangeInd;
    char            partPriceChangeInd;
};
```

```C++
class MessageA {
public:
    int         getA() const;
    float       getB() const;
    float       getC() const;
    float       getD() const;
    SecretClass getE() const;
    void superSecretDeserialize(ostream&);
};
```

Types.h
```C++
typedef float MyPrice;
struct Prices {
private:
    float low_;
    MyPrice high_;
public:
    Prices(float low = 0,MyPrice high = MyPrice()) : 
        low_(low), 
        high_(high) {}
    float getLow() const { return low_; }
    MyPrice getHigh() const { return high_; }
};
```

Messages.h
```C++
#include "Types.h"
class MessageA {
private:
    int a_;
    float b_;
public:
    MessageA(int a = 0,float b = 0): a_(a), b_(b) {}
    int getA() const { return a_; }
    float getB() const { return b_; }
};
class MessageB {
private:
    Prices prices_;
    MyPrice price_;
public:
    MessageB(Prices prices = Prices(), MyPrice price = MyPrice()): 
        prices_(prices),
        price_(price) {}
    Prices getPrices() const { return prices_; }
    MyPrice getPrice() const { return price_; }
};
```

```C++
class MessageA {
private:
    int a_;
    float b_;
public:
    MessageA(int a = 0,float b = 0): a_(a), b_(b) {}
    int getA() const { return a_; }
    float getB() const { return b_; }
};
```

```C++
class MessageB {
private:
    Prices prices_;
    MyPrice price_;
public:
    MessageB(Prices prices = Prices(), MyPrice price = MyPrice()): 
        prices_(prices),
        price_(price) {}
    Prices getPrices() const { return prices_; }
    MyPrice getPrice() const { return price_; }
};
```

In [1]:
from clang import cindex
from pprint import pprint
import json

In [2]:
index = cindex.Index.create()
tu = index.parse('Message.h',args=['-x','c++','-I.'])

In [3]:
def find_ast(ast, cond):
    """parse the AST and yield all cursors that meet condition"""
    if cond(ast):
        yield ast
    for c in ast.get_children():
        it = find_ast(c,cond)
        if it:
            for i in it:
                yield i

In [4]:
filter_class = lambda x: x.kind == cindex.CursorKind.CLASS_DECL and x.displayname .startswith('Message')
msgs = [m for m in find_ast(tu.cursor,filter_class)]

In [5]:
def parse_ast(cursor, recursive = True):
    """convert cursor to py obj"""
    res = {
        'displayname':cursor.displayname, 
        'kind':str(cursor.kind),
        'spelling':str(cursor.spelling),
        'access_specifier':str(cursor.access_specifier),
        'type':cursor.type.spelling,
        'is_pod' : str(cursor.type.is_pod()),
        'result_type':cursor.result_type.spelling,
        'result_type.kind' : str(cursor.result_type.kind),
        'result_type.is_pod' : str(cursor.result_type.is_pod()),
    }
    if recursive:
        res['children'] = [parse_ast(c) for c in cursor.get_children()]
    return res

In [6]:
pprint(parse_ast(msgs[0]),indent=0,width=20,compact=True,depth=2)

{'access_specifier': 'AccessSpecifier.INVALID',
'children': [{...},
            {...},
            {...},
            {...},
            {...},
            {...},
            {...}],
'displayname': 'MessageA',
'is_pod': 'False',
'kind': 'CursorKind.CLASS_DECL',
'result_type': '',
'result_type.is_pod': 'False',
'result_type.kind': 'TypeKind.INVALID',
'spelling': 'MessageA',
'type': 'MessageA'}


In [7]:
pprint(parse_ast(msgs[0])['children'][2],indent=0,width=20,compact=True,depth=2)

{'access_specifier': 'AccessSpecifier.PRIVATE',
'children': [],
'displayname': 'b_',
'is_pod': 'True',
'kind': 'CursorKind.FIELD_DECL',
'result_type': '',
'result_type.is_pod': 'False',
'result_type.kind': 'TypeKind.INVALID',
'spelling': 'b_',
'type': 'float'}


In [8]:
pprint(parse_ast(msgs[0])['children'][6],indent=0,width=20,compact=True,depth=2)

{'access_specifier': 'AccessSpecifier.PUBLIC',
'children': [{...}],
'displayname': 'getB()',
'is_pod': 'False',
'kind': 'CursorKind.CXX_METHOD',
'result_type': 'float',
'result_type.is_pod': 'True',
'result_type.kind': 'TypeKind.FLOAT',
'spelling': 'getB',
'type': 'float () '
        'const'}


In [9]:
pprint(parse_ast(msgs[1])['children'][1],indent=0,width=20,compact=True,depth=2)

{'access_specifier': 'AccessSpecifier.PRIVATE',
'children': [{...}],
'displayname': 'prices_',
'is_pod': 'False',
'kind': 'CursorKind.FIELD_DECL',
'result_type': '',
'result_type.is_pod': 'False',
'result_type.kind': 'TypeKind.INVALID',
'spelling': 'prices_',
'type': 'Prices'}


In [10]:
pprint(parse_ast(msgs[1])['children'][5],indent=0,width=20,compact=True,depth=2)

{'access_specifier': 'AccessSpecifier.PUBLIC',
'children': [{...},
            {...}],
'displayname': 'getPrices()',
'is_pod': 'False',
'kind': 'CursorKind.CXX_METHOD',
'result_type': 'Prices',
'result_type.is_pod': 'False',
'result_type.kind': 'TypeKind.RECORD',
'spelling': 'getPrices',
'type': 'Prices () '
        'const'}


In [11]:
def gen_types_schema(types_schema, pods, types, is_message = False):
    """ top level function to generate the internal schema. Finds all getters. Breaks down complex types
        types_schema - output - stores the final schema
        pods - output - plain old data types seen in code
        types - input - message types to generate schema for"""
    filter_getters = lambda x: x.kind == cindex.CursorKind.CXX_METHOD and x.spelling.startswith('get')
    for t in types:
        getters = [g for g in find_ast(t,filter_getters)]
        types_schema[t.spelling] = {}
        types_schema[t.spelling]['getters'] = [getter_schema(g) for g in getters]
        types_schema[t.spelling]['is_message'] = is_message
        for x in getters:
            break_down_type(types_schema, pods, x.result_type)

In [12]:
def getter_schema(cursor):
    """ returns schema for a getter"""
    res = {
        'displayname':cursor.displayname, 
        'spelling':str(cursor.spelling),
        'name':str(cursor.spelling.replace('get','')),
        'type':cursor.type.spelling,
        'result_type':cursor.result_type.spelling
    }
    return res

In [13]:
def is_pod(cursor):
    """ checks if type is plain old data (pod)"""
    return cursor.is_pod()
def break_down_type(types_schema, pods,x):
    """ generates schema for complex types, saves pods, will unwind typedefs"""
    if is_pod(x):
        pods[x.spelling] = x
    else:
        if x.get_declaration().kind == cindex.CursorKind.TYPEDEF_DECL:
            break_down_type(types_schema, pods, x.get_declaration().underlying_typedef_type)
        else:
            gen_types_schema(types_schema, pods, [x.get_declaration()])

In [14]:
types_schema = {}
pods = {}
gen_types_schema(types_schema,pods,msgs, is_message=True)
pprint(types_schema)

{'MessageA': {'getters': [{'displayname': 'getA()',
                           'name': 'A',
                           'result_type': 'int',
                           'spelling': 'getA',
                           'type': 'int () const'},
                          {'displayname': 'getB()',
                           'name': 'B',
                           'result_type': 'float',
                           'spelling': 'getB',
                           'type': 'float () const'}],
              'is_message': True},
 'MessageB': {'getters': [{'displayname': 'getPrices()',
                           'name': 'Prices',
                           'result_type': 'Prices',
                           'spelling': 'getPrices',
                           'type': 'Prices () const'},
                          {'displayname': 'getPrice()',
                           'name': 'Price',
                           'result_type': 'MyPrice',
                           'spelling': 'getPrice',
               

```jinja2
#include <iostream>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>

class Converter {
private:
    std::vector<std::shared_ptr<arrow::Field>> schema_;
    std::vector<std::shared_ptr<arrow::ArrayBuilder>> data_;
    std::size_t off_;
public:
    Converter():off_(0) {}
    void beginMessage() {
        off_ = 0;
    }
    void endMessage() {
        off_ = 0;
    }
    void convert(const float& val, const char* name = "") {
        if (off_ + 1 > data_.size()) {
            schema_.push_back(arrow::field(name, arrow::float32()));
            data_.push_back(std::make_shared<arrow::FloatBuilder>());
        }
        std::dynamic_pointer_cast<arrow::FloatBuilder>(data_[off_])->Append(val);
        off_++;
    }
    void convert(const int& val, const char* name = "") {
        if (off_ + 1 > data_.size()) {
            schema_.push_back(arrow::field(name, arrow::int32()));
            data_.push_back(std::make_shared<arrow::Int32Builder>());
        }
        std::dynamic_pointer_cast<arrow::Int32Builder>(data_[off_])->Append(val);
        off_++;
    }
```

```jinja2
{%- for name,msg in types.items() %}
void convert(const {{name}}& msg, const char* name = "") {
    {% if msg.is_message %}
    beginMessage();
    {% endif %}
{%- for getter in msg.getters %}
    {
        auto res = msg.{{getter['displayname']}};
        convert(res,"{{getter['name']}}");
    }
{%- endfor %}
    {% if msg.is_message %}
    endMessage();
    {% endif %}
}
{%- endfor %}
```

```jinja2
    void write_parquet_file(const char* filename) {
        std::vector<std::shared_ptr<arrow::Array>> arrays;
        for (auto& d:data_) {
            std::shared_ptr<arrow::Array> arr;
            PARQUET_THROW_NOT_OK(d->Finish(&arr));
            arrays.push_back(arr);
        }
        std::shared_ptr<arrow::Schema> schema = arrow::schema(schema_);
        std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arrays);
        write_parquet_file(filename,*table);
    }
    
    void write_parquet_file(const char* filename,const arrow::Table& table) {
      std::shared_ptr<arrow::io::FileOutputStream> outfile;
      PARQUET_THROW_NOT_OK(
          arrow::io::FileOutputStream::Open(filename, &outfile));
      // The last argument to the function call is the size of the RowGroup in
      // the parquet file. Normally you would choose this to be rather large but
      // for the example, we use a small value to have multiple RowGroups.
      PARQUET_THROW_NOT_OK(
          parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
    }
};
```

In [15]:
template = """
#pragma once
#include <iostream>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>

class Converter {
private:
    std::vector<std::shared_ptr<arrow::Field>> schema_;
    std::vector<std::shared_ptr<arrow::ArrayBuilder>> data_;
    std::size_t off_;
public:
    Converter():off_(0) {}
    void beginMessage() {
        off_ = 0;
    }
    void endMessage() {
        off_ = 0;
    }
    void convert(const float& val, const char* name = "") {
        if (off_ + 1 > data_.size()) {
            schema_.push_back(arrow::field(name, arrow::float32()));
            data_.push_back(std::make_shared<arrow::FloatBuilder>());
        }
        std::dynamic_pointer_cast<arrow::FloatBuilder>(data_[off_])->Append(val);
        off_++;
    }
    void convert(const int& val, const char* name = "") {
        if (off_ + 1 > data_.size()) {
            schema_.push_back(arrow::field(name, arrow::int32()));
            data_.push_back(std::make_shared<arrow::Int32Builder>());
        }
        std::dynamic_pointer_cast<arrow::Int32Builder>(data_[off_])->Append(val);
        off_++;
    }
    {%- for name,msg in types.items() %}
    void convert(const {{name}}& msg, const char* name = "") {
        {% if msg.is_message %}
        beginMessage();
        {% endif %}
    {%- for getter in msg.getters %}
        {
            auto res = msg.{{getter['displayname']}};
            convert(res,"{{getter['name']}}");
        }
    {%- endfor %}
        {% if msg.is_message %}
        endMessage();
        {% endif %}
    }
    {%- endfor %}
    void write_parquet_file(const char* filename) {
        std::vector<std::shared_ptr<arrow::Array>> arrays;
        for (auto& d:data_) {
            std::shared_ptr<arrow::Array> arr;
            PARQUET_THROW_NOT_OK(d->Finish(&arr));
            arrays.push_back(arr);
        }
        std::shared_ptr<arrow::Schema> schema = arrow::schema(schema_);
        std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arrays);
        write_parquet_file(filename,*table);
    }
    
    void write_parquet_file(const char* filename,const arrow::Table& table) {
      std::shared_ptr<arrow::io::FileOutputStream> outfile;
      PARQUET_THROW_NOT_OK(
          arrow::io::FileOutputStream::Open(filename, &outfile));
      // The last argument to the function call is the size of the RowGroup in
      // the parquet file. Normally you would choose this to be rather large but
      // for the example, we use a small value to have multiple RowGroups.
      PARQUET_THROW_NOT_OK(
          parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
    }
};
"""

In [16]:
from jinja2 import Template, Environment, FileSystemLoader
import os
t = Template(template)
context = {'types':types_schema}
filecontent = Environment(loader=FileSystemLoader('.'),lstrip_blocks=True).from_string(template).render(context)
with open('Converter.h','w') as f:
    f.write(filecontent)

```C++
#pragma once
#include <iostream>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>

class Converter {
private:
    std::vector<std::shared_ptr<arrow::Field>> schema_;
    std::vector<std::shared_ptr<arrow::ArrayBuilder>> data_;
    std::size_t off_;
public:
    Converter():off_(0) {}
    void beginMessage() {
        off_ = 0;
    }
    void endMessage() {
        off_ = 0;
    }
    void convert(const float& val, const char* name = "") {
        if (off_ + 1 > data_.size()) {
            schema_.push_back(arrow::field(name, arrow::float32()));
            data_.push_back(std::make_shared<arrow::FloatBuilder>());
        }
        std::dynamic_pointer_cast<arrow::FloatBuilder>(data_[off_])->Append(val);
        off_++;
    }
    void convert(const int& val, const char* name = "") {
        if (off_ + 1 > data_.size()) {
            schema_.push_back(arrow::field(name, arrow::int32()));
            data_.push_back(std::make_shared<arrow::Int32Builder>());
        }
        std::dynamic_pointer_cast<arrow::Int32Builder>(data_[off_])->Append(val);
        off_++;
    }
```

```C++
    void convert(const MessageA& msg, const char* name = "") {
        beginMessage();
        {
            auto res = msg.getA();
            convert(res,"A");
        }
        {
            auto res = msg.getB();
            convert(res,"B");
        }
        endMessage();
    }
    void convert(const MessageB& msg, const char* name = "") {
        beginMessage();
        {
            auto res = msg.getPrices();
            convert(res,"Prices");
        }
        {
            auto res = msg.getPrice();
            convert(res,"Price");
        }
        endMessage();
    }
    void convert(const Prices& msg, const char* name = "") {
        {
            auto res = msg.getLow();
            convert(res,"Low");
        }
        {
            auto res = msg.getHigh();
            convert(res,"High");
        }
    }
```

```C++
    void write_parquet_file(const char* filename) {
        std::vector<std::shared_ptr<arrow::Array>> arrays;
        for (auto& d:data_) {
            std::shared_ptr<arrow::Array> arr;
            PARQUET_THROW_NOT_OK(d->Finish(&arr));
            arrays.push_back(arr);
        }
        std::shared_ptr<arrow::Schema> schema = arrow::schema(schema_);
        std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arrays);
        write_parquet_file(filename,*table);
    }
    
    void write_parquet_file(const char* filename,const arrow::Table& table) {
      std::shared_ptr<arrow::io::FileOutputStream> outfile;
      PARQUET_THROW_NOT_OK(
          arrow::io::FileOutputStream::Open(filename, &outfile));
      // The last argument to the function call is the size of the RowGroup in
      // the parquet file. Normally you would choose this to be rather large but
      // for the example, we use a small value to have multiple RowGroups.
      PARQUET_THROW_NOT_OK(
          parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
    }
};
```

```jinja2
class Converters {
private:
{%- for name,msg in types.items() %}
{% if msg.is_message %}
    Converter converter{{name}}_;
{% endif %}
{%- endfor %}
public:
{%- for name,msg in types.items() %}
{% if msg.is_message %}
    void convert(const {{name}}& msg) {
        converter{{name}}_.convert(msg);
    }
{% endif %}
{%- endfor %}
    void write_parquet_files() {
{%- for name,msg in types.items() %}
{% if msg.is_message %}
    converter{{name}}_.write_parquet_file("{{name}}.parquet");
{% endif %}
{%- endfor %}
    }
};
```

In [17]:
template = """
#pragma once
#include "Message.h"
#include "Converter.h"
class Converters {
private:
{%- for name,msg in types.items() %}
{% if msg.is_message %}
    Converter converter{{name}}_;
{% endif %}
{%- endfor %}
public:
{%- for name,msg in types.items() %}
{% if msg.is_message %}
    void convert(const {{name}}& msg) {
        converter{{name}}_.convert(msg);
    }
{% endif %}
{%- endfor %}
    void write_parquet_files() {
{%- for name,msg in types.items() %}
{% if msg.is_message %}
        converter{{name}}_.write_parquet_file("{{name}}.parquet");
{% endif %}
{%- endfor %}
    }
};
"""

In [18]:
t = Template(template)
context = {'types':types_schema}
filecontent = Environment(loader=FileSystemLoader('.'),lstrip_blocks=True).from_string(template).render(context)
with open('Converters.h','w') as f:
    f.write(filecontent)

```C++
#pragma once
#include "Message.h"
#include "Converter.h"
class Converters {
private:
    Converter converterMessageA_;
    Converter converterMessageB_;
public:
    void convert(const MessageA& msg) {
        converterMessageA_.convert(msg);
    }
    void convert(const MessageB& msg) {
        converterMessageB_.convert(msg);
    }
    void write_parquet_files() {
        converterMessageA_.write_parquet_file(
            "MessageA.parquet");
        converterMessageB_.write_parquet_file(
            "MessageB.parquet");
    }
};
```

main.cpp
```C++
#include "Message.h"
#include "Converter.h"
#include "Converters.h"
int main(int argc, char** argv) {
    Converters converter;
    for (int i=0;i<1000;i++) {
        converter.convert(MessageA(i,i*10));
    }
    for (int i=0;i<134;i++) {
        converter.convert(MessageB({float(i)-2,float(i)+4},i));
    }
    converter.write_parquet_files();
    return EXIT_SUCCESS;
}
```

In [19]:
!/usr/bin/clang++-7 -L/usr/lib/x86_64-linux-gnu -larrow -lparquet main.cpp

In file included from main.cpp:2:
      'warn_unused_result' attribute [-Wunused-result][0m
        std::dynamic_pointer_cast<arrow::FloatBuilder>(data_[off_])->Append(val);
[0;1;32m        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~
      'warn_unused_result' attribute [-Wunused-result][0m
        std::dynamic_pointer_cast<arrow::Int32Builder>(data_[off_])->Append(val);
[0;1;32m        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~


In [20]:
!./a.out

In [21]:
import pyarrow as pa
import pyarrow.parquet as pq
from IPython.display import display
display(pq.ParquetFile('MessageA.parquet').metadata)
display(pq.ParquetFile('MessageB.parquet').metadata)

<pyarrow._parquet.FileMetaData object at 0x7fcf4c0c65e8>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 2
  num_rows: 1000
  num_row_groups: 334
  format_version: 1.0
  serialized_size: 45333

<pyarrow._parquet.FileMetaData object at 0x7fcf4c0c65e8>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 3
  num_rows: 134
  num_row_groups: 45
  format_version: 1.0
  serialized_size: 9337

In [22]:
display(pq.ParquetFile('MessageA.parquet').schema)
display(pq.ParquetFile('MessageB.parquet').schema)

<pyarrow._parquet.ParquetSchema object at 0x7fcf0de451d0>
A: INT32
B: FLOAT
 

<pyarrow._parquet.ParquetSchema object at 0x7fcf0de452b0>
Low: FLOAT
High: FLOAT
Price: FLOAT
 

In [23]:
dfa = pq.read_table('MessageA.parquet').to_pandas()
dfb = pq.read_table('MessageB.parquet').to_pandas()
display(dfa.head())
display(dfb.head())

Unnamed: 0,A,B
0,0,0.0
1,1,10.0
2,2,20.0
3,3,30.0
4,4,40.0


Unnamed: 0,Low,High,Price
0,-2.0,4.0,0.0
1,-1.0,5.0,1.0
2,0.0,6.0,2.0
3,1.0,7.0,3.0
4,2.0,8.0,4.0
