In [1]:
# Sample input: four blank-line-separated records, each made of an ID, Name,
# FamilyN and Age line in "Key: value" form. The string starts with a newline
# and has no trailing newline after the last Age line.
subject = '''
ID: 1
Name: X
FamilyN: Y
Age: 20

ID: 2
Name: H
FamilyN: F
Age: 23

ID: 3
Name: S
FamilyN: Y
Age: 13

ID: 4
Name: M
FamilyN: Z
Age: 25'''

In [3]:
import re

# Each record in `subject` is four consecutive "Key: value" lines. The ID line
# is matched but not captured; the text after Name:, FamilyN: and Age: is
# captured, so findall returns one (name, family_name, age) tuple of strings
# per record.
result = re.findall(
    r"""(?mx)           # multiline, verbose regex
    ^ID:.*\s*           # Match ID: and anything else on that line
    Name:\s*(.*)\s*     # Match name, capture all characters on this line
    FamilyN:\s*(.*)\s*  # etc. for family name
    Age:\s*(.*)$        # and age""",
    subject)
# print(...) with a single argument gives the same output on Python 2 and 3;
# the bare `print result` statement is a SyntaxError on Python 3.
print(result)

[('X', 'Y', '20'), ('H', 'F', '23'), ('S', 'Y', '13'), ('M', 'Z', '25')]


1. [extract numbers from mixed log file](http://stackoverflow.com/questions/32480483/extract-critical-numbers-from-a-mixed-log-file)


I have a log file that contains many slices like this:

    Align set A and merge into set B ...
        setA, 4 images , image size 146 X 131
        setA, image 1, shape center shift (7, -9) compared to image center
        setA, image 2, shape center shift (8, -10) compared to image center
        setA, image 3, shape center shift (6, -9) compared to image center
        setA, image 4, shape center shift (6, -8) compared to image center
        final set B, image size 143 X 129
    Write set B ...

Now, I want to extract the numbers in this slice into a table:

| | width_A | height_A | shift_x | shift_y | width_B | height_B |
--- | --- | --- | --- | --- | --- | ---
A1 | 146 | 131 | 7 | -9 | 143 | 129
A2 | 146 | 131 | 8 | -10 | 143 | 129
A3 | 146 | 131 | 6 | -9 | 143 | 129
A4 | 146 | 131 | 6 | -8 | 143 | 129

| width_A | height_A | shift_x1 | shift_y1 | shift_x2 | shift_y2 | shift_x3 | shift_y3 | shift_x4 | shift_y4 | width_B | height_B|
--- | --- | --- | ----| --- | --- | --- | ----| ---| --- | --- | ----|


The procedure can be divided into two parts:

1. Text processing: read the text into a dictionary `data`, e.g., `data['A1']['shift_x'] = 7`.
2. Use pandas to convert the dictionary into a DataFrame: `df = pd.DataFrame(data)`.


But I am not familiar with text processing in Python:

 - Unlike [Python: How to loop through blocks of lines](http://stackoverflow.com/questions/3914454/python-how-to-loop-through-blocks-of-lines), my log text is not so well organised;
 - regular expressions may be an option, but I can never remember the syntax for matching all the different kinds of symbols.
 
Does anyone have a good solution for this? Python is preferred. Thanks in advance.

In [10]:
# Sample log excerpt: one "gauge N" header, the setA size line, one shift line
# per setA image, and the final set B size line.
text = '''
gauge 1, BWall_01BW90N01920_01720_01320_00440_00960_00860_01320_00440_1_1, processing...
Merge set A into set B ...
    setA, 4 images , image size 321 X 291
    setA, image 1, shape center shift (15, -17) compared to image center
    setA, image 2, shape center shift (1, -17) compared to image center
    setA, image 3, shape center shift (13, -17) compared to image center
    setA, image 4, shape center shift (14, -17) compared to image center
    final set B, image size 319 X 263
Write gauge 1, set B ...
'''

# real example 2
import re
import os
import pandas as pd

# Map a tuple of attribute names to the pattern that extracts them: the i-th
# name in the tuple labels the i-th capture group of the pattern.
# Raw strings keep \d, \s and \( from being read as (invalid) string escapes.
# NOTE(review): the question text in this notebook tabulates
# "image size 146 X 131" as width=146, height=131, whereas the labels below
# call the first number height and the second width -- confirm which order the
# log really uses before relying on these column names.
regexp = {
    ('gauge_no', ): re.compile(r'gauge (\d{1,}), BWall_.*'),
    ('height_A', 'width_A'): re.compile(r'\s+setA, \d{1,} images , image size (\d{1,}) X (\d{1,}).*'),
    ('image_no', 'shift_x', 'shift_y'): re.compile(r'\s+setA, image (\d{1,}), shape center shift \((-?\d{1,}), (-?\d{1,})\) compared to image center.*'),
    ('height_B', 'width_B'): re.compile(r'\s+final set B, image size (\d{1,}) X (\d{1,})')}
# Unused alternative for the gauge header:
# ('gauge_no', ): re.compile(r'Write gauge (\d{1,}), set B.*')

#print(log_file)
# dict_summary[gauge][attribute] -> int, e.g. dict_summary['gauge1']['shift_x1']
dict_summary = {}
# Reset the parser state explicitly so a value left in the kernel by an earlier
# run can never be attached to this run's data.
gauge_no = None
image_no = None
f = text.split('\n')
for line in f:
    print(line)
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for keys, pattern in regexp.items():
        m = pattern.match(line)
        if not m:
            continue
        # Pair every attribute name with the text of its capture group.
        for attr, value in zip(keys, m.groups()):
            if attr == 'gauge_no':
                # A gauge header opens a new (empty) section.
                gauge_no = 'gauge' + str(value)
                dict_summary[gauge_no] = {}
            elif attr == 'image_no':
                # Remembered only to suffix the shift keys that follow it
                # in the same tuple.
                image_no = value
            else:
                if gauge_no is None:
                    raise ValueError(
                        'measurement line found before any "gauge N" header: %r' % line)
                if attr in ('shift_x', 'shift_y'):
                    key = attr + str(image_no)  # e.g. shift_x1, shift_y1
                else:  # 'height_A', 'width_A', 'height_B', 'width_B'
                    key = attr
                dict_summary[gauge_no][key] = int(value)
print(dict_summary)

# One row per gauge, one column per attribute.
df = pd.DataFrame(dict_summary).transpose()
# Plain column arithmetic replaces the row-wise apply(lambda ...) calls.
df['width_diff'] = df['width_A'] - df['width_B']
df['height_diff'] = df['height_A'] - df['height_B']
#df.to_csv(os.path.join(workpath, text_path, text_name+'_summary.csv'), sep = ',', na_rep='NaN')
print(df.head(1))


gauge 1, BWall_01BW90N01920_01720_01320_00440_00960_00860_01320_00440_1_1, processing...
gauge_no
Merge set A into set B ...
    setA, 4 images , image size 321 X 291
height_A
width_A
    setA, image 1, shape center shift (15, -17) compared to image center
image_no
shift_x
shift_y
    setA, image 2, shape center shift (1, -17) compared to image center
image_no
shift_x
shift_y
    setA, image 3, shape center shift (13, -17) compared to image center
image_no
shift_x
shift_y
    setA, image 4, shape center shift (14, -17) compared to image center
image_no
shift_x
shift_y
    final set B, image size 319 X 263
height_B
width_B
Write gauge 1, set B ...

{'gauge1': {'shift_x3': 13, 'shift_y1': -17, 'shift_y2': -17, 'shift_y3': -17, 'shift_y4': -17, 'height_A': 321, 'height_B': 319, 'shift_x2': 1, 'shift_x4': 14, 'width_B': 263, 'shift_x1': 15, 'width_A': 291}}
        height_A  height_B  shift_x1  shift_x2  shift_x3  shift_x4  shift_y1  \
gauge1       321       319        15         1       