In [1]:
import pandas as pd
import io
import gzip
import re

In [2]:
path='unclean_log.log.gz'

In [3]:
df=pd.read_csv(path,
                 sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])', 
                 engine='python', na_values=['-'], header=None,
                 usecols=[0, 3, 4, 5, 6, 7, 8,10],
                 names=['ip', 'time', 'request', 'status', 'size', 'referer',
                 'user_agent','req_time'])

In [4]:
wrong_lines={}
def convert_int_feedback_index(row,col,lines):
    #function to be used with .apply
    try:
        ans = int(row[col])
    except:
        wrong_lines[row['index']] = str(lines[row['index']])[1:-3]
        ans= pd.np.nan
    return ans

def convert_log_with_feedback(df,col):
    file = gzip.open(path)
    lines = file.readlines()
    wrong_lines.clear()
    df_index = df.reset_index()
    df_index[col] = df_index.apply(convert_int_feedback_index,axis=1,col=col,lines=lines)
    if len(wrong_lines.keys()) != 0:
        print("Warning! Some lines couldn't be converted properly")
    df_index.pop('index')
    file.close()
    return df_index

In [5]:
df = convert_log_with_feedback(df,'status')



First, let's drop the lines that were wrong

In [6]:
df = df.drop(wrong_lines.keys())

The plan is to add the unusual lines to our dataframe manually.
Let's have a look at those unusual lines

In [7]:
str(wrong_lines[249242])

'\'111.22.117.229, 111.22.117.229 - - [19/Sep/2018:22:17:40 +0200] "GET /agent/10577/bdl HTTP/1.1" 204 - "-" "okhttp/3.8.0" apibackend.site.fr 429282'

They're close enough to regular lines. Hence, we should be able to get a good split with the "sep" regex we used before. Let's have a look : 

In [8]:
g=wrong_lines[249242]
re.split(r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',g)

["'111.22.117.229,",
 '111.22.117.229',
 '-',
 '-',
 '[19/Sep/2018:22:17:40 +0200]',
 '"GET /agent/10577/bdl HTTP/1.1"',
 '204',
 '-',
 '"-"',
 '"okhttp/3.8.0"',
 'apibackend.site.fr',
 '429282']

In [14]:
#Remember, there are 2 kinds of unusual lines. luckily, our split works for both :
g=wrong_lines[249386]
re.split(r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',g)

["'111.22.116.184",
 '"111.22.105.253, 111.22.105.253"',
 '-',
 '-',
 '[19/Sep/2018:22:20:22 +0200]',
 '"GET /agent/5983/bdl HTTP/1.1"',
 '204',
 '-',
 '"-"',
 '"okhttp/3.8.0"',
 'apibackend.site.fr',
 '512517']

That looks promising! Now let's explain our dataframe the group/column correspondance : 

In [9]:
def unusual_line_to_dic(line):
    groups=re.split(r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',line)
    dic = {}
    dic['ip'] = groups[0]+groups[1]
    dic['time'] = groups[4]
    dic['request'] = groups[5]
    dic['status'] = groups[6]
    dic['referer'] = groups[8]
    dic['user_agent'] = groups[9]
    dic['req_time'] = groups[11]
    return dic

In [10]:
unusual_line_to_dic(wrong_lines[249242])

{'ip': "'111.22.117.229,111.22.117.229",
 'time': '[19/Sep/2018:22:17:40 +0200]',
 'request': '"GET /agent/10577/bdl HTTP/1.1"',
 'status': '204',
 'referer': '"-"',
 'user_agent': '"okhttp/3.8.0"',
 'req_time': '429282'}

Great, we can add the unusual lines now :

In [11]:
for k in wrong_lines.keys():
    df = df.append(unusual_line_to_dic(wrong_lines[k]), ignore_index=True)

In [12]:
df

Unnamed: 0,ip,time,request,status,size,referer,user_agent,req_time
0,111.22.115.161,[19/Sep/2018:06:25:05 +0200],"""GET /agent/4231/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",1296274
1,111.22.115.161,[19/Sep/2018:06:25:05 +0200],"""GET /agent/4230/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",1354244
2,111.22.116.184,[19/Sep/2018:06:25:05 +0200],"""GET /agent/4652/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",1478761
3,111.22.115.161,[19/Sep/2018:06:25:05 +0200],"""GET /agent/5402/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",1620763
4,111.22.116.184,[19/Sep/2018:06:25:07 +0200],"""GET /agent/7978/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",318981
5,111.22.116.184,[19/Sep/2018:06:25:05 +0200],"""GET /agent/6508/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",1804195
6,111.22.115.161,[19/Sep/2018:06:25:06 +0200],"""GET /agent/6531/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",875978
7,111.22.115.161,[19/Sep/2018:06:25:07 +0200],"""GET /agent/10546/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",287595
8,111.22.116.184,[19/Sep/2018:06:25:07 +0200],"""GET /agent/7030/bdl HTTP/1.1""",204,,"""-""","""okhttp/3.8.0""",366385
9,111.22.115.161,[19/Sep/2018:06:25:07 +0200],"""GET /bdl/1007762/updated_at?datetime=2018-09-...",401,42.0,"""-""","""okhttp/3.8.0""",25763
