## Read data

In [16]:
import pandas as pd

In [1]:
templates = pd.read_csv("../input/hdfs-dataset/hdfs_templates.csv")

In [2]:
templates.head()

Unnamed: 0,EventId,EventTemplate,Occurrences
0,09a53393,Receiving block <*> src: <*> dest: <*>,1723232
1,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>,575061
2,d38aa58d,PacketResponder <*> for block <*> <*>,1706728
3,e3df2680,Received block <*> of size <*> from <*>,1706514
4,5d5de21c,BLOCK* NameSystem.addStoredBlock: blockMap upd...,1719741


In [3]:
df = pd.read_csv("../input/hdfs-dataset/hdfs_structured.csv")

In [4]:
df.head()

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>
1,2,81109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>
2,3,81109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>
3,4,81109,203519,145,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>
4,5,81109,203519,145,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_-1608999687919...,d38aa58d,PacketResponder <*> for block <*> <*>


## Encode categorical data as discrete variables

In [6]:
from sklearn.preprocessing import LabelEncoder

# columns to encode
cols_to_encode = ["Level","Component"]
encoders = {}
for col in cols_to_encode:
    col_encoder = LabelEncoder()
    df[col] = col_encoder.fit_transform(df[col])
    encoders[col] = col_encoder
    
# take event tempplates from template file
template_encoder = LabelEncoder()

templates["TemplateId"] = template_encoder.fit_transform(templates["EventTemplate"])
df["TemplateId"] = template_encoder.transform(df["EventTemplate"]) # apply encodin to event data

encoders["EvenTemplate"] = (template_encoder,"TemplateId")

# sort template dataset by template id
templates.sort_values("TemplateId",inplace=True)
templates.set_index(np.arange(templates.shape[0]),inplace=True)

In [7]:
templates.to_hdf("../input/hdfs-dataset/hdfs_templates.hd5","templates",mode="w")

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate
0,1,81109,203518,143,0,4,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>
1,2,81109,203518,35,0,7,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>
2,3,81109,203519,143,0,4,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>
3,4,81109,203519,145,0,4,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>
4,5,81109,203519,145,0,5,PacketResponder 1 for block blk_-1608999687919...,d38aa58d,PacketResponder <*> for block <*> <*>


In [10]:
# save the encoders to disk
import pickle

# store the encoders
with open("encoders.pkl","wb") as enc:
    enc.write(pickle.dumps(encoders))

## Ok let's start inspecting the data

In [10]:
for tmplid, tmplexp in templates[["TemplateId","EventTemplate"]].values:
    print(tmplid,tmplexp)

0 <*> <*> <*> <*> java.io.IOException: Connection reset by peer
1 <*> <*> <*> <*> java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected <*> <*> <*> millis timeout left.
2 <*> <*> <*> <*> java.net.SocketTimeoutException: <*> millis timeout while waiting for channel to be ready for <*> ch : java.nio.channels.SocketChannel[connected <*> <*>
3 <*> Served block <*> to <*>
4 <*> Starting thread to transfer block <*> to <*>
5 <*> Starting thread to transfer block <*> to <*> <*>
6 <*> block <*> to <*>
7 <*> exception while serving <*> to <*>
8 <*> to transfer <*> to <*> got java.io.IOException: Connection reset by peer
9 <*> writing block <*> to mirror <*>
10 Adding an already existing block <*>
11 BLOCK* NameSystem.addStoredBlock: Redundant addStoredBlock request received for <*> on <*> size <*>
12 BLOCK* NameSystem.addStoredBlock: addStoredBlock request received for <*> on <*> size <*> But it does not belong to any file.
13 BLOCK

## Possible issues with some of the event templates?

```
1 <*> <*> <*> <*> java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected <*> <*> <*> millis timeout left.
2 <*> <*> <*> <*> java.net.SocketTimeoutException: <*> millis timeout while waiting for channel to be ready for <*> ch : java.nio.channels.SocketChannel[connected <*> <*>

24 Exception in receiveBlock for block <*> java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected <*> <*> <*> millis timeout left.
25 Exception in receiveBlock for block <*> java.net.SocketTimeoutException: <*> millis timeout while waiting for channel to be ready for write. ch : java.nio.channels.SocketChannel[connected <*> <*>

26 PacketResponder <*> 1 Exception java.io.IOException: The stream is closed
27 PacketResponder <*> 1 Exception java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[closed]. <*> millis timeout left.
28 PacketResponder <*> <*> Exception <*>
29 PacketResponder <*> <*> Exception java.io.IOException: Broken pipe

43 writeBlock blk_1684134505299265593 received exception java.net.NoRouteToHostException: No route to host
```


## Let's look at the corresponding event data

In [11]:
grouped = df.groupby("TemplateId")

NameError: name 'df' is not defined

In [12]:
grouped.get_group(43)

NameError: name 'grouped' is not defined

Only one entry for template 43 `blk_...` was not detected as a template argument?

In [19]:
grouped.get_group(24)["Content"].values.tolist()

['Exception in receiveBlock for block blk_3858821904894294369 java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected local=/10.250.13.188:58401 remote=/10.251.39.160:50010]. 489996 millis timeout left.',
 'Exception in receiveBlock for block blk_-4567777441263358151 java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected local=/10.251.199.225:50760 remote=/10.251.107.227:50010]. 489959 millis timeout left.',
 'Exception in receiveBlock for block blk_7008279672769077211 java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected local=/10.251.31.180:50010 remote=/10.251.123.33:33680]. 0 millis timeout left.',
 'Exception in receiveBlock for block blk_8085135783040518166 java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected local=/10.250.5.237:46933

In [20]:
grouped.get_group(25)["Content"].values.tolist()

['Exception in receiveBlock for block blk_7008279672769077211 java.net.SocketTimeoutException: 490000 millis timeout while waiting for channel to be ready for write. ch : java.nio.channels.SocketChannel[connected local=/10.251.42.191:43873 remote=/10.251.123.33:50010]',
 'Exception in receiveBlock for block blk_6224343649004202692 java.net.SocketTimeoutException: 490000 millis timeout while waiting for channel to be ready for write. ch : java.nio.channels.SocketChannel[connected local=/10.251.127.243:44092 remote=/10.251.107.50:50010]',
 'Exception in receiveBlock for block blk_-6363674043695218814 java.net.SocketTimeoutException: 490000 millis timeout while waiting for channel to be ready for write. ch : java.nio.channels.SocketChannel[connected local=/10.250.15.101:33683 remote=/10.251.107.50:50010]',
 'Exception in receiveBlock for block blk_584730932939516842 java.net.SocketTimeoutException: 485000 millis timeout while waiting for channel to be ready for write. ch : java.nio.channe

### Fix 24 as

``
Exception in receiveBlock for block <*> java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected <*> <*> <*> millis timeout left.
``

``
Exception in receiveBlock for block <*> java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[connected <*> <*> <*>]. millis timeout left.
``

And similarly for 25


In [21]:
grouped.get_group(26)["Content"].values.tolist()

['PacketResponder blk_8006271611835981128 1 Exception java.io.IOException: The stream is closed',
 'PacketResponder blk_-2448583810301041956 1 Exception java.io.IOException: The stream is closed']

In [22]:
grouped.get_group(27)["Content"].values.tolist()

['PacketResponder blk_-4250706752040073149 1 Exception java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[closed]. 59985 millis timeout left.',
 'PacketResponder blk_-4723951162006187997 1 Exception java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[closed]. 59799 millis timeout left.',
 'PacketResponder blk_8895996987918469659 1 Exception java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[closed]. 59852 millis timeout left.',
 'PacketResponder blk_2832388045942608634 1 Exception java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[closed]. 59990 millis timeout left.',
 'PacketResponder blk_-644449815674781039 1 Exception java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[closed]. 59983 millis timeout left.']

### Guess that 1 was not detected as a template argument

In [23]:
grouped.get_group(28)["Content"].values.tolist()

['PacketResponder blk_-3102267849859399193 2 Exception java.io.EOFException',
 'PacketResponder blk_1528078116812077719 2 Exception java.io.EOFException',
 'PacketResponder blk_8085135783040518166 1 Exception java.nio.channels.ClosedByInterruptException',
 'PacketResponder blk_8516616325149469651 1 Exception java.nio.channels.ClosedByInterruptException',
 'PacketResponder blk_-3794935441407979230 2 Exception java.io.EOFException',
 'PacketResponder blk_-516029862489211143 2 Exception java.io.EOFException',
 'PacketResponder blk_844104329351281298 2 Exception java.io.EOFException',
 'PacketResponder blk_9099447004863711723 1 Exception java.nio.channels.ClosedByInterruptException',
 'PacketResponder blk_-6680054068554267213 2 Exception java.io.EOFException',
 'PacketResponder blk_6495484866542253279 1 Exception java.nio.channels.ClosedByInterruptException',
 'PacketResponder blk_-539148231797001003 2 Exception java.io.EOFException',
 'PacketResponder blk_3709643589615046481 2 Exception j

In [24]:
grouped.get_group(29)["Content"].values.tolist()

['PacketResponder blk_3858821904894294369 0 Exception java.io.IOException: Broken pipe',
 'PacketResponder blk_-4567777441263358151 0 Exception java.io.IOException: Broken pipe',
 'PacketResponder blk_8085135783040518166 0 Exception java.io.IOException: Broken pipe',
 'PacketResponder blk_8516616325149469651 2 Exception java.io.IOException: Broken pipe',
 'PacketResponder blk_-7052911997539087826 2 Exception java.io.IOException: Broken pipe',
 'PacketResponder blk_9160796703302881264 2 Exception java.io.IOException: Broken pipe']

### Should 29 be split in two?

## Fix issues with 26, 27 and 42

In [13]:
templates["EventTemplate"].iloc[[26,27]] = templates["EventTemplate"].iloc[[26,27]].apply(lambda x: x.replace("<*> 1","<*> <*>"))
templates["EventTemplate"].iloc[43] = templates["EventTemplate"].iloc[43].replace("blk_1684134505299265593","<*>")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
templates.iloc[[26,27,43]]["EventTemplate"].tolist()

['PacketResponder <*> <*> Exception java.io.IOException: The stream is closed',
 'PacketResponder <*> <*> Exception java.io.InterruptedIOException: Interruped while waiting for IO on channel java.nio.channels.SocketChannel[closed]. <*> millis timeout left.',
 'writeBlock <*> received exception java.net.NoRouteToHostException: No route to host']

In [15]:
templates.to_hdf("../input/hdfs-dataset/hdfs_templates.hd5","templates",mode="w")

## Extract template arguments for each line

In [27]:
import re

# make regular expression to extract template arguments from event content
def make_regex(tmpl):
    escape = "[]()*"
    for ch in escape:
        tmpl = tmpl.replace(ch,"\\"+ch)

    match_entity = r"([^ ]*)"
    tmpl = tmpl.replace("<\\*>",match_entity)
    
    return re.compile(tmpl)

regexps = templates["EventTemplate"].apply(make_regex)


In [28]:
# apply regex to event data
def apply_regex(x):
    match = regexps.iloc[x["TemplateId"]].match(x["Content"])
    if match is None:
        # print(regexps.iloc[x[0]],x[1])
        return None
    return ":::".join(match.groups())

df["TemplateArgs"] = df[['TemplateId','Content']].apply(apply_regex,raw=False,axis=1)

## Add time index

In [None]:
hours = df.Time // 10000
mins = ( df.Time % 10000 ) // 100
secs = ( df.Time % 10000 ) % 100

year = 2000 + (df.Date // 10000)
month = (df.Date % 10000) // 100
day = (df.Date % 10000) % 100

dfdate = pd.DataFrame({'year': year,
                       'month': month,
                       'day': day,
                       'hours': hours,
                       'minutes':mins,
                       'seconds':secs
                      })

df = df.join(dfdate)

df.index = pd.to_datetime(dfdate)

In [31]:
df[["LineId","Pid","Level","Component","TemplateId","TemplateArgs"]+dfdate.columns.tolist()].to_hdf("../input/hdfs-dataset/hdfs_structured_datetime.hd5","hdfs_structured",mode="w",format="t")