In [1]:
import pandas as pd
import numpy as np
import csv,re,os
import datetime as dt
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import random

In [2]:
# Root directory of the host logs: one sub-directory per host, each holding that
# host's per-date '<date>-<host>.log' files (see how makeCorpus builds its path).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
origin = '/pylon5/sy5fp1p/ehanna/logs/hosts/'

In [3]:
# Log-line patterns. Raw strings are used so that the regex escapes (\d, \s, \.)
# are not parsed as (invalid) Python string escapes; the compiled patterns are
# unchanged.
# pattern: captures the date, the HH:MM:SS time, the digits of the offset that
# follows the '-' after the time, and the host name preceding '.pvt'.
pattern = re.compile(r'(\d+-\d+-\d+)T(\d+:\d+:\d+)-(\d+:\d+)\s(.*?)\.pvt')
# timePattern: captures only the HH:MM:SS time of a line that mentions a '.pvt' host.
timePattern = re.compile(r'.*T(\d+:\d+:\d+)-\d+:\d+\s.*\.pvt.*')
# jobidpattern: captures the numeric id of a 'JID:<digits> ... prolog started' line.
jobidpattern = re.compile(r'JID:(\d+).*prolog started')

In [4]:
def makeCorpus(date,node,size,slide,check):
    """Build sliding-window text corpora, with NFS-error labels, from one node's log.

    The log file '<origin><node>.pvt.bridges.psc.edu/<date>-<node>.pvt.bridges.psc.edu.log'
    is read line by line. Only lines matched by ``timePattern`` are used; their
    HH:MM:SS time stamp drives the windowing (the date part is ignored, so all
    deltas are computed within a single day).

    Parameters
    ----------
    date : str
        Date of the log file, as it appears in the file name (e.g. '2018-02-04').
    node : str
        Node name without the domain suffix (e.g. 'r150').
    size : int
        Length of a window in seconds. A line is added to a window's text when
        it occurs at most ``size`` seconds after the window start.
    slide : int
        Minimum number of seconds after the newest window's start at which a
        line that was added to that window also opens a new window.
    check : int
        Number of seconds after the end of a window during which an NFS
        'not responding' line sets that window's label to 1.

    Returns
    -------
    tuple
        ``(CorrespondingCorpuses, windowStarters)`` where the first item is a
        list of ``[text, label]`` pairs (label is 0 or 1) and the second is the
        parallel list of window identifiers formatted '<date>_<HH:MM:SS>_<node>'.
    """
    FMT = '%H:%M:%S'
    # Durations are compared as timedelta objects. Comparing their string forms
    # (e.g. '10:00:00' vs '1:00:00') is lexicographic and gives wrong answers as
    # soon as a threshold reaches one hour.
    sizeDelta = dt.timedelta(seconds=size)
    slideDelta = dt.timedelta(seconds=slide)
    errorCheckDelta = dt.timedelta(seconds=size+check)

    direc = node +'.pvt.bridges.psc.edu'
    fileName = date+'-'+direc+'.log'
    with open(origin+direc+'/'+fileName) as file:
        windowStarters = []
        CorrespondingCorpuses = []
        for line in file:
            match = re.search(timePattern,line)
            if match is None:
                continue
            currentTime = match.group(1)
            if len(windowStarters)==0:
                # The first time-stamped line opens the first window.
                windowStarters.append(currentTime)
                CorrespondingCorpuses.append([line,0])
                # Windows with index <= indexCheckStart are no longer examined.
                indexCheckStart = -1
                continue
            currentStamp = datetime.strptime(currentTime, FMT)
            # Walk the still-open windows from newest to oldest.
            i = len(windowStarters)-1
            while i>indexCheckStart:
                tdelta = currentStamp - datetime.strptime(windowStarters[i], FMT)
                isNewest = (i==len(windowStarters)-1)
                if isNewest and tdelta>=errorCheckDelta:
                    # The newest window is already past its label-check period:
                    # start a fresh window at this line (error lines contribute no text).
                    windowStarters.append(currentTime)
                    if 'not responding' not in line and 'nfs: server ' not in line:
                        CorrespondingCorpuses.append([line,0])
                    else:
                        CorrespondingCorpuses.append(['',0])
                    indexCheckStart +=1
                    break
                elif tdelta<=sizeDelta:
                    # Line falls inside window i: add it unless it is an NFS error line.
                    if 'not responding' not in line and 'nfs: server ' not in line:
                        CorrespondingCorpuses[i][0]+= ' '+line
                        if isNewest and tdelta>=slideDelta:
                            windowStarters.append(currentTime)
                            # Standalone numbers are stripped from the line that seeds
                            # the new window. NOTE(review): `line` stays stripped for
                            # the older windows visited later in this loop as well.
                            line = re.sub(r'\s\d+\s', ' ', line)
                            if 'not responding' not in line and 'nfs: server ' not in line:
                                CorrespondingCorpuses.append([line,0])
                            else:
                                CorrespondingCorpuses.append(['',0])
                elif tdelta<=errorCheckDelta:
                    # Line falls in the check period after window i: an NFS
                    # 'not responding' line labels that window as positive.
                    if 'not responding' in line and 'nfs: server ' in line:
                        CorrespondingCorpuses[i][1]=1
                else:
                    # Window i (and every older one) can no longer be affected.
                    indexCheckStart +=1
                    break
                i=i-1
    windowStarters = [date+'_'+i+'_'+node for i in windowStarters]
    return (CorrespondingCorpuses,windowStarters)

In [6]:
# Single-node example: node r150 on 2018-02-04 with size=120, slide=60, check=30.
textCorpuseswithLabels = makeCorpus(
    date='2018-02-04',
    node='r150',
    size=120,
    slide=60,
    check=30,
)

In [5]:
def corpusToBagOfWords(corpus,labels,index):
    """Turn a list of text documents into a labelled bag-of-words DataFrame.

    Parameters
    ----------
    corpus : iterable of str
        One text document per row.
    labels : sequence
        One label per document; stored in the final 'Label' column.
    index : sequence
        Row index for the resulting frame, one entry per document.

    Returns
    -------
    pandas.DataFrame
        One column per vocabulary token holding its count in each document,
        followed by a 'Label' column.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back only on old installations.
    if hasattr(vectorizer,'get_feature_names_out'):
        columns = vectorizer.get_feature_names_out()
    else:
        columns = vectorizer.get_feature_names()
    resultDF = pd.DataFrame(X.toarray(),columns=columns,index = index)
    resultDF['Label'] = labels
    return resultDF

In [1]:
# makeCorpus returns ([text, label] pairs, window identifiers). Split the pairs
# into parallel lists here so this cell does not rely on `corpus` / `labels`
# variables that no cell in the notebook defines.
corpus = [window[0] for window in textCorpuseswithLabels[0]]
labels = [window[1] for window in textCorpuseswithLabels[0]]
df = corpusToBagOfWords(corpus,labels,textCorpuseswithLabels[1])

In [2]:
# Preview the first rows of the bag-of-words frame.
df.head()

In [9]:
# Demonstration of the number-stripping substitution used by makeCorpus:
# a run of digits surrounded by whitespace is replaced by a single space.
sampleLogLine = "<38>1 2018-02-02T23:56:00-05:00 r256.pvt.bridges.psc.edu systemd-logind - - -  New session 4434 of user monitor."
text = re.sub(r'\s\d+\s', ' ', sampleLogLine)
print(text)

<38>1 2018-02-02T23:56:00-05:00 r256.pvt.bridges.psc.edu systemd-logind - - -  New session of user monitor.


In [6]:
def makeCorpusDataFrame(dateRange, nodeMinimum,nodeMaximum,size=300,slide=120,check=60):
    """Build one bag-of-words DataFrame covering several dates and nodes.

    Parameters
    ----------
    dateRange : str
        '<start>:<end>' with both dates formatted 'YYYY-MM-DD'. The start date
        is included, the end date is excluded.
    nodeMinimum, nodeMaximum : int
        Only nodes 'r<N>' with nodeMinimum < N < nodeMaximum (both bounds
        exclusive) are processed.
    size, slide, check : int, optional
        Window parameters forwarded to makeCorpus (seconds). The defaults are
        the values this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Result of corpusToBagOfWords over every window of every (date, node)
        pair for which a log file exists.

    Prints the time spent building the corpora (vectorisation excluded).
    """
    rangeParts = dateRange.split(':')
    startDate = datetime.strptime(rangeParts[0],"%Y-%m-%d")
    endDate = datetime.strptime(rangeParts[1],"%Y-%m-%d")
    corpusMetaData = []
    labelsMetaData = []
    windowStarters = []
    startTime = datetime.now()

    # The set of node directories does not depend on the date, so resolve it once.
    # fullmatch guarantees that direc is exactly '<node>.pvt.bridges.psc.edu',
    # which is the directory name makeCorpus rebuilds from the node name.
    nodeDirs = []
    for direc in os.listdir(origin):
        match = re.fullmatch(r'(r\d+)\.pvt\.bridges\.psc\.edu',direc)
        if match is None:
            continue
        node = match.group(1)
        if nodeMinimum < int(node[1:]) < nodeMaximum:
            nodeDirs.append((node,direc))

    currentDate = startDate
    while currentDate<endDate:
        dateText = currentDate.strftime("%Y-%m-%d")
        for node,direc in nodeDirs:
            fileName = dateText+'-'+direc+'.log'
            # makeCorpus opens the file itself; only check that it is there.
            if not os.path.isfile(origin+direc+'/'+fileName):
                continue
            windows,starters = makeCorpus(dateText,node,size,slide,check)
            for windowText,windowLabel in windows:
                corpusMetaData.append(windowText)
                labelsMetaData.append(windowLabel)
            windowStarters.extend(starters)
        currentDate = currentDate+dt.timedelta(days=1)
    endTime = datetime.now()
    df = corpusToBagOfWords(corpusMetaData,labelsMetaData,windowStarters)
    print(endTime-startTime)
    return df

In [7]:
# Logistic Regression
# Assumes the last column of data is the output dimension
def get_pred_logreg(train,test):
    """Fit a logistic regression on `train` and predict the rows of `test`.

    Both frames must have the same layout: every column except the last is an
    input feature and the last column is the label.

    Returns a DataFrame with the columns 'Prediction' and 'Observed', indexed
    like the label column of `test`.
    """
    _, column_count = train.shape
    feature_count = column_count - 1

    train_inputs = train.iloc[:, :feature_count]
    train_target = train.iloc[:, -1]
    test_inputs = test.iloc[:, :feature_count]
    test_target = test.iloc[:, -1]

    classifier = LogisticRegression()
    classifier = classifier.fit(train_inputs, train_target)
    predicted = classifier.predict(test_inputs)

    result_columns = ['Prediction','Observed']
    return pd.DataFrame({'Prediction':predicted,'Observed':test_target},columns=result_columns)


In [8]:
def do_cv_class(df, num_folds, model_name):
    """Run shuffled k-fold cross-validation and collect per-row predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label in the last column.
    num_folds : int
        Number of folds (KFold with shuffle=True and random_state=2).
    model_name : str
        One of 'logreg', 'svm', 'nb', 'default', or '<k>nn' (for example
        '5nn') for k-nearest neighbours.
        NOTE(review): only get_pred_logreg is defined in the visible part of
        the notebook; the other get_pred_* helpers must exist before use.

    Returns
    -------
    pandas.DataFrame
        Columns 'Prediction', 'Observed' and 'Fold' (1-based fold number),
        one row per row of `df`, keeping the original index labels.

    Raises
    ------
    ValueError
        If `model_name` is not one of the supported names.
    """
    kf = KFold(n_splits=num_folds,shuffle=True,random_state=2)
    foldFrames = []
    for count,(train_index,test_index) in enumerate(kf.split(df),start=1):
        train = df.iloc[train_index]
        test = df.iloc[test_index]
        if model_name=="logreg":
            modelpredDF = get_pred_logreg(train,test)
        elif model_name=="svm":
            modelpredDF = get_pred_svm(train,test)
        elif model_name=="nb":
            modelpredDF = get_pred_nb(train,test)
        elif model_name[-2:]=="nn":
            modelpredDF = get_pred_knn(train,test,int(model_name[:-2]))
        elif model_name=="default":
            modelpredDF = get_pred_default(train,test)
        else:
            # Previously an unknown name fell through to an unbound variable.
            raise ValueError("Unknown model_name: %r" % (model_name,))
        modelpredDF['Fold']=count
        foldFrames.append(modelpredDF)
    if not foldFrames:
        return pd.DataFrame(columns=['Prediction','Observed','Fold'])
    # One concat instead of DataFrame.append per fold (append no longer exists
    # in current pandas and copied the accumulated frame on every call).
    return pd.concat(foldFrames)

In [13]:
# columnsToKeep=[]
# for i in trainingDF.columns:
#     if any(char.isdigit() for char in i):
#         continue
#     else:
#         columnsToKeep.append(i)
# trainingDF = trainingDF.loc[:,columnsToKeep]
# predDF = do_cv_class(trainingDF,10,'logreg')

In [14]:
# predDF['Accuracy'] = predDF.apply(lambda r: 1 if r['Observed']==r['Prediction'] else 0,axis=1)
# np.sum(predDF['Accuracy'])/float(len(predDF))

In [15]:
# print(confusion_matrix(predDF['Prediction'],predDF['Observed'],labels=[0,1]))

In [16]:
# predDF['Prediction'] = predDF['Prediction'].astype(str).astype(int)
# predDF['Observed'] = predDF['Observed'].astype(str).astype(int)

In [9]:
def bagOfWordsResults(dateRange, nodeMinimum,nodeMaximum,num_folds=10,model_name='logreg'):
    """Train and cross-validate a bag-of-words classifier; return confusion counts.

    Parameters
    ----------
    dateRange, nodeMinimum, nodeMaximum
        Forwarded unchanged to makeCorpusDataFrame.
    num_folds : int, optional
        Number of cross-validation folds (default 10, the former fixed value).
    model_name : str, optional
        Model identifier passed to do_cv_class (default 'logreg', the former
        fixed value).

    Returns
    -------
    tuple
        ``(cf[0][0], cf[0][1], cf[1][0], cf[1][1])`` of the confusion matrix
        over labels [0, 1]. Predictions are passed as the FIRST argument of
        confusion_matrix, so rows are indexed by prediction and columns by
        observation: (pred 0/obs 0, pred 0/obs 1, pred 1/obs 0, pred 1/obs 1).
        NOTE(review): scikit-learn's signature is (y_true, y_pred); the order
        is kept as it was because callers consume this tuple positionally.

    Prints the overall accuracy.
    """
    trainingDF = makeCorpusDataFrame(dateRange,nodeMinimum,nodeMaximum)
    # Drop every token column whose name contains a digit.
    columnsToKeep = [col for col in trainingDF.columns
                     if not any(char.isdigit() for char in col)]
    trainingDF = trainingDF.loc[:,columnsToKeep]
    predDF = do_cv_class(trainingDF,num_folds,model_name)
    predicted = predDF['Prediction'].astype(str).astype(int)
    observed = predDF['Observed'].astype(str).astype(int)
    # Vectorised count of correct predictions instead of a row-wise apply.
    print(np.sum(observed==predicted)/float(len(predDF)))
    cf = confusion_matrix(predicted,observed,labels=[0,1])
    return (cf[0][0],cf[0][1],cf[1][0],cf[1][1])

In [18]:
# Empty frame with 'Observed' and 'Prediction' columns.
# NOTE(review): presumably an accumulator for predictions — it is not used in
# any visible cell; confirm or remove.
AllPredDF = pd.DataFrame(columns=['Observed','Prediction'])

In [19]:
# Load nodes.csv (path is relative to the notebook's working directory).
nodes = pd.read_csv("nodes.csv")

In [20]:
# Keep only the 'Node Number' and 'Date' columns.
nodes = nodes.loc[:,['Node Number','Date']]

In [36]:
# Collect element 1 of every entry of answerResults into fpr2 and element 0 into tpr2.
# The cell used to append to `fpr` / `tpr` — names it never creates — while the
# blanket `except Exception` hid the resulting NameError; it now fills the
# lists it initialises.
# NOTE(review): `answerResults` is not defined in any visible cell.
fpr2=[]
tpr2=[]
for i in range(len(answerResults)):
    try:
        entry = answerResults[i]
        tprValue = entry[0]
        fprValue = entry[1]
    except (IndexError, KeyError, TypeError):
        # Skip entries that are missing or do not hold both values, so the two
        # lists always stay the same length.
        continue
    fpr2.append(fprValue)
    tpr2.append(tprValue)

In [10]:
# Run the full pipeline for the dates from 2018-02-04 up to, but not including,
# 2018-02-06, on nodes whose number is strictly between 25 and 66.
results = bagOfWordsResults("2018-02-04:2018-02-06",25,66)

0:01:20.585316
0.9973313810693368
