# Data Cleaning : Converting proxy logs to network traffic time series.
### Proxy logs record each request made to the internet via a proxy server. Saved in Squid format, they contain information about each request, such as the time at which the request was made, the IP address making the request, the URL accessed, the bytes exchanged, the total time taken to respond to the request, cache hit or miss, etc.
### From these logs we count the requests made in a fixed time period (one minute), the total bytes exchanged in all those requests, and the average time taken to respond to those requests. Hence we get a multivariate time series.

#### Import all the necessary libraries

In [None]:
import re
import time
import pandas as pd
import glob

#### Read all the proxy log files, which are saved as 1 file for each day

In [None]:
# Glob pattern matching every entry inside the proxy-log directory.
path = '/home/naveksha/A/*'

# Paths of all entries matching the pattern; each one is processed below.
list1 = glob.glob(path)

#### Open files one by one to convert the raw proxy data to useful information (i.e. time series)

In [None]:
# Convert every proxy log listed in list1 into a per-minute time series.
#
# Each log line is split on whitespace; the first five fields are treated as
# time (epoch seconds), respt (response time), ip, tcp and bytes, of which
# only time, respt and bytes are used. For every minute the result holds:
#   respt - mean response time of the requests in that minute
#   bytes - total bytes exchanged in that minute
#   nreq  - number of requests in that minute
for log_path in list1:
    # Read the whole file and close it before any further work, so the file
    # is no longer open for reading when the result is written to it below.
    with open(log_path, encoding="latin-1") as data:
        fields = [raw_line.split() for raw_line in data]

    # Blank or truncated lines lack the five leading fields; keeping them
    # would produce NaN timestamps and crash the time conversion below.
    rows = [(f[0], f[1], f[4]) for f in fields if len(f) >= 5]
    if not rows:
        # Nothing parseable (e.g. an empty file): leave the file untouched
        # instead of failing or overwriting it with an empty table.
        continue

    # Build a fresh frame (not a slice of another one) so the column
    # assignments below act on this frame itself.
    df = pd.DataFrame(rows, columns=['time', 'respt', 'bytes'])

    # convert datatype of columns to numeric
    df['time'] = df['time'].astype(float)
    df['bytes'] = pd.to_numeric(df['bytes'])
    df['respt'] = pd.to_numeric(df['respt'])

    # convert epoch time to human readable time, truncated to the minute;
    # time.localtime uses the local timezone of the machine running this.
    df['time1'] = df['time'].apply(
        lambda x: time.strftime('%Y-%m-%d %H:%M', time.localtime(x)))

    # club the data of each minute: all three aggregates share the same
    # time1 index, so they can be placed side by side directly.
    per_minute = df.groupby('time1')
    # The result keeps the name df6 in case later cells refer to it.
    df6 = pd.concat(
        [
            per_minute['respt'].mean(),
            per_minute['bytes'].sum(),
            per_minute['respt'].count().rename('nreq'),
        ],
        axis=1,
    ).reset_index()

    # save the information
    # NOTE(review): this overwrites the raw log file with the aggregated CSV,
    # as the original code did - confirm that losing the raw log is intended.
    df6.to_csv(log_path)
