In [1]:

# coding: utf-8

# In[6]:


import urllib.request
import zipfile
import os
import pandas as pd
import logging
import time
import datetime
import shutil
import glob
import sys
from itertools import groupby
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import boto
from boto.s3.key import Key

x = str(input("Enter Year: "))

#pass credentials
accesskey = input("Input AWS access key")
secretaccesskey = input("Input AWS secret access key")

#YOUR_ACCESS_KEY
aws_access_key_id = accesskey
#YOUR_SECRET_KEY
aws_secret_access_key = secretaccesskey

try:
    s3_connection = boto.connect_s3(aws_access_key_id, aws_secret_access_key)
    conn_check = s3_connection.get_all_buckets()

except:
    print("AWS keys invalid. Please try again")
    sys.exit()

logtime = time.time()
logdate = datetime.datetime.fromtimestamp(logtime).strftime('%Y%m%d_%H%M%S')

logName = "logs_" + x + '.txt'

logging.basicConfig(filename=logName, level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

year=range(2003,2017)
if int(x) not in year:
    print("Data for the given year does not exist!")
    sys.exit(0)

matplotlib.use("Agg")
    
if not os.path.exists(x+'_zip'):
    os.makedirs(x+'_zip', mode=0o777)
    logging.info('Zipped file directory created!!')
else:
    shutil.rmtree(os.path.join(os.path.dirname("__file__"),x+'_zip'), ignore_errors=False)
    os.makedirs(x+'_zip', mode=0o777)
    logging.info('Zipped file directory created!!')
    
if not os.path.exists(x):
    os.makedirs(x, mode=0o777)
    logging.info('UnZipped file directory created!!')
else:
    shutil.rmtree(os.path.join(os.path.dirname("__file__"), x), ignore_errors=False)
    os.makedirs(x, mode=0o777)
    logging.info('UnZipped file directory created!!')

Quaters = {'Qtr1': ['01', '02', '03'], 'Qtr2': ['04', '05', '06'], 'Qtr3': ['07', '08', '09'], 'Qtr4': ['10', '11', '12']}
days= range(1, 32)
for key,value in Quaters.items():
    for val in value:
        for d in days:
            url= 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/' + str(x) + '/'+ str(key)+ '/log'+ str(x)+str(val)+str(format(d,'02d')) +'.zip'
            print(url)
            urllib.request.urlretrieve(url, x+'_zip/'+url[-15:])
            logging.info("Retrieving zipped log file")
            if os.path.getsize( x+'_zip/'+url[-15:]) <= 4515:
                os.remove( x+'_zip/'+url[-15:])
                logging.info("Log file is not present for "+ str(d) + " day")
                continue
            break

            
zip_files = os.listdir(x+'_zip')
for f in zip_files:
    z = zipfile.ZipFile(os.path.join(x+'_zip', f), 'r')
    for file in z.namelist():
        if file.endswith('.csv'):
            z.extract(file, x)
            logging.info(file +' successfully extracted to folder: unzippedfiles.')

allFiles = glob.glob(x+"/*csv")

if not os.path.exists('Reports'):
    os.makedirs('Reports', mode=0o777)
    logging.info('Graphical_images directory created!!')
else:
    shutil.rmtree(os.path.join(os.path.dirname("__file__"),'Reports'), ignore_errors=False)
    os.makedirs('Reports', mode=0o777)
    logging.info('Graphical_images directory created!!')

i=1
for file_ in allFiles:
    data = pd.read_csv(file_)
    logging.info("Calculating missing values in coloumns")
    print(data.isnull().sum())
    logging.info("Finding the NaN values in each coloumn")
    logging.info("Exploring Browser Coloumn")
    a = ['mie','fox','saf','chr','sea', 'opr','oth','win','mac','lin','iph','ipd','and','rim','iem']
    [len(list(group)) for key, group in groupby(a)] 
    logging.info("Grouping the values of all the browsers and storing in a dataframe")
    df = pd.DataFrame(data,columns = ['browser'])
    d = df.apply(pd.value_counts)
    logging.info("Counting the frequency of each browser type used in descending order")
    list(d.index)
    data['browser'].replace(np.nan,d.index[0], inplace = True) #Replacing the NaN values in the browser colomun with the max used browser
    data.isnull().sum()
    logging.info("confirming that no NaN values are present on the browser coloumn")
    logging.info("Working on the Size Coloumn")
    logging.info("Replacing the file size for ext : txt, by the mean of all the file size corresponding to txt")
    s = data[['extention','size']].groupby(data['extention'].str.contains('txt'))['size'].mean().reset_index(name='mean').sort_values(['mean'],ascending=False)
    data.loc[(data['size'].isnull()) & (data['extention'].str.contains('txt'))] = s
    data.reset_index(drop = True)
    logging.info("Replacing the file size with NaN values for ext : htm, by the mean of all the file size corresponding to htm") 
    g = data[['extention','size']].groupby(data['extention'].str.contains('htm'))['size'].mean().reset_index(name='mean').sort_values(['mean'],ascending=False)
    data.loc[(data['size'].isnull()) & (data['extention'].str.contains('htm'))] = g
    data.reset_index(drop=True)
    logging.info("Replacing the file size with NaN values for ext : xml, by the mean of all the file size corresponding to xml")
    h = data[['extention','size']].groupby(data['extention'].str.contains('xml'))['size'].mean().reset_index(name='mean').sort_values(['mean'],ascending=False)
    data.loc[(data['size'].isnull()) & (data['extention'].str.contains('xml'))] = h
    data.reset_index(drop=True)
    logging.info("To check how many NaN values are remaining ")
    logging.info("Replacing the file size for rest of the files with the mean of file size of txt extension, as it is the max used")
    data.loc[data['size'].isnull()] = s
    print(data.isnull().sum())
    logging.info("Working on all other coloumns")
    logging.info("If cik,Accession,ip,date are empty fields drop the records")
    data.dropna(subset=['cik'],inplace=True)
    data.dropna(subset=['accession'],inplace=True)
    data.dropna(subset=['ip'],inplace=True)
    data.dropna(subset=['date'],inplace=True)
    data.dropna(subset=['time'],inplace=True)
    logging.info("Calculating the max categorical value in other coloumns( code, zone,extention,idx,find) and filling the NaNs")
    data['code'].fillna(data['code'].max(),inplace=True)
    data['zone'].fillna(data['zone'].max(),inplace=True)
    data['extention'].fillna(data['extention'].max(),inplace=True)
    data['idx'].fillna(data['idx'].max(),inplace=True)
    data['find'].fillna(data['find'].max(),inplace=True)
    
    logging.info("Filling empty values with Categorical Values for coloumns (norefer,noagent,nocrawler)")
    data['norefer'].fillna(1,inplace=True)
    data['noagent'].fillna(1,inplace=True)
    data['crawler'].fillna(0,inplace=True)
    print(data.isnull().sum())
    logging.info("Missing data is handled successfully")          

#SUMMARY METRICS
    logging.info("Calculating Summary metrics of clean data")
    data.describe()
    data.reset_index(drop = True)
    logging.info("Mean and Median sizes for each Browser")
    brow_df = data.groupby('browser').agg({'size':['mean', 'median'],'crawler': len})
    brow_df.columns = ['_'.join(col) for col in brow_df.columns]
    data.reset_index(drop=True)
    print(brow_df)
                                                                                  
#To find out the 15 top searched CIKs 
    cik_df = pd.DataFrame(data, columns = ['cik'])
    d = cik_df.apply(pd.value_counts)
    logging.info("Top 15 most searched CIKs with the count")                                                                            
    d.head(15)
    data.reset_index(drop=True)
                    
#Compute distinct count of ip per month i.e. per log file
    ipcount_df = data['ip'].nunique()
    logging.info("Compute distinct count of ip per month i.e. per log file")
    print(ipcount_df)
                                                                                  
#Computing the count of status code on the basis of ip
    StCo_count=data[['code','ip']].groupby(['code'])['ip'].count().reset_index(name='count')
    logging.info("Computing the count of status code on the basis of ip")
    print(StCo_count)
    data.reset_index(drop=True)
                    
#Everything on per day basis
    #1. Average of Size 
    Avg_size=data[['date','size']].groupby(['date'])['size'].mean().reset_index(name='mean')
    logging.info("Average of file size is computed")
    print(Avg_size)
    #2. Number Of Requests
    Req_day=data[['date','ip']].groupby(['date'])['ip'].count().reset_index(name='count')
    logging.info("Number of request per day is computed")
    print(Req_day)
#Mean of file size on the basis of code status
    Mean_size=data[['code','size']].groupby(['code'])['size'].mean().reset_index(name='mean')
    logging.info("Mean of file size on the basis of code status")
    print(Mean_size)

    logging.info("Summary metrics computed succesfully!!")
#graph of no of status codes by browser
    try:
        logging.info("graphical analysis started")
        Num_of_codes=data[['browser','code']].groupby(['browser'])['code'].count().reset_index(name = 'count_code').sort_values(['count_code'],ascending=False)
        data.reset_index(drop = True)
        print(Num_of_codes)
        u= np.array(range(len(Num_of_codes)))
        y= Num_of_codes['count_code']
        xticks1 = Num_of_codes['browser']
        plt.xticks(u,xticks1)
        plt.bar(u,y)
        plt.title('Count of status code for all the browsers')
        plt.ylabel('Count of codes')
        plt.xlabel('Browsers')
        plt.savefig('Reports/countsperbrowser'+ str(i) +'.png',dpi=100)
        plt.clf()
        logging.info("graphical analysis end")
    except Exception as e:
        print(u)
        logging.error(str(e))
        logging.error("Error plotting the graph ")
    
#graph for max cik(10) by IP used
    try:
        logging.info("graphical analysis started")
        Num_of_CIKs=data[['cik','ip']].groupby(['cik'])['ip'].count().reset_index(name='count').sort_values(['count'],ascending=False).head(10)
        data.reset_index(drop=True)
        print(Num_of_CIKs)
        u = np.array(range(len(Num_of_CIKs)))
        y = Num_of_CIKs['count']
        xticks2 = Num_of_CIKs['cik']
        plt.xticks(u, xticks2)
        plt.bar(u,y)
        plt.title('Top 10 CIKs by IPs')
        plt.ylabel('Count of IPs')
        plt.xlabel('CIK-')
        plt.savefig('Reports/CIKsbyIPcount'+ str(i) +'.png',dpi=100)
        plt.clf()
        logging.info("graphical analysis end")
    except Exception as e:
        print(u)
        logging.error(str(e))
        logging.error("Error plotting the graph ")

#Graph of Mean of file size on the basis of code status

    try:
        Mean_size=data[['code','size']].groupby(['code'])['size'].mean().reset_index(name='mean').sort_values(['mean'],ascending=False)
        data.reset_index(drop=True)
        print(Mean_size)
        u = np.array(range(len(Mean_size)))
        y = Mean_size['mean']
        xticks3 = Mean_size['code']
        plt.xticks(u, xticks3)
        plt.bar(u,y)
        plt.title('filesize by codes')
        plt.ylabel('mean size')
        plt.xlabel('Code')
        plt.savefig('Reports/MeanSizeByCode'+str(i) +'.png',dpi=100)
        plt.clf()
    except Exception as e:
        print(u)
        logging.error(str(e))
        logging.error("Error plotting the graph ")
#Graph for average file size by extension
    try:
        Avg_size=data[['extention','size']].groupby(['extention'])['size'].mean().reset_index(name='mean').sort_values(['mean'],ascending=False).head(20)
        data.reset_index(drop=True)
        print(Avg_size)
        u = np.array(range(len(Avg_size)))
        y = Avg_size['mean']
        xticks4 = Avg_size['extention']
        plt.xticks(u, xticks4)
        plt.bar(u,y)
        plt.title('Avg File size by extention')
        plt.ylabel('MeanFileSize')
        plt.xlabel('Extention')
        #plt.savefig(os.path.join('Graphical_images',str(val),'filesizebyextention.png'),dpi=100)
        plt.savefig('Reports/filesizebyextention'+str(i) +'.png',dpi=100)
        #plt.show()
        plt.clf()
    except Exception as e:
        print(u)
        logging.error(str(e))
        logging.error("Error plotting the graph ")
#ANAMOLIES IN FILESIZE
    try:
        logging.info("Anomalies analysis started")
        data.boxplot(column='size',vert=True,sym='',whis=10,showfliers=False)
        plt.xticks(rotation=70)
        plt.title('Anomalies displayed on the file size')
        plt.ylabel('size')
        plt.savefig('Reports/'+'Anomalies'+ str(i) +'.png',dpi=100)
        #plt.show()
        logging.info("Anomalies analysis ended")
    except Exception as e:
        print(u)
        logging.error(str(e))
        logging.error("Error plotting the graph ")

    i = i+1

  
#Making a zip file having the log file and the Graphical_images folder
def make_zipfile(output_filename, source_dir):
    relroot = os.path.abspath(os.path.join(source_dir, os.pardir))
    with zipfile.ZipFile(output_filename, "w", zipfile.ZIP_DEFLATED) as zip:
        for root, dirs, files in os.walk(source_dir):
            # add directory (needed for empty dirs)
            zip.write(root, os.path.relpath(root, relroot))
            for file in files:
                filename = os.path.join(root, file)
                if os.path.isfile(filename):
                    arcname = os.path.join(os.path.relpath(root, relroot), file)
                    zip.write(filename, arcname)

make_zipfile('ADSAssign1Part2.zip','Reports')
print("Done")

bucketname = accesskey.lower() + "nuadsgroup3" + str(x)

bucket = s3_connection.create_bucket(bucketname)
logging.debug("Creating AWS S3 bucket " + bucketname)

upload_to = Key(bucket)
upload_to.key = 'problem2'

logging.debug("Zip File & Log File Uploaded to AWS S3 bucket " + bucketname)
upload_to.set_contents_from_filename(str("ADSAssign1Part2.zip"))
upload_to.set_contents_from_filename(str(logName))







Enter Year: 2007
Input AWS access keyAKIAIFWGOFHFY2P3W6UQ
Input AWS secret access keyiwSeDcIWAJ2z2cvfcHqgxHBEuO00TtDbHm02Xn5W


This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "C:\Users\achar\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\achar\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\achar\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\achar\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\achar\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\Users\achar\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 127, in start
    se

http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr1/log20070101.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr1/log20070201.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr1/log20070301.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr2/log20070401.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr2/log20070501.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr2/log20070601.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr3/log20070701.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr3/log20070801.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr3/log20070901.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr4/log20071001.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr4/log20071101.zip
http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2007/Qtr4/log20071201.zip
ip              

22163
    code   count
0  200.0  755443
1  206.0    8408
2  404.0    7270
3  500.0       4
         date           mean
0  2007-03-01  188956.124924
         date   count
0  2007-03-01  771125
    code           mean
0  200.0  186502.582483
1  206.0  569889.451118
2  404.0    3451.000000
3  500.0     534.500000
  browser  count_code
7     win      758183
2     lin        6135
3     mac        4000
4     mie        2178
0     fox         518
5     opr          59
6     rim          34
1     iem          18
             cik  count
1117     64803.0   2574
120       6769.0   2351
41740  1288776.0   1790
93        5272.0   1768
2603    354204.0   1734
911      51143.0   1385
15788  1067983.0   1320
5496    831001.0   1301
8775    913414.0   1256
6967    881464.0   1175
    code           mean
1  206.0  569889.451118
0  200.0  186502.582483
2  404.0    3451.000000
3  500.0     534.500000
                    extention         mean
3062      a06-24196_1ncsr.htm  126084700.0
1930       a06-1806

ip                0
date              0
time              0
zone              0
cik               0
accession         0
extention         0
code              0
size          83598
idx               0
norefer           0
noagent           0
find              0
crawler           0
browser      309116
dtype: int64
ip           83598
date         83598
time         83598
zone         83598
cik          83598
accession    83598
extention    83598
code         83598
size         83598
idx          83598
norefer      83598
noagent      83598
find         83598
crawler      83598
browser      83598
dtype: int64
ip           0
date         0
time         0
zone         0
cik          0
accession    0
extention    0
code         0
size         0
idx          0
norefer      0
noagent      0
find         0
crawler      0
browser      0
dtype: int64
             size_mean  size_median  crawler_len
browser                                         
fox      415152.628141     101412.0        199.0
iem 

  browser  count_code
9     win     1080758
5     mie       10382
4     mac        3715
3     lin        3082
0     fox         510
7     rim          46
1     iem          27
2     iph          17
6     opr           1
8     saf           1
             cik  count
10426   923118.0  16592
37855  1219511.0  15957
34818  1204936.0   9106
54724  1314999.0   6920
2690    350066.0   6110
50298  1288776.0   4863
975      51143.0   2890
249      12927.0   2190
782      40545.0   1973
22894  1108524.0   1926
    code           mean
1  206.0  236748.446968
0  200.0   84425.245545
2  400.0    8440.211009
3  403.0    3632.000000
4  404.0    3451.000000
5  500.0     536.375000
                  extention          mean
3823   a07-15844_1posam.htm  5.447038e+07
16240       boafutureii.htm  4.009444e+07
1323   a06-13699_11ncsr.htm  3.313580e+07
78263              tape.htm  3.308466e+07
4201     a07-17329_311k.htm  2.545066e+07
26307    d21540_485bpos.htm  2.498138e+07
1828   a06-21433_4ex3d5.htm  2.4

ip                0
date              0
time              0
zone              0
cik               0
accession         0
extention         0
code              0
size         106140
idx               0
norefer           0
noagent           0
find              0
crawler           0
browser      389301
dtype: int64
ip           106140
date         106140
time         106140
zone         106140
cik          106140
accession    106140
extention    106139
code         106140
size         106140
idx          106140
norefer      106140
noagent      106140
find         106140
crawler      106140
browser      106140
dtype: int64
ip           0
date         0
time         0
zone         0
cik          0
accession    0
extention    0
code         0
size         0
idx          0
norefer      0
noagent      0
find         0
crawler      0
browser      0
dtype: int64
             size_mean  size_median  crawler_len
browser                                         
fox      288734.254808      13146.0   

74961