In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Privacy_Rights_Clearinghouse-Data-Breaches-Export.csv')

In [3]:
df.head()

Unnamed: 0,Date Made Public,Company,Location,Type of breach,Type of organization,Total Records,Description of incident,Information Source,Source URL
0,"October 21, 2009",Bullitt County Public Schools,"Shepherdsville, Kentucky",DISC,EDU,676,A Bullitt County Public Schools \n ...,Dataloss DB,
1,"October 21, 2009",Roane State Community College,"Harriman, Tennessee",PORT,EDU,14783,Roane State Community College \n ha...,Dataloss DB,
2,"October 15, 2009",Halifax Health,"Daytona Beach, Florida",PORT,MED,33000,A laptop computer from a Halifax \n ...,Dataloss DB,
3,"October 4, 2009",Suffolk Community College,"Selden, New York",DISC,EDU,300,Suffolk Community College has \n ag...,Dataloss DB,
4,"September 28, 2009",Penrose Hospital,"Colorado Springs, Colorado",PHYS,MED,175,Officials at Penrose Hospital bel...,Dataloss DB,


In [4]:
df.columns

Index(['Date Made Public', 'Company', 'Location', 'Type of breach',
       'Type of organization', 'Total Records', 'Description of incident',
       'Information Source', 'Source URL'],
      dtype='object')

In [5]:
dates = df['Date Made Public']
dates.head()

0      October 21, 2009
1      October 21, 2009
2      October 15, 2009
3       October 4, 2009
4    September 28, 2009
Name: Date Made Public, dtype: object

In [6]:
dates.to_csv('fechas.csv', index=False)

In [7]:
dict_months = {'January':1, 'February':2, 'March':3, 'April':4,
               'May':5, 'June':6, 'July':7, 'August':8, 'September':9,
               'October':10, 'November':11, 'December':12}

months = []
months_int = []
days = []
years = []

for d in dates:
    md = d.split(', ')[0]
    day = md.split(' ')[1]
    month = md.split(' ')[0]
    year = d.split(', ')[1]
    
    months.append(month)
    months_int.append(dict_months[month])
    days.append(int(day))
    years.append(int(year))

In [8]:
dates_new = np.array([days,months,months_int,years])
df_new = pd.DataFrame(np.transpose(dates_new), 
                      columns = ['d','m','m_i','y'])

In [9]:
df_new.head()

Unnamed: 0,d,m,m_i,y
0,21,October,10,2009
1,21,October,10,2009
2,15,October,10,2009
3,4,October,10,2009
4,28,September,9,2009


In [10]:
distinct_years = np.sort( df_new['y'].value_counts().keys() )
print(distinct_years)

['2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015' '2016' '2017']


In [11]:
months_str = []
counts = []
mrs = [0]

f = 0
for dy in distinct_years:
    for i in range(1,13):
        inds = ((df_new['y']==dy) & (df_new['m_i']==str(i)))
        cs = df_new[inds]['m'].value_counts() 
        ks = cs.keys()
        for c,k in zip(cs,ks):
            months_str.append(k)
            counts.append(c)
            if f==0:
                f=1
            else:
                mrs.append(c-counts[-2])

In [12]:
mrs = mrs[1:]

In [14]:
counts = np.array(counts)
mrs = np.array(mrs)

In [15]:
# Mean
mean_c = np.mean(counts)
mean_m = np.mean(mrs)
ucl_c = mean_c + 2.66*mean_m
lcl_c = mean_c - 2.66*mean_m
ucl_m = 3.27*mean_m
lcl_m = 0

In [16]:
print(lcl_c, mean_c, ucl_c)
print(lcl_m, mean_m, ucl_m)

50.0551612903 50.2451612903 50.4351612903
0 0.0714285714286 0.233571428571


In [17]:
x_values = range(len(counts))

plt.figure(figsize=(50,15))
plt.plot(x_values, counts, label='Counts',lw=2)
plt.axhline(ucl_c, linestyle='--', c='g', label='UCL - LCL')
plt.axhline(lcl_c, linestyle='--', c='g')
plt.axhline(mean_c, linestyle='-.', c='c', label='Mean')
plt.xticks(x_values, months_str, rotation='vertical')
plt.legend()
plt.xlabel('MONTH (2005 - 2017)')
plt.ylabel('COUNT VALUES')
plt.savefig('2_CV_mean.png', bbox_inches='tight')

x_values = range(len(mrs))

plt.figure(figsize=(50,15))
plt.plot(x_values, mrs, label='Moving Range',lw=2)
plt.axhline(ucl_m, linestyle='--', c='g', label='UCL - LCL')
plt.axhline(lcl_m, linestyle='--', c='g')
plt.axhline(mean_m, linestyle='-.', c='c', label='Mean')
plt.xticks(x_values, months_str[1:], rotation='vertical')
plt.legend()
plt.xlabel('MONTH (2005 - 2017)')
plt.ylabel('MOVING RANGE')
plt.savefig('2_MR_mean.png', bbox_inches='tight')

In [18]:
# Median
median_c = np.median(counts)
median_m = np.median(mrs)
ucl_c = median_c + 2.66*median_m
lcl_c = median_c - 2.66*median_m
ucl_m = 3.27*median_m
lcl_m = 0

In [20]:
print(lcl_c, median_c, ucl_c)
print(lcl_m, median_m, ucl_m)

52.0 52.0 52.0
0 0.0 0.0


In [19]:
x_values = range(len(counts))

plt.figure(figsize=(50,15))
plt.plot(x_values, counts, label='Counts',lw=2)
plt.axhline(ucl_c, linestyle='--', c='g', label='UCL - LCL')
plt.axhline(lcl_c, linestyle='--', c='g')
plt.axhline(median_c, linestyle='-.', c='c', label='Median')
plt.xticks(x_values, months_str, rotation='vertical')
plt.legend()
plt.xlabel('MONTH (2005 - 2017)')
plt.ylabel('COUNT VALUES')
plt.savefig('2_CV_median.png', bbox_inches='tight')

x_values = range(len(mrs))

plt.figure(figsize=(50,15))
plt.plot(x_values, mrs, label='Moving Range',lw=2)
plt.axhline(ucl_m, linestyle='--', c='g', label='UCL - LCL')
plt.axhline(lcl_m, linestyle='--', c='g')
plt.axhline(median_m, linestyle='-.', c='c', label='Median')
plt.xticks(x_values, months_str[1:], rotation='vertical')
plt.legend()
plt.xlabel('MONTH (2005 - 2017)')
plt.ylabel('MOVING RANGE')
plt.savefig('2_MR_median.png', bbox_inches='tight')