# Predicting breach probability

### Requirements

In [43]:
# pip install -U pandas
# pip install -U scikit-learn

### Dataset

In [44]:
import pandas as pd
from sklearn.metrics import log_loss

In [45]:
# loading the dataset and overview of its columns
dataset = pd.read_csv("./breach_dataset.tsv", sep="\t",  encoding="utf-8", 
                      header=0, index_col=None)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21527 entries, 0 to 21526
Data columns (total 81 columns):
breached                                21527 non-null int64
company_id                              21527 non-null int64
breach_date_published                   652 non-null object
date_of_incorporation                   6031 non-null object
capital                                 4864 non-null float64
cash_and_cash_equivalent                4782 non-null float64
cash_flow                               4633 non-null float64
cash_flow_operating_revenue             4461 non-null float64
collection_period                       4725 non-null float64
costs_of_goods_sold                     4682 non-null float64
credit_period                           4699 non-null float64
creditors                               4801 non-null float64
current_assets                          4825 non-null float64
current_liabilities                     4825 non-null float64
current_ratio               

#### Important columns:
* **breached** - 1 if company was breached, 0 otherwise **(the label we want to predict)**
* **company_id** - id of a publicly traded company that was breached
* **breach_date_published** - date when the breach was made public

In [46]:
# company features to use for your predictions
dataset.columns[3:]

Index([u'date_of_incorporation', u'capital', u'cash_and_cash_equivalent',
       u'cash_flow', u'cash_flow_operating_revenue', u'collection_period',
       u'costs_of_goods_sold', u'credit_period', u'creditors',
       u'current_assets', u'current_liabilities', u'current_ratio', u'debtors',
       u'depreciation_and_amortization', u'ebit_margin', u'ebitda',
       u'ebitda_margin', u'enterprise_value', u'enterprise_value_ebitda',
       u'financial_expenses', u'financial_pl', u'fixed_assets', u'gearing',
       u'gross_margin', u'gross_profit', u'intangible_fixed_assets',
       u'interest_cover', u'interest_paid', u'liquidity_ratio', u'loans',
       u'long_term_debt', u'market_cap',
       u'market_cap_cash_flow_from_operations', u'net_assets_turnover',
       u'net_current_assets', u'noncurrent_liabilities', u'number_of_branches',
       u'number_of_employees', u'number_of_shareholders',
       u'number_of_subsidiaries', u'operating_pl', u'operating_revenue',
       u'operating_reve

In [47]:
# the breaches are very sparse events
breached_count = dataset.groupby("breached").count()["company_id"]
breached_count

breached
0    20875
1      652
Name: company_id, dtype: int64

### Evaluation

In [48]:
# average breach probability as a baseline
prob = breached_count.values[1] / float(breached_count.values[0])
prob

0.031233532934131735

In [49]:
# use logarithmic loss metric to evaluate your prediction
log_loss(dataset["breached"], [prob] * len(dataset))

0.1357552241215042