In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
from sklearn.model_selection import cross_val_score

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob, Word

plt.style.use('fivethirtyeight')
%matplotlib inline

In [3]:
file = './data/sig_train_data.csv'

classifier = pd.read_csv(file, encoding='latin1')

In [10]:
classifier.columns = ['question_num', 'question', 'response', 'maturity',
       'misc_info', 'aup_ref', 'iso_num',
       'iso_desc', 'sig_class']

In [13]:
classifier.head()

Unnamed: 0,question_num,question,response,maturity,misc_info,aup_ref,iso_num,iso_desc,sig_class
0,A.1,Is there a risk assessment program that has be...,Yes,5.0,An enterprise risk assessment is performed ann...,A.1 IT & Infrastructure Risk Governance,5.1 6.1.2,"Leadership & Commitment, Information Security ...",Risk Assessment
1,A.1.1,"A risk assessment, conducted within the last 1...",Yes,,,A.2 IT & Infrastructure Risk Assessment Life C...,8.2,Information security risk assessment,Risk Assessment
2,A.1.2,Risk Governance?,Yes,,,A.1 IT & Infrastructure Risk Governance,,,Risk Assessment
3,A.1.3,"Range of assets to include: people, processes,...",Yes,,,A.1 IT & Infrastructure Risk Governance,,,Risk Assessment
4,A.1.4,"Range of threats to include: malicious, natura...",Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment


In [14]:
classifier.loc[:, 'sig_class'].value_counts()

Application Security                         215
Operations Management                        120
Business Resiliency                          105
Asset and Information Management             104
Server Security                               96
Physical and Environment                      86
End User Device Security                      76
Access Control                                74
Network Security                              66
Incident Event & Communication Management     45
Risk Assessment                               43
Security Policy                               42
Threat Management                             38
Compliance                                    31
Human Resource Security                       28
Privacy                                       18
Organizational Security                       10
Name: sig_class, dtype: int64

In [15]:
classifier.loc[:, 'sig_class_num'] = classifier.loc[:, 'sig_class'].map({
    'Application Security':1,
    'Operations Management':2,
    'Business Resiliency':3,
    'Asset and Information Management':4,
    'Server Security':5,
    'Physical and Environment':6,
    'End User Device Security':7,
    'Access Control':8,
    'Network Security':9,
    'Incident Event & Communication Management':10,
    'Risk Assessment':11,
    'Security Policy':12,
    'Threat Management':13,
    'Compliance':14,
    'Human Resource Security':15,
    'Privacy':16,
    'Organizational Security':17,
})

In [16]:
classifier.loc[:, 'sig_class_num'].value_counts()

1     215
2     120
3     105
4     104
5      96
6      86
7      76
8      74
9      66
10     45
11     43
12     42
13     38
14     31
15     28
16     18
17     10
Name: sig_class_num, dtype: int64

In [100]:
classifier

Unnamed: 0,question_num,question,response,maturity,misc_info,aup_ref,iso_num,iso_desc,sig_class,sig_class_num
0,A.1,Is there a risk assessment program that has be...,Yes,5.0,An enterprise risk assessment is performed ann...,A.1 IT & Infrastructure Risk Governance,5.1 6.1.2,"Leadership & Commitment, Information Security ...",Risk Assessment,11
1,A.1.1,"A risk assessment, conducted within the last 1...",Yes,,,A.2 IT & Infrastructure Risk Assessment Life C...,8.2,Information security risk assessment,Risk Assessment,11
2,A.1.2,Risk Governance?,Yes,,,A.1 IT & Infrastructure Risk Governance,,,Risk Assessment,11
3,A.1.3,"Range of assets to include: people, processes,...",Yes,,,A.1 IT & Infrastructure Risk Governance,,,Risk Assessment,11
4,A.1.4,"Range of threats to include: malicious, natura...",Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
5,A.1.5,Risk scoping?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
6,A.1.6,Risk context?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
7,A.1.7,Risk training plan?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
8,A.1.8,Risk evaluation criteria?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
9,A.1.9,"Risk scenarios? If yes, do they include:",Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11


In [102]:
#play around with converting NaN, and combining multiple columns to create
# master_text_blog

classifier.loc[:,'question'] + classifier.loc[:,'aup_ref']

0       Is there a risk assessment program that has be...
1       A risk assessment, conducted within the last 1...
2       Risk Governance?A.1 IT & Infrastructure Risk G...
3       Range of assets to include: people, processes,...
4       Range of threats to include: malicious, natura...
5       Risk scoping?A.1 IT & Infrastructure Risk Gove...
6       Risk context?A.1 IT & Infrastructure Risk Gove...
7       Risk training plan?A.1 IT & Infrastructure Ris...
8       Risk evaluation criteria?A.1 IT & Infrastructu...
9       Risk scenarios? If yes, do they include:A.1 IT...
10      Events and possible threats that could impact ...
11      Threat types?A.1 IT & Infrastructure Risk Gove...
12      Ownership, action plan, response plan, managem...
13      Is there a program to manage the treatment of ...
14      A formal process for assigning appropriate man...
15      A formal process for appropriate management kn...
16      A formal process for tracking the status of ac...
17      Contro

In [23]:
X = classifier.loc[:, 'question']
y = classifier.loc[:, 'sig_class_num']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# EDA on Question Text

In [26]:
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [27]:
X_train_dtm

<897x1586 sparse matrix of type '<class 'numpy.int64'>'
	with 8406 stored elements in Compressed Sparse Row format>

In [28]:
X_train_dtm.todense().shape

(897, 1586)

In [29]:
X_train_dtm.todense().sum(axis=0)

matrix([[1, 6, 1, ..., 6, 1, 1]])

In [30]:
vect.get_feature_names()

['10',
 '12',
 '120',
 '140',
 '15',
 '180',
 '2048',
 '21',
 '24',
 '24x7x365',
 '25',
 '30',
 '40',
 '443',
 '60',
 '80',
 '800',
 '88',
 '90',
 'ability',
 'able',
 'about',
 'above',
 'abuse',
 'acceptable',
 'acceptance',
 'accepting',
 'access',
 'accessed',
 'accessible',
 'accessing',
 'accidental',
 'accordance',
 'according',
 'account',
 'accounts',
 'accuracy',
 'acquisition',
 'acted',
 'action',
 'actioned',
 'actions',
 'activate',
 'activating',
 'active',
 'activex',
 'activities',
 'activity',
 'ad',
 'addition',
 'additional',
 'address',
 'addressed',
 'addresses',
 'addressing',
 'adequacy',
 'admin',
 'administration',
 'administrative',
 'administrator',
 'administrators',
 'advertising',
 'affect',
 'affected',
 'after',
 'against',
 'agreed',
 'agreement',
 'agreements',
 'ajax',
 'alarm',
 'alarmed',
 'alert',
 'alerting',
 'alerts',
 'all',
 'allocated',
 'allow',
 'allowed',
 'allows',
 'alone',
 'along',
 'alternate',
 'amend',
 'an',
 'analysis',
 'analyze

In [31]:
vect.vocabulary_

{'are': 114,
 'apis': 97,
 'tested': 1412,
 'for': 582,
 'security': 1266,
 'weaknesses': 1548,
 'if': 673,
 'yes': 1581,
 'does': 449,
 'this': 1424,
 'include': 694,
 'is': 755,
 'scoped': 1257,
 'systems': 1382,
 'and': 88,
 'data': 361,
 'ever': 519,
 'used': 1507,
 'in': 685,
 'the': 1417,
 'test': 1411,
 'development': 408,
 'or': 958,
 'qa': 1119,
 'environments': 502,
 'software': 1321,
 'program': 1089,
 'bouncycastle': 196,
 'openssl': 949,
 'prior': 1067,
 'to': 1437,
 'device': 409,
 'on': 943,
 'boarding': 193,
 'constituents': 305,
 'required': 1195,
 'sign': 1302,
 'legal': 792,
 'agreement': 67,
 'which': 1557,
 'details': 397,
 'obligations': 932,
 'rights': 1229,
 'related': 1164,
 'mobile': 879,
 'devices': 410,
 'it': 762,
 'errors': 505,
 'resulting': 1214,
 'from': 594,
 'incomplete': 697,
 'inaccurate': 686,
 'business': 207,
 'there': 1421,
 'formal': 586,
 'methodology': 870,
 'operation': 951,
 'groups': 620,
 'real': 1135,
 'time': 1432,
 'alerting': 73,
 're

# Train Test Classifier using Naive Bayes

In [32]:
#Create Document-term Matrices with CountVectorizer()
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

#Use Naive Bayes to predict sig_class_num
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

In [33]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.41666666666666669

In [34]:
y_test.value_counts()

1     53
3     36
2     26
4     26
5     24
6     22
7     19
9     18
8     15
12    11
15    10
11     9
14     9
10     8
13     7
17     4
16     3
Name: sig_class_num, dtype: int64

In [35]:
# Define a function that accepts a vectorizer and calculates the accuracy.

def tokenize_test(vect):
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    print('Number of features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

In [37]:
vect = CountVectorizer()
tokenize_test(vect)

Number of features:  1586
Accuracy:  0.416666666667


In [48]:
vect = CountVectorizer(ngram_range=(1,2))
tokenize_test(vect)

Number of features:  6480
Accuracy:  0.403333333333


In [49]:
vect = CountVectorizer(stop_words='english')

vect.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [51]:
tokenize_test(vect)

Number of features:  1467
Accuracy:  0.443333333333


In [57]:
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Number of features:  1467
Accuracy:  0.443333333333


In [58]:
vect.get_feature_names()

['10',
 '12',
 '120',
 '140',
 '15',
 '180',
 '2048',
 '21',
 '24',
 '24x7x365',
 '25',
 '30',
 '40',
 '443',
 '60',
 '80',
 '800',
 '88',
 '90',
 'ability',
 'able',
 'abuse',
 'acceptable',
 'acceptance',
 'accepting',
 'access',
 'accessed',
 'accessible',
 'accessing',
 'accidental',
 'accordance',
 'according',
 'account',
 'accounts',
 'accuracy',
 'acquisition',
 'acted',
 'action',
 'actioned',
 'actions',
 'activate',
 'activating',
 'active',
 'activex',
 'activities',
 'activity',
 'ad',
 'addition',
 'additional',
 'address',
 'addressed',
 'addresses',
 'addressing',
 'adequacy',
 'admin',
 'administration',
 'administrative',
 'administrator',
 'administrators',
 'advertising',
 'affect',
 'affected',
 'agreed',
 'agreement',
 'agreements',
 'ajax',
 'alarm',
 'alarmed',
 'alert',
 'alerting',
 'alerts',
 'allocated',
 'allow',
 'allowed',
 'allows',
 'alternate',
 'amend',
 'analysis',
 'analyze',
 'analyzed',
 'annual',
 'annually',
 'anti',
 'antivirus',
 'anytime',
 '

In [70]:
vect = CountVectorizer(stop_words='english', ngram_range=(1,1), min_df=1)
tokenize_test(vect)

Number of features:  1467
Accuracy:  0.443333333333


# Textblob

In [76]:
question = TextBlob(classifier.loc[0, 'question'])

In [77]:
question

TextBlob("Is there a risk assessment program that has been approved by management, communicated to constituents and an owner to maintain and review the program? if yes, does it include:")

In [78]:
stemmer = SnowballStemmer('english')

In [79]:
question.words

WordList(['Is', 'there', 'a', 'risk', 'assessment', 'program', 'that', 'has', 'been', 'approved', 'by', 'management', 'communicated', 'to', 'constituents', 'and', 'an', 'owner', 'to', 'maintain', 'and', 'review', 'the', 'program', 'if', 'yes', 'does', 'it', 'include'])

In [81]:
#Stemmer is crude form
[stemmer.stem(word) for word in question.words]

['is',
 'there',
 'a',
 'risk',
 'assess',
 'program',
 'that',
 'has',
 'been',
 'approv',
 'by',
 'manag',
 'communic',
 'to',
 'constitu',
 'and',
 'an',
 'owner',
 'to',
 'maintain',
 'and',
 'review',
 'the',
 'program',
 'if',
 'yes',
 'doe',
 'it',
 'includ']

In [84]:
print([word.lemmatize(pos='v') for word in question.words])

['Is', 'there', 'a', 'risk', 'assessment', 'program', 'that', 'have', 'be', 'approve', 'by', 'management', 'communicate', 'to', 'constituents', 'and', 'an', 'owner', 'to', 'maintain', 'and', 'review', 'the', 'program', 'if', 'yes', 'do', 'it', 'include']


In [85]:
def split_into_lemmas(text):
    text = str(text).lower()
    words = TextBlob(text).words
    return (word.lemmatize(pos='v') for word in words)

In [88]:
vect = CountVectorizer(stop_words ='english', analyzer=split_into_lemmas, decode_error='replace')
tokenize_test(vect)

Number of features:  1376
Accuracy:  0.4


In [89]:
classifier

Unnamed: 0,question_num,question,response,maturity,misc_info,aup_ref,iso_num,iso_desc,sig_class,sig_class_num
0,A.1,Is there a risk assessment program that has be...,Yes,5.0,An enterprise risk assessment is performed ann...,A.1 IT & Infrastructure Risk Governance,5.1 6.1.2,"Leadership & Commitment, Information Security ...",Risk Assessment,11
1,A.1.1,"A risk assessment, conducted within the last 1...",Yes,,,A.2 IT & Infrastructure Risk Assessment Life C...,8.2,Information security risk assessment,Risk Assessment,11
2,A.1.2,Risk Governance?,Yes,,,A.1 IT & Infrastructure Risk Governance,,,Risk Assessment,11
3,A.1.3,"Range of assets to include: people, processes,...",Yes,,,A.1 IT & Infrastructure Risk Governance,,,Risk Assessment,11
4,A.1.4,"Range of threats to include: malicious, natura...",Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
5,A.1.5,Risk scoping?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
6,A.1.6,Risk context?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
7,A.1.7,Risk training plan?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
8,A.1.8,Risk evaluation criteria?,Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11
9,A.1.9,"Risk scenarios? If yes, do they include:",Yes,,,A.1 IT & Infrastructure Risk Governance,6.1.2,Information Security Risk Assessment,Risk Assessment,11


In [92]:
#help(LogisticRegression())