# GROOT Basic Application

## Data Gathering

In [18]:
import requests
from bs4 import BeautifulSoup
import pickle #save data

# Scrapes the threat report from https://www.us-cert.gov/ncas/alerts/
def url_to_report(url):
    '''Return threat report specifically from https://www.us-cert.gov/ncas/alerts/ '''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml") #html parsing
    text = [p.text for p in soup.find(class_="region region-content").find_all(["h1","h2","h3", "h4","p","ul","table"])] 
    #search through the CISA webpage for the region region-content section (i.e. which contains the entirety of the threat report) 
    #and select the paragraphs, headers, list, etc.
    print(url)
    return text

# URL of the threat report that we want to analyze
urls = ['https://www.us-cert.gov/ncas/alerts/TA17-163A']

# Let's create an array of the number of threat reports  
number = ['1']

In [19]:
#Actually requests the threat reports from the CISA's webpage(s)
reports = [url_to_report(u) for u in urls]

https://www.us-cert.gov/ncas/alerts/TA17-163A


In [20]:
#Make a new directory to hold the text files
!mkdir reports

for i, n in enumerate(number):
    with open("reports/" + n + ".txt", "wb") as file:
        pickle.dump(reports[i], file)

mkdir: reports: File exists


In [21]:
#Load pickled files
data = {}
for i, n in enumerate(number):
    with open("reports/" + n + ".txt", "rb") as file:
        data[n] = pickle.load(file)

In [22]:
#Double check to make sure the data has been loaded properly
#data.keys()

In [23]:
#Checking (again) this time to the see the data's content
#data['1'][:99]

In [24]:
#Let's take a look at the data again
#next(iter(data.keys()))

In [25]:
#Notice that our dictionary is currently in key: number, value: text from report
#next(iter(data.values()))

In [26]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [27]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key,value) in data.items()}

In [28]:
#Creating the database using the pandas library
import pandas as pd
pd.set_option('max_colwidth', 150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['reports']
data_df = data_df.sort_index()
data_df

Unnamed: 0,reports
1,Alert (TA17-163A) CrashOverride Malware Systems Affected Industrial Control Systems Overview The National Cybersecurity and Communications Integra...


In [29]:
data_df.reports.loc['1'] #View the entirety of the data scraped from the threat report (i.e stored in the database)

'Alert (TA17-163A) CrashOverride Malware Systems Affected Industrial Control Systems Overview The National Cybersecurity and Communications Integration Center (NCCIC) is aware of public reports from ESET and Dragos outlining a new, highly capable Industrial Controls Systems (ICS) attack platform that was reportedly used in 2016 against critical infrastructure in Ukraine. As reported by ESET and Dragos, the CrashOverride malware is an extensible platform that could be used to target critical infrastructure sectors. NCCIC is working with its partners to validate the ESET and Dragos analysis, and develop a better understanding of the risk this new malware poses to U.S. critical infrastructure. Although this activity is still under investigation, NCCIC is sharing this report to provide organizations with detection and mitigation recommendations to help prevent future compromises within their critical infrastructure networks. NCCIC continues to work with interagency and international partne

### Manual Extraction

In [13]:
#This command searchs for the existence of email addresses in the dataset
data_df.reports.str.findall('[A-Za-z0-9_]+@[A-Za-z0-9_]+(?:.dhs.gov|.gov|.com)')

1    [NCCICCustomerService@hq.dhs.gov]
Name: reports, dtype: object

In [14]:
#This line should extract all of the file extensions within the report
data_df.reports.str.findall('[A-Za-z0-9_]*(?:.dll|.exe)+')

1    [malware exe, dragos_crashoverride_exporting_dll, 101.dll, Crash101.dll, 104.dll, Crash104.dll, 61850.dll, Crash61850.dll, OPCClientDemo.dll, Cras...
Name: reports, dtype: object

## Data Cleaning

In [36]:
import re
import string 

def clean_text_round1(text):
   '''Make text lowercase, remove text in square brackets, remove punctuation, and remove words with numbers''' 
   text = text.lower() #transform text to lowercase; we can add additional functions to further clean the data later on if necessary
   #text = re.sub('\[.*?\]','', text) #remove punctuation
   text = re.sub('[%s]'% re.escape(string.punctuation),'',text)
   text = re.sub('\w*\d\w*','', text) #remove text with numbers
   return text

round1 = lambda x: clean_text_round1(x)

In [37]:
#Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.reports.apply(round1))
data_clean

Unnamed: 0,reports
1,alert crashoverride malware systems affected industrial control systems overview the national cybersecurity and communications integration center...


In [44]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    #text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [45]:
#Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.reports.apply(round2))
data_clean

Unnamed: 0,reports
1,alert crashoverride malware systems affected industrial control systems overview the national cybersecurity and communications integration center...


In [46]:
#Let's pickle the file for later use
data_clean.to_pickle("corpus.pkl")

## Entity and Relationship Extraction

Using regular expressions and following the Applied Text Mining Coursea course, we were able to extract an email address, hash, and file extensions

### Natural Language Toolkit (NLTK)

Using the NLTK, we attempted to extract entities as well as their relationships from the threat reports. Before applying this process we may want to remove 

In [32]:
import nltk
from nltk.tokenize import word_tokenize

tokenize = word_tokenize(data_clean.reports.loc['1'])
tokenize

['alert',
 'crashoverride',
 'malware',
 'systems',
 'affected',
 'industrial',
 'control',
 'systems',
 'overview',
 'the',
 'national',
 'cybersecurity',
 'and',
 'communications',
 'integration',
 'center',
 'nccic',
 'is',
 'aware',
 'of',
 'public',
 'reports',
 'from',
 'eset',
 'and',
 'dragos',
 'outlining',
 'a',
 'new',
 'highly',
 'capable',
 'industrial',
 'controls',
 'systems',
 'ics',
 'attack',
 'platform',
 'that',
 'was',
 'reportedly',
 'used',
 'in',
 'against',
 'critical',
 'infrastructure',
 'in',
 'ukraine',
 'as',
 'reported',
 'by',
 'eset',
 'and',
 'dragos',
 'the',
 'crashoverride',
 'malware',
 'is',
 'an',
 'extensible',
 'platform',
 'that',
 'could',
 'be',
 'used',
 'to',
 'target',
 'critical',
 'infrastructure',
 'sectors',
 'nccic',
 'is',
 'working',
 'with',
 'its',
 'partners',
 'to',
 'validate',
 'the',
 'eset',
 'and',
 'dragos',
 'analysis',
 'and',
 'develop',
 'a',
 'better',
 'understanding',
 'of',
 'the',
 'risk',
 'this',
 'new',
 'malw

In [46]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) #stop words in the english language
#stop_words

#remove stop words
filtered = [w for w in tokenize if not w in stop_words]
filtered

['alert',
 'crashoverride',
 'malware',
 'systems',
 'affected',
 'industrial',
 'control',
 'systems',
 'overview',
 'national',
 'cybersecurity',
 'communications',
 'integration',
 'center',
 'nccic',
 'aware',
 'public',
 'reports',
 'eset',
 'dragos',
 'outlining',
 'new',
 'highly',
 'capable',
 'industrial',
 'controls',
 'systems',
 'ics',
 'attack',
 'platform',
 'reportedly',
 'used',
 'critical',
 'infrastructure',
 'ukraine',
 'reported',
 'eset',
 'dragos',
 'crashoverride',
 'malware',
 'extensible',
 'platform',
 'could',
 'used',
 'target',
 'critical',
 'infrastructure',
 'sectors',
 'nccic',
 'working',
 'partners',
 'validate',
 'eset',
 'dragos',
 'analysis',
 'develop',
 'better',
 'understanding',
 'risk',
 'new',
 'malware',
 'poses',
 'us',
 'critical',
 'infrastructure',
 'although',
 'activity',
 'still',
 'investigation',
 'nccic',
 'sharing',
 'report',
 'provide',
 'organizations',
 'detection',
 'mitigation',
 'recommendations',
 'help',
 'prevent',
 'futu

In [None]:
from collection import Counter

top = [word for ]

In [48]:
#Part of Speech (POS) Tagging
#POS tag list:
'''
CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent\'s
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when
'''
tagged = nltk.pos_tag(filtered)
print(tagged)

[('alert', 'JJ'), ('crashoverride', 'NN'), ('malware', 'NN'), ('systems', 'NNS'), ('affected', 'VBD'), ('industrial', 'JJ'), ('control', 'NN'), ('systems', 'NNS'), ('overview', 'VBP'), ('national', 'JJ'), ('cybersecurity', 'NN'), ('communications', 'NNS'), ('integration', 'NN'), ('center', 'NN'), ('nccic', 'NN'), ('aware', 'JJ'), ('public', 'JJ'), ('reports', 'NNS'), ('eset', 'VBD'), ('dragos', 'NNS'), ('outlining', 'VBG'), ('new', 'JJ'), ('highly', 'RB'), ('capable', 'JJ'), ('industrial', 'JJ'), ('controls', 'NNS'), ('systems', 'NNS'), ('ics', 'NNS'), ('attack', 'VBP'), ('platform', 'NN'), ('reportedly', 'RB'), ('used', 'VBD'), ('critical', 'JJ'), ('infrastructure', 'NN'), ('ukraine', 'NN'), ('reported', 'VBD'), ('eset', 'JJ'), ('dragos', 'NNS'), ('crashoverride', 'VBP'), ('malware', 'NN'), ('extensible', 'JJ'), ('platform', 'NN'), ('could', 'MD'), ('used', 'VBN'), ('target', 'VB'), ('critical', 'JJ'), ('infrastructure', 'NN'), ('sectors', 'NNS'), ('nccic', 'VBP'), ('working', 'VBG'),

In [51]:
namedEnt = nltk.ne_chunk(tagged, binary=True)
namedEnt.draw()

KeyboardInterrupt: 