# GROOT Basic Application

## Data Gathering

In [None]:
import requests
from bs4 import BeautifulSoup
import pickle #save data

# Scrapes the threat report from https://www.us-cert.gov/ncas/alerts/
def url_to_report(url):
    '''Return threat report specifically from https://www.us-cert.gov/ncas/alerts/ '''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml") #html parsing
    text = [p.text for p in soup.find(class_="region region-content").find_all(["h1","h2","h3", "h4","p","ul","table"])] 
    #search through the CISA webpage for the region region-content section (i.e. which contains the entirety of the threat report) 
    #and select the paragraphs, headers, list, etc.
    print(url)
    return text

# URL of the threat report that we want to analyze
urls = ['https://www.us-cert.gov/ncas/alerts/TA17-163A']

# Let's create an array of the number of threat reports  
number = ['1']

In [2]:
#Actually requests the threat reports from the CISA's webpage(s)
reports = [url_to_report(u) for u in urls]

https://www.us-cert.gov/ncas/alerts/TA17-163A


In [3]:
#Make a new directory to hold the text files
!mkdir reports

for i, n in enumerate(number):
    with open("reports/" + n + ".txt", "wb") as file:
        pickle.dump(reports[i], file)

In [4]:
#Load pickled files
data = {}
for i, n in enumerate(number):
    with open("reports/" + n + ".txt", "rb") as file:
        data[n] = pickle.load(file)

In [5]:
#Double check to make sure the data has been loaded properly
#data.keys()

In [6]:
#Checking (again) this time to the see the data's content
#data['1'][:99]

In [7]:
#Let's take a look at the data again
#next(iter(data.keys()))

In [8]:
#Notice that our dictionary is currently in key: number, value: text from report
#next(iter(data.values()))

In [9]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [10]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key,value) in data.items()}

In [11]:
#Creating the database using the pandas library
import pandas as pd
pd.set_option('max_colwidth', 150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['reports']
data_df = data_df.sort_index()
data_df

Unnamed: 0,reports
1,Alert (TA17-163A) CrashOverride Malware Systems Affected Industrial Control Systems Overview The National Cybersecurity and Communications Integra...


In [12]:
data_df.reports.loc['1'] #View the entirety of the data scraped from the threat report (i.e stored in the database)

'Alert (TA17-163A) CrashOverride Malware Systems Affected Industrial Control Systems Overview The National Cybersecurity and Communications Integration Center (NCCIC) is aware of public reports from ESET and Dragos outlining a new, highly capable Industrial Controls Systems (ICS) attack platform that was reportedly used in 2016 against critical infrastructure in Ukraine. As reported by ESET and Dragos, the CrashOverride malware is an extensible platform that could be used to target critical infrastructure sectors. NCCIC is working with its partners to validate the ESET and Dragos analysis, and develop a better understanding of the risk this new malware poses to U.S. critical infrastructure. Although this activity is still under investigation, NCCIC is sharing this report to provide organizations with detection and mitigation recommendations to help prevent future compromises within their critical infrastructure networks. NCCIC continues to work with interagency and international partne

## Entity and Relationship Extraction

### Manual Extraction

Using regular expressions and following the Applied Text Mining Coursea course, we were able to extract an email address, hash, and file extensions

In [14]:
#This command searchs for the existence of email addresses in the dataset
data_df.reports.str.findall('[A-Za-z0-9_]+@[A-Za-z0-9_]+(?:.dhs.gov|.gov|.com)')

1    [NCCICCustomerService@hq.dhs.gov]
Name: reports, dtype: object

In [24]:
#This line should extract all of the file extensions within the report
data_df.reports.str.findall('[A-Za-z0-9_]*(?:.dll|.exe)+')

1    [malware exe, dragos_crashoverride_exporting_dll, 101.dll, Crash101.dll, 104.dll, Crash104.dll, 61850.dll, Crash61850.dll, OPCClientDemo.dll, Cras...
Name: reports, dtype: object