In [59]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # matrix construction
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

import pandas as pd
import json
import os

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots

In [22]:
nlp = spacy.load("en_core_web_sm", disable=['parser','tagger', 'parser', 'ner']) 

In [17]:
# Processing Json:
# reading json: 

dataset = "dataset_full_text.json"
path_data =  "../src/dataset_full_text.json"

with open(path_data) as file:
    open_data = json.load(file)
    
df = pd.DataFrame(open_data).transpose()
df = df.reset_index(drop=True)
    


In [14]:
df

Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text
0,https://www.symantec.com/connect/blogs/shamoon...,[enterprise-attack],[impact],"[Disk Structure Wipe, Data Destruction, Disk S...","[T1487, T1485, T1561.002]",\nEndpoint Protection - Symantec Enterprise\nP...
1,http://researchcenter.paloaltonetworks.com/201...,[enterprise-attack],"[impact, defense-evasion]","[Disk Structure Wipe, Masquerade Task or Servi...","[T1487, T1036.004, T1485, T1561.002]",\nShamoon 2: Return of the Disttrack Wiper\nPr...
2,https://media.kasperskycontenthub.com/wp-conte...,[enterprise-attack],[impact],"[Disk Structure Wipe, Data Destruction, Disk S...","[T1487, T1485, T1561.002]",FROM SHAMOON TO STONEDRILL\nWipers attacking S...
3,https://unit42.paloaltonetworks.com/shamoon-3-...,[enterprise-attack],[impact],"[Disk Structure Wipe, Data Destruction, Disk S...","[T1487, T1485, T1561.002]",\nShamoon 3 Targets Oil and Gas Organization\n...
4,https://www.cybereason.com/blog/dropping-ancho...,[enterprise-attack],[credential-access],[Password Managers],[T1555.005],\nDropping Anchor: From a TrickBot Infection t...
...,...,...,...,...,...,...
894,https://www.ezautomation.net/industry-articles...,[ics-attack],"[inhibit-response-function, collection-ics]","[Manipulate I/O Image, I/O Image]","[T0835, T0877]",\nPLC Ladder Logic Basics\nSelection Guide & D...
895,https://statescoop.com/tornado-sirens-in-dalla...,[ics-attack],[impair-process-control],[Unauthorized Command Message],[T0855],\nTornado sirens in Dallas suburbs deactivated...
896,https://www.blackhat.com/docs/asia-16/material...,[ics-attack],[collection-ics],[I/O Image],[T0877],Black Hat Asia 2016: PLC-Blaster 1\nPLC-BLASTE...
897,https://dragos.com/wp-content/uploads/CRASHOVE...,[ics-attack],[initial-access-ics],[Data Historian Compromise],[T0810],"\n©2018 Dragos, Inc. All rights reserved. [Pro..."


In [18]:
df_tech = df.explode(['tech_name', 'tech_id']).reset_index(drop = True)
df_tech.head()


Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text
0,https://www.symantec.com/connect/blogs/shamoon...,[enterprise-attack],[impact],Disk Structure Wipe,T1487,\nEndpoint Protection - Symantec Enterprise\nP...
1,https://www.symantec.com/connect/blogs/shamoon...,[enterprise-attack],[impact],Data Destruction,T1485,\nEndpoint Protection - Symantec Enterprise\nP...
2,https://www.symantec.com/connect/blogs/shamoon...,[enterprise-attack],[impact],Disk Structure Wipe,T1561.002,\nEndpoint Protection - Symantec Enterprise\nP...
3,http://researchcenter.paloaltonetworks.com/201...,[enterprise-attack],"[impact, defense-evasion]",Disk Structure Wipe,T1487,\nShamoon 2: Return of the Disttrack Wiper\nPr...
4,http://researchcenter.paloaltonetworks.com/201...,[enterprise-attack],"[impact, defense-evasion]",Masquerade Task or Service,T1036.004,\nShamoon 2: Return of the Disttrack Wiper\nPr...


In [25]:
# Cleaning:
 
df_tech['text'] = df_tech['text'].apply(lambda x: x.replace('\n', ' ')) # replacing \n with spaces
#df_tech.iloc[0, 5] - visualise one report

In [26]:
#df_tech['new_col'] = df_tech['text'].apply(lambda x: nlp(x))


KeyboardInterrupt: 

In [36]:
cv = CountVectorizer(analyzer='word', stop_words='english', lowercase=False,
                        min_df=0.1) # if words used less than 0.01 % --> ignore  
data = cv.fit_transform(df_tech['text']) 
df_dtm = pd.DataFrame(data.toarray(), columns=cv.get_feature_names())




In [40]:
df_concat = pd.concat(
    [df_dtm, df_tech[['tech_id', 'tech_name']]], axis=1)

print(df_concat.shape)

(1600, 1864)


In [41]:
df_concat.head()

Unnamed: 0,00,000,01,02,03,04,05,06,07,08,...,write,writing,written,www,xml,year,years,zero,tech_id,tech_name
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,T1487,Disk Structure Wipe
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,T1485,Data Destruction
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,T1561.002,Disk Structure Wipe
3,0,1,0,0,0,0,0,0,0,0,...,1,4,0,0,0,4,1,0,T1487,Disk Structure Wipe
4,0,1,0,0,0,0,0,0,0,0,...,1,4,0,0,0,4,1,0,T1036.004,Masquerade Task or Service


In [46]:
df_agg = df_concat.groupby(['tech_name']).mean().reset_index()
tech_name = df_agg['tech_name'].unique()[0] # get first tech name

In [52]:
df_agg.loc[df_agg['tech_name'] == tech_name].iloc[:,1:].transpose().sort_values(by = 0, ascending=False)

Unnamed: 0,0
file,8.0
account,6.5
password,6.5
user,6.0
shell,3.5
...,...
attempting,0.0
attempted,0.0
attempt,0.0
attacks,0.0


In [44]:
#df_tech.loc[df_tech['tech_name'] =='/etc/passwd and /etc/shadow']

Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text
1317,https://www.tldp.org/LDP/lame/LAME/linux-admin...,[enterprise-attack],[credential-access],/etc/passwd and /etc/shadow,T1003.008,Linux Password & Shadow File FormatsLinux Admi...
1318,https://www.cyberciti.biz/faq/unix-linux-passw...,[enterprise-attack],[credential-access],/etc/passwd and /etc/shadow,T1003.008,Please Wait... | Cloudflare Please enable coo...


In [37]:
df_dtm.head() # haven't lemmatised yet ! 

Unnamed: 0,00,000,01,02,03,04,05,06,07,08,...,worldwide,worth,write,writing,written,www,xml,year,years,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,4,0,0,0,4,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,4,0,0,0,4,1,0


In [72]:
# Visualise frequence of words:

def plot_freq_cats(df, top_n=15):
    """Plot the frequent words used in the different categories

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to get the word frequency
    top_n : int, default = 15
        number of most frequent words to show
    """
    nb_cats = df["tech_name"].nunique() # number of tech_names
    fig = make_subplots(rows=nb_cats, cols=1, vertical_spacing=0.02,
                       subplot_titles=["Word frequency for "+ cat for cat in df["tech_name"].unique()])
    
    for i, cat in enumerate(df["tech_name"].unique()):
        df_freq = (df_agg.loc[df_agg['tech_name'] == cat]
                   .iloc[:,1:].transpose().reset_index())
                  
        df_freq.columns = ['words', 'frequency']
        df_freq = (df_freq.sort_values(by = 'frequency', ascending=False)) 

                  
        fig_site = px.bar(df_freq.iloc[:top_n,:],
                      x="words", y="frequency", orientation="v",
                      width=800, height=300, title=f"Word frequency for {cat} ")
        fig.append_trace(fig_site["data"][0], row=i+1, col=1)
    
    fig.update_layout(
    autosize=False,
    width=800,
    height=4400,)
    fig.show()


In [73]:
plot_freq_cats(df_agg.iloc[:10, :])

In [74]:
# Train and test: First delete techniques less than 9 

# We fix the random state to have the same dataset in our different tests

x_train, x_test, y_train, y_test = train_test_split(df_concat.drop(["tech_name"], axis=1),
                                                    df_concat["tech_name"], test_size=0.3,
                                                    random_state=10)

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(x_train, y_train)

y_pred = naive_bayes_classifier.predict(x_test)

IndentationError: unexpected indent (<ipython-input-74-cb97b866ee29>, line 9)