In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # matrix construction
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

import pandas as pd
import json
import os

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval

import sklearn.metrics
import numpy as np

In [2]:
# Load the small English spaCy pipeline with the parser, tagger and NER
# components disabled; each component name is listed exactly once.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'tagger', 'ner'])

# Opening Files: 

In [54]:
# Load the two raw JSON datasets and turn each one into a DataFrame.

dataset1 = "dataset_full_text.json"
path_data1 = "../src/dataset_full_text.json"

dataset2 = "dataset_malware_text.json"
path_data2 = "../src/dataset_malware_text.json"

# Decode explicitly as UTF-8: the scraped texts contain non-ASCII characters,
# and the platform default encoding is not UTF-8 everywhere.
with open(path_data1, encoding="utf-8") as file:
    open_data = json.load(file)

with open(path_data2, encoding="utf-8") as file:
    open_data2 = json.load(file)

# pd.DataFrame(dict) puts one top-level JSON key per column, so transpose to
# get one row per key.

# dataset1: the top-level keys are discarded (drop=True).
df1 = pd.DataFrame(open_data).transpose().reset_index(drop=True)

# dataset2: the top-level keys are kept as the first column (drop=False) and
# named 'url' below -- presumably they are the source URLs; verify against the file.
df2 = pd.DataFrame(open_data2).transpose().reset_index(drop=False)
df2.columns = ['url', 'mitre_domain', 'tech_name', 'tech_id', 'software_id', 'text']  # renaming columns



# Merging our datasets:

In [167]:
# Stack dataset1 on top of dataset2 row-wise; the original row labels are kept.
df = pd.concat(objs=[df1, df2])

# Cleaning: 

In [168]:
# Normalise the list-valued columns that can be missing:
# NaN -> "" -> list("") == [], while existing sequences go through list().
for list_col in ('tactic_name', 'software_id'):
    df[list_col] = df[list_col].fillna("").apply(list)

In [169]:
# Discard rows whose text is exactly a single newline character.
df = df[df['text'].ne('\n')]

In [171]:
# Rows whose text repeats an earlier row (the first occurrence is not included).
dup = df[df.duplicated(subset=['text'], keep='first')]

In [172]:
# Keep the first row for each distinct text and renumber the rows from 0.
df_no_dup = df.drop_duplicates(subset=['text'], keep='first', ignore_index=True)

In [174]:
# Kept rows whose text also occurs among the dropped duplicates, ordered by text.
df_no_dup.loc[df_no_dup['text'].isin(dup['text'])].sort_values('text')

Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text,software_id
996,http://www.trendmicro.com/cloud-content/us/pdf...,"[Enterprise, Enterprise, Enterprise, Enterprise]",[],"[Application Layer Protocol: Web Protocols, Bo...","[T1071, T1547, T1005, T1573, T1105, T1106, T15...",\n#1 in Cloud Security & Endpoint Cybersecurit...,[S0011]
302,https://www.trendmicro.de/cloud-content/us/pdf...,"[enterprise-attack, enterprise-attack]","[defense-evasion, execution]","[Control Panel, Rundll32, Control Panel Items,...","[T1218.002, T1085, T1196, T1218.011, T1553.004...",\n#1 in Cloud Security & Endpoint Cybersecurit...,[]
33,https://web.archive.org/web/20210825130434/htt...,"[enterprise-attack, Enterprise]",[collection],"[Browser Session Hijacking, Abuse Elevation Co...","[T1185, T1548, T1134, T1071, T1185, T1059, T10...","\n3.8\tManual\tStrategic\tCyber\tLLC\n,www.cob...",[]
1028,http://blog.trendmicro.com/trendlabs-security-...,"[Enterprise, Enterprise, Enterprise]",[],"[Pre-OS Boot: System Firmware, Rootkit, Comman...","[T1542, T1014, T1059, T1095, T1014, T1205, T10...",\n404\ndismiss\nAlerts\nNo new notifications a...,[S0047]
303,https://blog.trendmicro.com/trendlabs-security...,"[enterprise-attack, enterprise-attack, enterpr...","[defense-evasion, execution]","[Control Panel, Control Panel Items, File Dele...","[T1218.002, T1196, T1107, T1090.002, T1090, T1...",\n404\ndismiss\nAlerts\nNo new notifications a...,[]
...,...,...,...,...,...,...,...
1115,http://research.zscaler.com/2015/08/chinese-cy...,"[Enterprise, Enterprise]",[],[Boot or Logon Autostart Execution: Registry R...,"[T1547, T1083, T1574, T1070, T1036, T1547, T10...",Security Research | Zscaler Skip to main conte...,[S0070]
81,https://www.wired.com/images_blogs/threatlevel...,"[enterprise-attack, Enterprise]","[persistence, privilege-escalation]","[Windows Service, Access Token Manipulation: T...","[T1543.003, T1134, T1087, T1071, T1560, T1547,...",Security Response\nContents\nIntroduction\n......,[]
8,https://web.archive.org/web/20190717233006/htt...,"[enterprise-attack, Enterprise]",[initial-access],"[Supply Chain Compromise, Obfuscated Files or ...","[T1195, T1027, T1027, T1027, T1027, T1027, T1027]","Security Response Overview\nIn 2009, Google wa...",[]
5,https://www.welivesecurity.com/wp-content/uplo...,"[enterprise-attack, Enterprise]",[persistence],"[Transport Agent, Application Layer Protocol: ...","[T1505.002, T1071, T1560, T1119, T1020, T1059,...",TURLA\nLIGHTNEURON\nOne email away from\nremot...,[]


In [175]:
# The dropped duplicate rows, ordered by text for side-by-side comparison.
dup.sort_values('text', ascending=True)

Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text,software_id
255,https://www.trendmicro.de/cloud-content/us/pdf...,[Enterprise],[],"[Application Layer Protocol: Web Protocols, Bo...","[T1071, T1547, T1059, T1132, T1005, T1083, T15...",\n#1 in Cloud Security & Endpoint Cybersecurit...,[S0015]
456,https://www.trendmicro.de/cloud-content/us/pdf...,[Enterprise],[],"[Application Layer Protocol: Web Protocols, De...","[T1071, T1140, T1105, T1027, T1518, T1082]",\n#1 in Cloud Security & Endpoint Cybersecurit...,[S0388]
742,http://www.trendmicro.com/cloud-content/us/pdf...,[Enterprise],[],"[Archive Collected Data, Encrypted Channel: Sy...","[T1560, T1573]",\n#1 in Cloud Security & Endpoint Cybersecurit...,[S0010]
647,http://www.trendmicro.com/cloud-content/us/pdf...,[enterprise-attack],"[defense-evasion, credential-access]","[Install Root Certificate, Install Root Certif...","[T1553.004, T1130, T1111]",\n#1 in Cloud Security & Endpoint Cybersecurit...,[]
301,https://web.archive.org/web/20210825130434/htt...,[Enterprise],[],[Abuse Elevation Control Mechanism: Bypass Use...,"[T1548, T1134, T1071, T1185, T1059, T1030, T10...","\n3.8\tManual\tStrategic\tCyber\tLLC\n,www.cob...",[S0154]
...,...,...,...,...,...,...,...
701,http://research.zscaler.com/2016/01/malicious-...,[Enterprise],[],[Boot or Logon Autostart Execution: Registry R...,"[T1547, T1059, T1083, T1562, T1105, T1056, T10...",Security Research | Zscaler Skip to main conte...,[S0088]
79,https://www.wired.com/images_blogs/threatlevel...,[Enterprise],[],[Access Token Manipulation: Token Impersonatio...,"[T1134, T1087, T1071, T1560, T1547, T1543, T11...",Security Response\nContents\nIntroduction\n......,[S0603]
562,https://web.archive.org/web/20190717233006/htt...,[Enterprise],[],[Obfuscated Files or Information],[T1027],"Security Response Overview\nIn 2009, Google wa...",[S0203]
754,https://www.welivesecurity.com/wp-content/uplo...,[Enterprise],[],"[Application Layer Protocol: Mail Protocols, A...","[T1071, T1560, T1119, T1020, T1059, T1005, T15...",TURLA\nLIGHTNEURON\nOne email away from\nremot...,[S0395]


In [178]:
# Fold the labels of every dropped duplicate into the row that was kept for
# the same text, so that de-duplication does not lose any annotation.

# Build the text -> kept-row-label lookup once instead of comparing the whole
# 'text' column against each duplicate. Texts are unique in df_no_dup because
# it is the result of drop_duplicates(subset='text').
kept_row_by_text = {text: idx for idx, text in df_no_dup['text'].items()}

for _, row in dup.iterrows():
    row_id = kept_row_by_text[row['text']]
    for col in ['mitre_domain', 'tech_id', 'tech_name', 'software_id', 'tactic_name']:
        # The cell holds a Python list; appending to it updates df_no_dup in place.
        merged_list = df_no_dup.loc[row_id, col]
        for item in row[col]:
            # Only add values the kept row does not already have.
            if item not in merged_list:
                merged_list.append(item)
        

In [179]:
# Same view as before the merge: kept rows that had duplicates, ordered by text.
df_no_dup.loc[df_no_dup['text'].isin(dup['text'])].sort_values('text')

Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text,software_id
996,http://www.trendmicro.com/cloud-content/us/pdf...,"[Enterprise, Enterprise, Enterprise, Enterprise]",[],"[Application Layer Protocol: Web Protocols, Bo...","[T1071, T1547, T1005, T1573, T1105, T1106, T15...",\n#1 in Cloud Security & Endpoint Cybersecurit...,"[S0011, S0015, S0388, S0010]"
302,https://www.trendmicro.de/cloud-content/us/pdf...,"[enterprise-attack, enterprise-attack]","[defense-evasion, execution, credential-access]","[Control Panel, Rundll32, Control Panel Items,...","[T1218.002, T1085, T1196, T1218.011, T1553.004...",\n#1 in Cloud Security & Endpoint Cybersecurit...,[]
33,https://web.archive.org/web/20210825130434/htt...,"[enterprise-attack, Enterprise]",[collection],"[Browser Session Hijacking, Abuse Elevation Co...","[T1185, T1548, T1134, T1071, T1185, T1059, T10...","\n3.8\tManual\tStrategic\tCyber\tLLC\n,www.cob...",[S0154]
1028,http://blog.trendmicro.com/trendlabs-security-...,"[Enterprise, Enterprise, Enterprise]",[],"[Pre-OS Boot: System Firmware, Rootkit, Comman...","[T1542, T1014, T1059, T1095, T1014, T1205, T10...",\n404\ndismiss\nAlerts\nNo new notifications a...,"[S0047, S0221, S0386]"
303,https://blog.trendmicro.com/trendlabs-security...,"[enterprise-attack, enterprise-attack, enterpr...","[defense-evasion, execution, command-and-control]","[Control Panel, Control Panel Items, File Dele...","[T1218.002, T1196, T1107, T1090.002, T1090, T1...",\n404\ndismiss\nAlerts\nNo new notifications a...,[]
...,...,...,...,...,...,...,...
1115,http://research.zscaler.com/2015/08/chinese-cy...,"[Enterprise, Enterprise]",[],[Boot or Logon Autostart Execution: Registry R...,"[T1547, T1083, T1574, T1070, T1036, T1547, T10...",Security Research | Zscaler Skip to main conte...,"[S0070, S0088]"
81,https://www.wired.com/images_blogs/threatlevel...,"[enterprise-attack, Enterprise]","[persistence, privilege-escalation]","[Windows Service, Access Token Manipulation: T...","[T1543.003, T1134, T1087, T1071, T1560, T1547,...",Security Response\nContents\nIntroduction\n......,[S0603]
8,https://web.archive.org/web/20190717233006/htt...,"[enterprise-attack, Enterprise]",[initial-access],"[Supply Chain Compromise, Obfuscated Files or ...","[T1195, T1027, T1027, T1027, T1027, T1027, T1027]","Security Response Overview\nIn 2009, Google wa...",[S0203]
5,https://www.welivesecurity.com/wp-content/uplo...,"[enterprise-attack, Enterprise]",[persistence],"[Transport Agent, Application Layer Protocol: ...","[T1505.002, T1071, T1560, T1119, T1020, T1059,...",TURLA\nLIGHTNEURON\nOne email away from\nremot...,[S0395]


In [188]:
# Print the full text of the first (by text order) kept row that had duplicates.
print(df_no_dup.loc[df_no_dup['text'].isin(dup['text'])].sort_values('text')['text'].iloc[0])


#1 in Cloud Security & Endpoint Cybersecurity | Trend Micro (UK)
dismiss
Alerts
No new notifications at this time.
Download
Scan Engines
All Pattern Files
All Downloads
Subscribe to Download Center RSS
Buy
Find a Partner
Home Office Online Store
Renew Online
Free Tools
Contact Sales
Locations Worldwide
+44 (0) 203 549 3300
Small Business
Buy Online
Renew Online
Region
The Americas
United States
Brasil
Canada
México
Middle East & Africa
South Africa
Middle East and North Africa
Europe
België (Belgium)
Česká Republika
Danmark
Deutschland, Österreich Schweiz
España
France
Ireland
Italia
Nederland
Norge (Norway)
Polska (Poland)
Suomi (Finland)
Sverige (Sweden)
Türkiye (Turkey)
United Kingdom
Asia & Pacific
Australia
Центральная Азия (Central Asia)
Hong Kong (English)
香港 (中文) (Hong Kong)
भारत गणराज्य (India)
Indonesia
日本 (Japan)
대한민국 (South Korea)
Malaysia
Монголия (Mongolia) and рузия (Georgia)
New Zealand
Philippines
Singapore
台灣 (Taiwan)
ประเทศไทย (Thailand)
Việt Nam
Log In
My Support
L

In [189]:
# Print the full text of the second (by text order) kept row that had duplicates.
print(df_no_dup.loc[df_no_dup['text'].isin(dup['text'])].sort_values('text')['text'].iloc[1])


#1 in Cloud Security & Endpoint Cybersecurity | Trend Micro (UK)
dismiss
Alerts
No new notifications at this time.
Download
Scan Engines
All Pattern Files
All Downloads
Subscribe to Download Center RSS
Buy
Find a Partner
Home Office Online Store
Renew Online
Free Tools
Contact Sales
Locations Worldwide
+44 (0) 203 549 3300
Small Business
Buy Online
Renew Online
Region
The Americas
United States
Brasil
Canada
México
Middle East & Africa
South Africa
Middle East and North Africa
Europe
België (Belgium)
Česká Republika
Danmark
Deutschland, Österreich Schweiz
España
France
Ireland
Italia
Nederland
Norge (Norway)
Polska (Poland)
Россия (Russia)
Suomi (Finland)
Sverige (Sweden)
Türkiye (Turkey)
United Kingdom
Asia Pacific
Australia
Hong Kong (English)
香港 (中文) (Hong Kong)
भारत गणराज्य (India)
Indonesia
日本 (Japan)
대한민국 (South Korea)
Malaysia
New Zealand
Philippines
Singapore
台灣 (Taiwan)
ประเทศไทย (Thailand)
Việt Nam
Log In
My Support
Log In to Support
Partner Portal
Home Solutions
My Account


# Filtering URLs: 

In [190]:
# Continue with the de-duplicated frame. This only rebinds the name `df`;
# no copy is made, so `df` and `df_no_dup` refer to the same object here.
df = df_no_dup

In [194]:
def is_url_relevant(url, excluded_keywords=(
        'microsoft', 'apple', 'github', 'wikipedia',
        'support.office', 'amazon', 'gitlab', 'capec', 'docker', 'youtube', 'google', 'mitre', 'zip',
        'twitter')):
    """Return True unless the URL contains one of the excluded keywords.

    Parameters
    ----------
    url : str
        The URL to check.
    excluded_keywords : iterable of str, optional
        Substrings that mark a URL as not relevant. Defaults to the list of
        sources this notebook filters out.

    Returns
    -------
    bool
        False if any keyword occurs anywhere in the URL, True otherwise.

    Notes
    -----
    Matching is a plain substring test over the whole URL (host, path and
    query), so a short keyword such as 'zip' also excludes URLs that merely
    contain those letters. The comparison ignores case, so a URL spelled
    'GitHub.com' is excluded just like 'github.com'.
    """
    lowered_url = url.lower()
    return not any(keyword.lower() in lowered_url for keyword in excluded_keywords)

# Keep only the rows whose URL passes is_url_relevant, then display the result.
relevant_mask = df['url'].apply(is_url_relevant)
df = df[relevant_mask]
df

Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text,software_id
0,https://www.symantec.com/connect/blogs/shamoon...,[enterprise-attack],[impact],"[Disk Structure Wipe, Data Destruction, Disk S...","[T1487, T1485, T1561.002, T1058, T1574.011, T1...",\nEndpoint Protection - Symantec Enterprise\nP...,[]
1,http://researchcenter.paloaltonetworks.com/201...,[enterprise-attack],"[impact, defense-evasion]","[Disk Structure Wipe, Masquerade Task or Servi...","[T1487, T1036.004, T1485, T1561.002]",\nShamoon 2: Return of the Disttrack Wiper\nPr...,[]
2,https://media.kasperskycontenthub.com/wp-conte...,"[enterprise-attack, Enterprise]",[impact],"[Disk Structure Wipe, Data Destruction, Disk S...","[T1487, T1485, T1561.002, T1059, T1485, T1070,...",FROM SHAMOON TO STONEDRILL\nWipers attacking S...,[S0380]
3,https://unit42.paloaltonetworks.com/shamoon-3-...,[enterprise-attack],[impact],"[Disk Structure Wipe, Data Destruction, Disk S...","[T1487, T1485, T1561.002]",\nShamoon 3 Targets Oil and Gas Organization\n...,[]
4,https://www.cybereason.com/blog/dropping-ancho...,[enterprise-attack],[credential-access],[Password Managers],[T1555.005],\nDropping Anchor: From a TrickBot Infection t...,[]
...,...,...,...,...,...,...,...
1553,https://securelist.com/transparent-tribe-part-...,[Enterprise],[],"[Audio Capture, Boot or Logon Autostart Execut...","[T1123, T1547, T1059, T1555, T1025, T1083, T10...","\nTransparent Tribe: Evolution analysis, part ...",[S0115]
1554,https://www.fidelissecurity.com/sites/default/...,[Enterprise],[],"[Application Layer Protocol: Web Protocols, Bo...","[T1071, T1547, T1059, T1070, T1105, T1027, T1218]",WWW.FIDELISSECURITY.COM\n©Fidelis Cybersecuri...,[S0087]
1555,https://www.fidelissecurity.com/threatgeek/arc...,[Enterprise],[],"[Encrypted Channel: Symmetric Cryptography, En...",[T1573],\nIntroducing Hi-Zor RAT - Fidelis Cybersecuri...,[S0087]
1556,https://researchcenter.paloaltonetworks.com/20...,[Enterprise],[],"[Application Layer Protocol: Web Protocols, Ar...","[T1071, T1560, T1059, T1140, T1105, T1505, T1033]",\nOilRig uses RGDoor IIS Backdoor on Targets i...,[S0258]


In [198]:
# Preview the URL and raw text of the first 12 rows.
# Slice by position with head(): the row labels have gaps after the URL
# filter, so a stop condition based on the label would not count rows.
for _, row in df.head(12).iterrows():
    print('--------------')
    print(row['url'])
    print('--------------')
    print(row['text'])
    

--------------
https://www.symantec.com/connect/blogs/shamoon-attacks
--------------

Endpoint Protection - Symantec Enterprise
Products
Applications
Support
Company
How To Buy
Skip to main content (Press Enter).
Sign in
Skip auxiliary navigation (Press Enter).
Register
Skip main navigation (Press Enter).
Toggle navigation
Search Options
HomeMy CommunitiesCommunities All CommunitiesEnterprise SoftwareMainframe SoftwareSymantec EnterpriseBlogs All BlogsEnterprise SoftwareMainframe SoftwareSymantec EnterpriseEvents All EventsEnterprise SoftwareMainframe SoftwareSymantec EnterpriseWater CoolerGroups Enterprise SoftwareMainframe SoftwareSymantec EnterpriseMembers
Endpoint Protection
Â View Only
Community Home
Threads
Library
Events
Members
Back to Library
The Shamoon Attacks
1
Recommend
Aug 16, 2012 11:37 AM
A L Johnson
W32.Disttrack is a new threat that is being used in specific targeted attacks against at least one organization in the energy sector.
It is a destructive malware that corru

In [199]:
# Save the merged, de-duplicated and URL-filtered dataset to CSV.
# index = False: the row labels are not written to the file.
# NOTE(review): the list-valued columns are presumably written as their string
# representation and need parsing when the file is read back -- confirm.

df.to_csv("../src/merged_dataset.csv", index = False)


# Formatting: 

In [14]:
# One row per (document, technique ID): expand the tech_id lists and renumber the rows.
df_tech = df.explode('tech_id', ignore_index=True)

In [19]:
# Show the first rows of the model input frame.
# FIXME(review): `df_for_model` is not defined by any cell visible in this
# section, so this cell only works with leftover kernel state -- restore the
# cell that builds it or remove this one.
df_for_model.head()

Unnamed: 0,00,000,0000,00000,000000,00000000,0000000000,0000000017,00000001,00000002,...,台灣,日本,日本語,香港,대한민국,한국어,ﬁle,ﬁles,ﬁrst,tech_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T1548
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T1071
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T1547
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T1543
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T1555


In [21]:
# Per-technique mean of the columns of df_concat, plus the first technique name.
# FIXME(review): `df_concat` is not defined by any cell visible in this
# section; the recorded output of this cell is a NameError.
df_agg = df_concat.groupby(['tech_name']).mean().reset_index()
tech_name = df_agg['tech_name'].unique()[0] # get first tech name

NameError: name 'df_concat' is not defined

In [20]:
# For the selected technique, list all aggregated columns sorted from highest to lowest value.
# FIXME(review): depends on `df_agg` and `tech_name` from the cell above, which
# fails with a NameError; the recorded output of this cell is a NameError too.
df_agg.loc[df_agg['tech_name'] == tech_name].iloc[:,1:].transpose().sort_values(by = 0, ascending=False)

NameError: name 'df_agg' is not defined

# Add Tactic to dataset:

In [22]:
# Reload the dataset from CSV, rebinding `df` (the in-memory frame from the cells above is dropped).
# NOTE(review): no cell visible here writes 'merged_dataset_noMalwareNames.csv'
# (the export above goes to '../src/merged_dataset.csv'); presumably it comes
# from a separate preprocessing step -- document where.
df = pd.read_csv('merged_dataset_noMalwareNames.csv')

In [23]:
# Display the reloaded frame.
df

Unnamed: 0,url,mitre_domain,tactic_name,tech_name,tech_id,text,software_id
0,https://www.symantec.com/connect/blogs/shamoon...,['enterprise-attack'],['impact'],"['Disk Structure Wipe', 'Data Destruction', 'D...","['T1487', 'T1485', 'T1561', 'T1058', 'T1574', ...",Endpoint Protection Symantec Enterprise Pro...,[]
1,http://researchcenter.paloaltonetworks.com/201...,['enterprise-attack'],"['impact', 'defense-evasion']","['Disk Structure Wipe', 'Masquerade Task or Se...","['T1487', 'T1036', 'T1485', 'T1561']",MALWARE_NAME Return of the MALWARE_NAME...,[]
2,https://media.kasperskycontenthub.com/wp-conte...,"['enterprise-attack', 'Enterprise']",['impact'],"['Disk Structure Wipe', 'Data Destruction', 'D...","['T1487', 'T1485', 'T1561', 'T1059', 'T1485', ...",FROM MALWARE_NAME TO MALWARE_NAME Wipers a...,['S0380']
3,https://unit42.paloaltonetworks.com/shamoon-3-...,['enterprise-attack'],['impact'],"['Disk Structure Wipe', 'Data Destruction', 'D...","['T1487', 'T1485', 'T1561']",MALWARE_NAME Targets Oil and Gas Organiza...,[]
4,https://www.cybereason.com/blog/dropping-ancho...,['enterprise-attack'],['credential-access'],['Password Managers'],['T1555'],Dropping MALWARE_NAME From a MALWARE_NAME...,[]
...,...,...,...,...,...,...,...
1515,https://securelist.com/transparent-tribe-part-...,['Enterprise'],[],"['Audio Capture', 'Boot or Logon Autostart Exe...","['T1123', 'T1547', 'T1059', 'T1555', 'T1025', ...",Transparent Tribe Evolution analysis part ...,['S0115']
1516,https://www.fidelissecurity.com/sites/default/...,['Enterprise'],[],"['Application Layer Protocol: Web Protocols', ...","['T1071', 'T1547', 'T1059', 'T1070', 'T1105', ...",WWW FIDELISSECURITY COM Fidelis Cybersecurit...,['S0087']
1517,https://www.fidelissecurity.com/threatgeek/arc...,['Enterprise'],[],"['Encrypted Channel: Symmetric Cryptography', ...",['T1573'],Introducing Hi Zor RAT Fidelis Cybersecurit...,['S0087']
1518,https://researchcenter.paloaltonetworks.com/20...,['Enterprise'],[],"['Application Layer Protocol: Web Protocols', ...","['T1071', 'T1560', 'T1059', 'T1140', 'T1105', ...",OilRig uses MALWARE_NAME IIS Backdoor on Ta...,['S0258']


In [24]:
# The CSV stores list-valued columns as text (e.g. "['T1487', 'T1485']");
# parse them back into Python lists. 'tactic_name' is stored the same way,
# so it is parsed along with the other list columns.
for col in ['mitre_domain', 'tactic_name', 'tech_name', 'tech_id', 'software_id']:
    # Only strings are parsed, so re-running this cell on already-parsed
    # lists is a no-op instead of an error from literal_eval.
    df[col] = df[col].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [25]:
# Load the tactic dataset.
# Note: this rebinds `open_data`, which earlier in the notebook held the
# contents of dataset_full_text.json.
tactic_dataset = "tactic_dataset.json"
path_dataset = "../src/tactic_dataset.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(path_dataset, encoding="utf-8") as file:
    open_data = json.load(file)

In [26]:
# Inspect the loaded tactic records (a dict keyed by tactic ID).
open_data

{'TA0043': {'Tactic_ID': 'TA0043',
  'Link': ['https://attack.mitre.org//tactics/TA0043'],
  'Tactic_Name': ['Reconnaissance'],
  'Description': ['The adversary is trying to gather information they can use to plan future operations.'],
  'Technique_ID': [['T1595',
    'T1592',
    'T1589',
    'T1590',
    'T1591',
    'T1598',
    'T1597',
    'T1596',
    'T1593',
    'T1594']]},
 'TA0042': {'Tactic_ID': 'TA0042',
  'Link': ['https://attack.mitre.org//tactics/TA0042'],
  'Tactic_Name': ['Resource Development'],
  'Description': ['The adversary is trying to establish resources they can use to support operations.'],
  'Technique_ID': [['T1583',
    'T1586',
    'T1584',
    'T1587',
    'T1585',
    'T1588',
    'T1608']]},
 'TA0001': {'Tactic_ID': 'TA0001',
  'Link': ['https://attack.mitre.org//tactics/TA0001'],
  'Tactic_Name': ['Initial Access'],
  'Description': ['The adversary is trying to get into your network.'],
  'Technique_ID': [['T1189',
    'T1190',
    'T1133',
    'T1200'

In [27]:
def tactic_list(tech_id, tactic_map=None):
    """Return the IDs of all tactics that contain at least one of the given techniques.

    Parameters
    ----------
    tech_id : iterable of str
        Technique IDs to look up, e.g. ``['T1595', 'T1189']``.
    tactic_map : dict, optional
        Mapping ``tactic_id -> record`` in which ``record['Technique_ID'][0]``
        is the list of technique IDs belonging to that tactic. When omitted,
        the notebook-level ``open_data`` is used; passing it explicitly avoids
        depending on whichever dataset ``open_data`` currently refers to.

    Returns
    -------
    list of str
        The matching tactic IDs, in the iteration order of ``tactic_map``.
        Empty when no technique matches.
    """
    if tactic_map is None:
        # Looked up at call time, so the cell that loads the tactic dataset
        # must have been run before this function is called without a map.
        tactic_map = open_data

    wanted = set(tech_id)
    return [
        tactic_id
        for tactic_id, record in tactic_map.items()
        # Technique IDs are nested one level deep: 'Technique_ID' is [[...]].
        if not wanted.isdisjoint(record['Technique_ID'][0])
    ]

In [28]:
# Derive each row's tactic IDs from its technique IDs.
tactic_ids = df['tech_id'].apply(tactic_list)
df['tactic_id'] = tactic_ids

# Export Cleaned Dataset: 

In [30]:
# Export the dataset, now including the tactic_id column, for training.
# NOTE(review): unlike the earlier export to '../src/merged_dataset.csv', this
# call does not pass index=False, so the row labels are written as an extra
# unnamed first column -- confirm that downstream readers expect it.

df.to_csv('training_dataset_full.csv')

In [31]:
# Number of top-level entries (tactic IDs) in the loaded tactic dataset.
len(open_data)

26