In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(42)

# winwebsec

### Getting the file names of the malware family winwebsec 

In [3]:
winwebsec_files = [] # for rewriting the files start at [36:]

for filename in os.listdir("Malicia (Big 3 - Opcodes)/winwebsec"):
    nameOfFile = os.path.join('Malicia (Big 3 - Opcodes)/winwebsec', filename)
    
    winwebsec_files.append(nameOfFile)
    
print(len(winwebsec_files)) # there should 4360 files of winwebsec malicious files

4360


### Splitting the winwebsec files into training and testing sets 80:20

In [4]:
# set random_state to 42 to always get the same train_test_split results
winwebsec_training, winwebsec_testing = train_test_split(winwebsec_files, train_size = 0.8, test_size = 0.2, random_state = 42)

print(len(winwebsec_training))  # should be 3488

for train in range(10):
    print(winwebsec_training[train][36:])

print()
    
print(len(winwebsec_testing))  # should be 872

for test in range(10):
    print(winwebsec_testing[test][36:])

3488
7eb78a40097200e12edf3b10bbdf4fc6b53d1fe9.asm.txt
8c5114ee30f13d86073e9557d2fe3d11df545ebe.asm.txt
5100cdaa7f4c26a6025a3d1de8ec602588d81d9c.asm.txt
2c5a84b2105b03d60617547b243dee744b7566e7.asm.txt
41b22a35530708d131d8c8b54bc0392a06b14f15.asm.txt
cf25bb83fe4d1fb6451295946ad889fff0770f05.asm.txt
b152a2931e3f82d05fcec7fe7fdaca4d7d094c80.asm.txt
09e89f661cfe191dcbcd8f002fb7b255c195344b.asm.txt
1aff6210c4e0b38769fb3ea8150069b2fda71aac.asm.txt
a0ff7e0effc4aa189ac6edd1f4f8010700d9d8b3.asm.txt

872
9fc5f23dc66a7d66a762472fbf4ad0bdacae8f66.asm.txt
5cb131f9db17d2897543861436b2cbfb60ecc9c8.asm.txt
d04f55cfc54ea77a8ffa3fa8a62158870123a021.asm.txt
0b5f6fe05264465ac2418ad799652e2b36ac1027.asm.txt
9172559f151741034ffd00fb5d49aee3fbdd49d9.asm.txt
8f040516700e1e3945e6a4c55ab7f33d75c6f716.asm.txt
2d7f552c3a6b7b0924fb7e7ef6fb95718e535f95.asm.txt
4473a1642c5d6bb7c7aceb3fd00002dad31248c2.asm.txt
aa2096cae4be8b8cd85fcb1f1630c1da6bf9414e.asm.txt
77aa37dd1d84337352969154eb070f6eb7cd8d9c.asm.txt


### Write the list of training and testing file names into separate files

In [5]:
winwebsec_trainfile = open('Data/winwebsec/training_filenames.txt', 'w')

for filename in winwebsec_training:
    winwebsec_trainfile.write(filename[36:] + '\n')

winwebsec_trainfile.close()

winwebsec_testfile = open('Data/winwebsec/testing_filenames.txt', 'w')

for filename in winwebsec_testing:
    winwebsec_testfile.write(filename[36:] + '\n')

winwebsec_testfile.close()

In [6]:
#for data in range(len(winwebsec_training)):
#    training_file = open(winwebsec_training[data], 'r')
#    file = open('WinWebSec/Training' + winwebsec_training[data][35:], 'w')
    
#    file.write(training_file.read())
    
#    file.close()
#    training_file.close()

# zbot

### Getting the file names of the malware family zbot

In [7]:
zbot_files = []

for filename in os.listdir("Malicia (Big 3 - Opcodes)/zbot"):
    nameOfFile = os.path.join('Malicia (Big 3 - Opcodes)/zbot', filename)
    
    zbot_files.append(nameOfFile)
    
print(len(zbot_files)) # there should be 2136 files of zbot malicious files

2136


In [8]:
# set random_state to 42 to always get the same train_test_split results
zbot_training, zbot_testing = train_test_split(zbot_files, train_size = 0.8, test_size = 0.2, random_state = 42)

print(len(zbot_training))

for train in range(10):
    print(zbot_training[train][30:])

print()

print(len(zbot_testing))

for test in range(10):
    print(zbot_testing[test])

1708
/621afe95fc80148a41ec14fa9f274cb96163c33d.asm.txt
/420b2e8839320d2539dfd2db63256e94e36903ad.asm.txt
/1d2deb46f71b4899ae65f4343fa7ed1d1a2b8e28.asm.txt
/52465b605011e5ff5a49656d4986b78e34ed98ff.asm.txt
/2471140bf8c9085d6890803e2badcee1ee751da3.asm.txt
/0bc9fd762be52e557d64339194a442b9e3b5e84d.asm.txt
/b2982165f6f5222ae7a712dd6090ae913101707f.asm.txt
/8369a608acbc1e022b27dd410effc88ec0465ec4.asm.txt
/ad002fe2ba1136149f14a2354d0afc8b5f8d0f9f.asm.txt
/a484d7aee9bc8ce1e1eb897b66ae527697a861c1.asm.txt

428
Malicia (Big 3 - Opcodes)/zbot/99e01d4db01571994d6a2e3b20daa94d74c01920.asm.txt
Malicia (Big 3 - Opcodes)/zbot/a563ca78a774498fc989b53613e2c93d45d2da61.asm.txt
Malicia (Big 3 - Opcodes)/zbot/5df062eed3f37e852f322cd61a0df3f484572009.asm.txt
Malicia (Big 3 - Opcodes)/zbot/b9f6f06c9b400c3dd5864ba2366c2ecf37fd10bc.asm.txt
Malicia (Big 3 - Opcodes)/zbot/f2f7cf50215aaa184c6a0074a9aecdbe47cfc056.asm.txt
Malicia (Big 3 - Opcodes)/zbot/87b542920308360c59abb970bb92400f39d3eab1.asm.txt
Malicia (B

### Write the list of training and testing file names into separate files

In [9]:
zbot_trainfile = open('Data/zbot/training_filenames.txt', 'w')

for filename in zbot_training:
    zbot_trainfile.write(filename[31:] + '\n')

zbot_trainfile.close()

zbot_testfile = open('Data/zbot/testing_filenames.txt', 'w')

for filename in zbot_testing:
    zbot_testfile.write(filename[31:] + '\n')

zbot_testfile.close()

In [10]:
#for data in range(len(zbot_training)):
#    training_file = open(zbot_training[data], 'r')
#    file = open('Zbot/Training' + zbot_training[data][30:], 'w')
    
#    file.write(training_file.read())
    
#    file.close()
#    training_file.close()

# zeroaccess

### Getting the file names of the malware family zeroaccess

In [11]:
zeroaccess_files = []

for filename in os.listdir("Malicia (Big 3 - Opcodes)/zeroaccess"):
    nameOfFile = os.path.join('Malicia (Big 3 - Opcodes)/zeroaccess', filename)
    
    zeroaccess_files.append(nameOfFile)
    
print(len(zeroaccess_files)) # there should be 1305 files of zeroaccess files

1305


In [12]:
# set random_state to 42 to always get the same train_test_split results
zeroaccess_training, zeroaccess_testing = train_test_split(zeroaccess_files, train_size = 0.8, test_size = 0.2, random_state = 42)

print(len(zeroaccess_training))

for train in range(10):
    print(zeroaccess_training[train][36:])

print()

print(len(zeroaccess_testing))

for test in range(10):
    print(zeroaccess_testing[test])

1044
/1576e9093ff4458bfe9278427875f927ce881733.asm.txt
/043828a5201ac90eac5892814235ad90663abfd9.asm.txt
/5fb64e3299de27d3fe69cd1ec14dd53d519493fc.asm.txt
/aafcad71b159bae174416342ebfe4f05899663b0.asm.txt
/ceaab3267dfff2e5622124158e5b2df2bba4851e.asm.txt
/9dba6cab2de38311d05285f83f070eeb4794fac4.asm.txt
/1d07d8a874cc98cb7c35260d42dbb171756d01da.asm.txt
/94fec42ce30df4abfb70586489159d1a2fb3dddb.asm.txt
/ca0eb8d5b4b2eaacb5c034f5ef2e21556534dde0.asm.txt
/f6c12d03e6d0363b1e5b51d24309d7137c4db1bd.asm.txt

261
Malicia (Big 3 - Opcodes)/zeroaccess/770a3cf74e826b4844f376f53aeba5bf6777074f.asm.txt
Malicia (Big 3 - Opcodes)/zeroaccess/340d0335ea6807560f1a0e0ef4042b8ed732fbf2.asm.txt
Malicia (Big 3 - Opcodes)/zeroaccess/66865860f40cb05cbe7e6cbb806cc80ad095a693.asm.txt
Malicia (Big 3 - Opcodes)/zeroaccess/32ebe1fa6f9d4ba73179c11fe2395ec36b7cc6a8.asm.txt
Malicia (Big 3 - Opcodes)/zeroaccess/ea53e230b715ac3b196f5fb97dd16968e46caf2e.asm.txt
Malicia (Big 3 - Opcodes)/zeroaccess/90233a790384c8f48d54b03

### Write the list of training and testing file names into separate files

In [13]:
zeroaccess_trainfile = open('Data/zeroaccess/training_filenames.txt', 'w')

for filename in zeroaccess_training:
    zeroaccess_trainfile.write(filename[37:] + '\n')

zeroaccess_trainfile.close()

zeroaccess_testfile = open('Data/zeroaccess/testing_filenames.txt', 'w')

for filename in zeroaccess_testing:
    zeroaccess_testfile.write(filename[37:] + '\n')

zeroaccess_testfile.close()

In [14]:
#for data in range(len(zeroaccess_training)):
#    training_file = open(zeroaccess_training[data], 'r')
#    file = open('Zeroaccess/Training' + zeroaccess_training[data][36:], 'w')
    
#    file.write(training_file.read())
    
#    file.close()
#    training_file.close()