# <font color="blue"><b> Load the log files and merge them</b></font>
---

In [1]:
import utils.data_loader as dl 
import pandas as pd
import os
import numpy as np

---
### **Load data into dataframes**
---

In [2]:
# Load the labeled conn logs through the project's data loader.
# NOTE(review): judging by how the results are used in later cells (integer
# indexing, pd.concat, .to_csv), each return value appears to be an
# integer-indexable collection of DataFrames: honeypot captures first,
# malware captures second. Confirm against utils.data_loader.
df_honeypot, df_malware = dl.logs_to_dfs()

Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-7-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-4-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-44-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-8-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-3-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-20-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-34-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-42-1/bro/conn.log.labeled
Reading: data/opt/Malware-Project/BigDatas

---
### **Merge dataframes**
---

In [3]:
# Stack every honeypot (benign) capture into a single frame.
honeypot_frames = [df_honeypot[pos] for pos in range(len(df_honeypot))]
df_honeypot_all = pd.concat(honeypot_frames)
print('Shape of df_honeypot_all dataframe:', df_honeypot_all.shape)

# Stack every malware capture the same way.
malware_frames = [df_malware[pos] for pos in range(len(df_malware))]
df_malware_all = pd.concat(malware_frames)
print('Shape of df_malware_all dataframe:', df_malware_all.shape)

# Full dataset: honeypot rows first, followed by the malware rows.
data_df = pd.concat([df_honeypot_all, df_malware_all])
print('Shape of concatenated dataframe:', data_df.shape)

Shape of df_honeypot_all dataframe: (1956, 23)
Shape of df_malware_all dataframe: (200809, 23)
Shape of concatenated dataframe: (202765, 23)


---
### **Save dataframes**
---

In [4]:
# Write each merged frame to its CSV file under data/, without the row index.
for frame, csv_path in (
    (df_honeypot_all, 'data/honeypot_all.csv'),
    (df_malware_all, 'data/malware_all.csv'),
    (data_df, 'data/ioT_data.csv'),
):
    frame.to_csv(csv_path, index=False)

---
### **Make a new file with the botnet name per type of malware capture**
---

In [5]:
##### Save each malware dataset after processing #####

# Capture ids and the botnet behind each capture. The two lists are paired by
# position, and position i is also used to pick df_malware[i].
# NOTE(review): this relies on the loader reading the captures in exactly this
# order; re-check against the "Reading: ..." output whenever captures change.
malware_datasets_name = ['44-1', '8-1', '3-1', '20-1', '34-1', '42-1', '21-1'] # this is the order of readout as displayed above
bot_name = ['Mirai', 'Hakai', 'Muhstik','Torii', 'Mirai', 'Trojan', 'Torii']

dfs = []

for i, (capture_id, botnet) in enumerate(zip(malware_datasets_name, bot_name)):
    # Save the capture to its own csv file
    f_name = 'CTU-IoT-Malware-Capture-' + capture_id + '.csv'
    csv_path = os.path.join('data', f_name)
    df_malware[i].to_csv(csv_path, index=False)

    # Read the file back and correct the benign detailed labels ('-' -> 'benign')
    df = pd.read_csv(csv_path)
    df['detailed_label'] = df['detailed_label'].replace({'-': 'benign'})

    # Count the entries per detailed label and add the botnet name.
    # The label column is named explicitly via rename_axis: the column name
    # that reset_index() produces after value_counts() differs between pandas
    # versions ('index' before 2.0, the series name afterwards), so renaming
    # 'index' afterwards silently does nothing on newer pandas.
    new_df = (
        df['detailed_label']
        .dropna()
        .value_counts()
        .rename_axis('Malware_type')
        .reset_index(name='counts')
    )
    new_df['Botnet'] = botnet
    dfs.append(new_df)

# Save the concatenated per-capture counts to a new file
df_botnets = pd.concat(dfs)
df_botnets.to_csv('data/botnets.csv', index=False)