# Laboratorio 3 - Deteccion de malware

### Importacion de librerias

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import *
from sklearn import metrics
from quickda.explore_data import *


### Carga de dataset

In [16]:
# Read the raw malware sample table from the CSV file in the working directory.
DATASET_PATH = "VirusSample.csv"
ds = pd.read_csv(DATASET_PATH)

### Analisis exploratorio

In [17]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9795 entries, 0 to 9794
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   file    9795 non-null   object
 1   api     9795 non-null   object
 2   class   9795 non-null   object
dtypes: object(3)
memory usage: 229.7+ KB


In [18]:
# Preview the first five rows (file hash, API call list, class label).
ds.head(n=5)

Unnamed: 0,file,api,class
0,7ff49f2f0912352416b05c010f35f402cc79feed,"IntersectRect,GetCurrentProcess,GetVersion",Virus
1,50cc6c99ec285d0db45dde07d8fdc18d9098c5b6,"GetCaretBlinkTime,CountClipboardFormats,GetCon...",Virus
2,f77c6bd4aebacd1a01d02e0cb20642ebf2d32929,"VarR8Pow,GetClipboardViewer,GetInputDesktop,Ge...",Virus
3,349c367c5b88fbb6cafae5d7109588d7250e16b5,"SetTraceCallback,CopyAcceleratorTableW,GetProc...",Virus
4,021f4aa86b520e1d606ab26699c35546bcd00c27,"SHLoadNonloadedIconOverlayIdentifiers,VarUI8Fr...",Virus


##### El dataset cuenta unicamente con 3 columnas, no hay *null values*. Se procede a identificar las clases.

In [19]:
# List the distinct malware class labels in order of first appearance.
ds.loc[:, "class"].unique()

array(['Virus', 'Riskware', 'Spyware', 'Downloader', 'Dropper', 'Agent',
       'Adware', 'Trojan', 'Backdoor', 'Worms', 'Ransomware', 'Keylogger',
       'Crypt'], dtype=object)

In [20]:
# Number of rows per malware class, most frequent first.
ds.loc[:, "class"].value_counts(sort=True, ascending=False)

Trojan        6153
Virus         2367
Backdoor       447
Worms          441
Adware         222
Agent          102
Downloader      31
Spyware         11
Ransomware      10
Riskware         4
Dropper          4
Crypt            2
Keylogger        1
Name: class, dtype: int64

##### Se identifican 13 clases distintas. Varias de ellas tienen muy pocos registros y son poco representativas, por lo que se procede a eliminar las clases con menos de 10 registros.

### Eliminacion de clases innecesarias

In [21]:
# Remove every class that has fewer than MIN_CLASS_SAMPLES rows. The rare
# classes are derived from the data instead of being listed by name, so the
# rule stays correct if the dataset changes. On the recorded counts this
# removes exactly Keylogger, Crypt, Dropper and Riskware.
MIN_CLASS_SAMPLES = 10

rows_per_class = ds['class'].value_counts()
rare_classes = rows_per_class[rows_per_class < MIN_CLASS_SAMPLES].index

# Build a new frame (original index labels are kept) rather than dropping in
# place; .copy() avoids SettingWithCopyWarning when 'class' is reassigned later.
ds = ds.loc[~ds['class'].isin(rare_classes)].copy()

ds['class'].value_counts()

Trojan        6153
Virus         2367
Backdoor       447
Worms          441
Adware         222
Agent          102
Downloader      31
Spyware         11
Ransomware      10
Name: class, dtype: int64

### Guardado de Sample

In [22]:
# Persist the filtered dataset (rare classes removed) without the index column.
ds.to_csv(path_or_buf='sample.csv', index=False)

In [27]:
from sklearn.preprocessing import LabelEncoder

# Explicit class -> integer code mapping, ordered by class frequency.
# LabelEncoder would assign codes in sorted (alphabetical) label order, which
# does not match the codes used by the rest of the notebook: the SMOTE
# sampling_strategy dict and the recorded outputs assume Trojan=0, Virus=1, ...
# Using the mapping here makes the result independent of cell execution order.
CLASS_CODES = {
    'Trojan': 0,
    'Virus': 1,
    'Backdoor': 2,
    'Worms': 3,
    'Adware': 4,
    'Agent': 5,
    'Downloader': 6,
    'Spyware': 7,
    'Ransomware': 8,
}

# Kept because `le` is reused further down to encode the feature columns.
le = LabelEncoder()

# Values that are already integer codes pass through unchanged, so the cell is
# safe to re-run. An unmapped string label makes astype(int) raise instead of
# silently receiving an arbitrary code.
ds['class'] = ds['class'].map(lambda label: CLASS_CODES.get(label, label)).astype(int)

### Columnas cualitativas

In [23]:
# Replace each class name with its integer code (ordered by class frequency).
# A single vectorised mapping replaces the row-by-row iterrows()/.loc writes.
# Values that are not class names (e.g. already-encoded integers) are left
# untouched, exactly as the if/elif chain did.
class_codes = {
    'Trojan': 0,
    'Virus': 1,
    'Backdoor': 2,
    'Worms': 3,
    'Adware': 4,
    'Agent': 5,
    'Downloader': 6,
    'Spyware': 7,
    'Ransomware': 8,
}
ds['class'] = ds['class'].map(lambda label: class_codes.get(label, label))

ds.head(5)

Unnamed: 0,file,api,class
0,7ff49f2f0912352416b05c010f35f402cc79feed,"IntersectRect,GetCurrentProcess,GetVersion",1
1,50cc6c99ec285d0db45dde07d8fdc18d9098c5b6,"GetCaretBlinkTime,CountClipboardFormats,GetCon...",1
2,f77c6bd4aebacd1a01d02e0cb20642ebf2d32929,"VarR8Pow,GetClipboardViewer,GetInputDesktop,Ge...",1
3,349c367c5b88fbb6cafae5d7109588d7250e16b5,"SetTraceCallback,CopyAcceleratorTableW,GetProc...",1
4,021f4aa86b520e1d606ab26699c35546bcd00c27,"SHLoadNonloadedIconOverlayIdentifiers,VarUI8Fr...",1


### Balanceo de datos

In [28]:
# Separate the label column from the remaining (feature) columns.
target = ds["class"]
features = ds.drop(columns="class")

In [29]:
# Cast every feature column to str and label-encode it column by column.
# The encoded values are wrapped in a fresh DataFrame, so the result has a
# 0..n-1 row index and integer column names (0, 1).
# The encoding is computed once; the earlier discarded apply() call and the
# redundant [:, :] slice are removed.
# Note: `le` is refit on each column in turn, so afterwards it only holds the
# classes of the last column encoded.
# NOTE(review): the 'file' column looks like a per-sample hash; encoding it as
# an ordinal feature is unlikely to carry signal - confirm it is intended.
f_ds = pd.DataFrame(features, dtype=str)
f_ds = pd.DataFrame(f_ds.apply(le.fit_transform).values)
f_ds

Unnamed: 0,0,1
0,4918,1251
1,3124,604
2,9465,2525
3,2043,2265
4,83,2145
...,...,...
9779,2260,1627
9780,3255,2077
9781,8438,2737
9782,7908,1218


In [36]:
from imblearn.over_sampling import SMOTE

# Per-class sample targets taken from the data itself, instead of a hard-coded
# dict that only matches one specific label encoding and one dataset snapshot.
# On the recorded data this yields the same dict as before:
# {0: 6153, 1: 2367, 2: 447, 3: 441, 4: 222, 5: 102, 6: 31, 7: 11, 8: 10}.
# NOTE(review): because the targets equal the current class counts, SMOTE has
# nothing to generate and the class distribution is left unchanged. If real
# balancing is wanted, raise the minority targets and resample the training
# split only, to avoid leaking synthetic neighbours into the test set.
current_class_counts = target.value_counts().to_dict()

# Fixed seed so any synthetic samples are reproducible (same seed as the
# train/test split below).
oversample = SMOTE(sampling_strategy=current_class_counts, random_state=22)

X, y = oversample.fit_resample(f_ds, target)

In [37]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range and rebuild the frame.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the min/max - confirm this is acceptable.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)

f_ds = pd.DataFrame(scaled_values)
f_ds

Unnamed: 0,0,1
0,0.502709,0.420929
1,0.319329,0.203230
2,0.967495,0.849596
3,0.208832,0.762113
4,0.008484,0.721736
...,...,...
9779,0.231013,0.547443
9780,0.332720,0.698856
9781,0.862517,0.920929
9782,0.808341,0.409825


#### Construccion de Modelos

##### Se implementaron dos modelos distintos a los que se utilizaron en el articulo “*New Datasets for Dynamic Malware Classification*”.

### Division de datasets

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(
    f_ds,
    y,
    test_size=0.3,
    random_state=22,
)
x_train, x_test, y_train, y_test = split_parts


### Arbol de decision

In [40]:
from sklearn import tree

# Decision tree with default hyper-parameters. A fixed random_state makes the
# fitted tree (and therefore every score below) reproducible across runs;
# without it scikit-learn's random feature permutation and tie-breaking can
# produce a different tree each time.
clf_decision_tree = tree.DecisionTreeClassifier(random_state=22)
clf_decision_tree.fit(x_train, y_train)

DecisionTreeClassifier()

#### Prediccion

In [45]:
# Predict the class code of every held-out sample with the fitted tree.
predict_test = clf_decision_tree.predict(X=x_test)


#### Accuracy de arbol de decision

In [47]:
# accuracy_score expects (y_true, y_pred). The value is the same either way
# for accuracy, but the correct order keeps this call consistent with the
# classification_report call and safe to copy for asymmetric metrics.
metrics.accuracy_score(y_test, predict_test)

0.8719346049046321

#### Reporte de clasificacion

In [51]:
# Import only the name this cell uses instead of repeating the wildcard import.
from sklearn.metrics import classification_report

# Per-class precision / recall / f1 on the held-out set. print() is needed
# because the report is a pre-formatted multi-line string.
print(classification_report(y_test, predict_test))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1858
           1       0.87      0.86      0.86       707
           2       0.82      0.86      0.84       118
           3       0.67      0.66      0.66       137
           4       0.59      0.61      0.60        69
           5       0.48      0.42      0.45        31
           6       0.31      0.50      0.38         8
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         4

    accuracy                           0.87      2936
   macro avg       0.52      0.54      0.52      2936
weighted avg       0.87      0.87      0.87      2936



#### Cross validation y evaluacion

In [57]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the decision tree on the training split;
# the cell displays the accuracy obtained on each fold.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

cross_val_score(
    estimator=clf_decision_tree,
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

array([0.87883212, 0.88467153, 0.87883212, 0.86423358, 0.87153285,
       0.86277372, 0.85985401, 0.84671533, 0.86695906, 0.87573099])

#### Resultados Decision Tree

##### En este caso la clase con mejores metricas ha sido el troyano (clase 0), con una precision de 0.91 y un recall de 0.92, valores altos en comparacion con los de los demas malware presentes en el dataset. El accuracy global del arbol de decision es de aproximadamente 0.87, a diferencia del Naive Bayes, que obtiene un accuracy menor.



### Naive Bayes

In [58]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier with default settings on the
# training split; fit() returns the estimator itself, which is then displayed.
naive_bayes = MultinomialNB().fit(x_train, y_train)
naive_bayes

MultinomialNB()

#### Prediccion

In [62]:
# Predict the held-out samples with Naive Bayes. This overwrites the decision
# tree predictions previously stored under the same name.
predict_test = naive_bayes.predict(X=x_test)

#### Accuracy 

In [64]:
# accuracy_score expects (y_true, y_pred). The value is the same either way
# for accuracy, but the documented order avoids mistakes if this call is
# reused for an asymmetric metric.
metrics.accuracy_score(y_test, predict_test)

0.6328337874659401

#### Cross validation y evaluacion

In [66]:
# 10-fold shuffled cross-validation of Naive Bayes on the training split,
# using the same fold configuration as for the decision tree.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

cross_val_score(
    estimator=naive_bayes,
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

array([0.60583942, 0.63649635, 0.62481752, 0.63357664, 0.63211679,
       0.64963504, 0.60875912, 0.62189781, 0.63304094, 0.62573099])

#### Resultados Naive Bayes

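##### Ahora con respecto a este modelo se tiene un accuracy de 0.63. De igual manera que en el modelo previo, el malware que presenta una mayor precision, recall y f1-score ha sido el troyano, mientras que las metricas de las demas clases se quedan en 0. El accuracy coincide con la proporcion de troyanos en el conjunto de prueba (1858 de 2936 ≈ 0.63), lo que indica que el modelo predice siempre la clase mayoritaria.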


¿Se lograron obtener mejores métricas que las obtenidas en el artículo para la clasificación de malware?

No fue posible obtener un resultado más alto en comparación al del artículo. No obstante, se ha podido evidenciar que el malware troyano es el que presenta mejores métricas en los modelos, mientras que en el artículo fueron otros, concretamente backdoor, agent y worms.