In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import seaborn as sns
from xgboost import XGBClassifier

In [None]:
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [10, 5]

In [None]:
# Load the raw dataset; low_memory=False makes pandas parse the file in one piece
# instead of chunk by chunk, so each column gets a single inferred dtype.
df = pd.read_csv('data/Android_Malware.csv', low_memory = False)
# Preview the first rows.
df.head()

Podział na zbiory: treningowy / testowy / walidacyjny

In [None]:
# Every column except the last is a feature; the last column is the target.
feature_columns = df.columns.values[:-1]
target_column = df.columns.values[-1]

# Keep 60% of the rows for training; the remaining 40% is split again below.
X_train, X_test_tmp, y_train, y_test_tmp = train_test_split(
    df[feature_columns], df[target_column], test_size=0.4, random_state=213
)

In [None]:
# Split the 40% hold-out in half: one part becomes the test split, the other the
# validation split.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_test_tmp,
    y_test_tmp,
    test_size=0.5, random_state=7)

In [None]:
# Display the validation features.
X_validation

In [None]:
# From here on the exploration uses the training split only.
# Work on a copy: a plain `df = X_train` would only alias the frame, so adding
# the target column would silently add "Label" to X_train as well.
df = X_train.copy()
df["Label"] = y_train

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

Jak wyglądają najczęstsze wartości w każdej z kolumn

In [None]:
# For each column, list its ten most frequent values together with their counts.
for column_name in df.columns:
    most_common = df[column_name].value_counts().nlargest(10)
    print(f"Top 10 most common values in {column_name}:")
    print(most_common)
    print()

Liczba unikalnych wartości w każdej z kolumn

In [None]:
# Temporarily lift the row display limit so the count for every column is printed;
# dropna=False makes NaN count as a distinct value.
with pd.option_context('display.max_rows', None):
    print(df.nunique(dropna=False))

Rozkład przewidywanej cechy (zmiennej celu)

In [None]:
# Histogram of the last column of the frame (the "Label" target added earlier).
df.iloc[:, -1].hist()

"Adware jest niechcianym oprogramowaniem służącym do wyświetlania reklam na Twoim ekranie". Adware generuje przychody dla swoich twórców dzięki automatycznemu wyświetlaniu internetowych reklam w interfejsie użytkownika danej aplikacji lub w formie okienek wyskakujących podczas procesu instalacji.

Scareware is a type of malware attack that claims to have detected a virus or other issue on a device and directs the user to download or buy malicious software to resolve the problem. Generally speaking, scareware is the gateway to a more intricate cyberattack and not an attack in and of itself.

benign - having no harmful influence or effect

Wszystkie wartości NA są skupione w wierszach, które można policzyć na palcach jednej ręki, więc można je usunąć.

In [None]:
# Remove every row that contains at least one missing value.
df = df.dropna()

Usunięcie kolumn, które mają tylko jedną wartość

In [None]:
# Columns with at most one distinct value carry no information for the model.
# col_to_drop stays a module-level name because transform_data reads it later.
col_to_drop = df.columns[df.nunique() <= 1]
df = df.drop(columns = col_to_drop)

Usunięcie kolumn ID oraz dwóch kolumn, w których są same zera, ale zapisane na różne sposoby (int, str, float)

Ta kolumna ma wartości typu int i float; rzutujemy je na int.

In [None]:
# Re-check dtypes and non-null counts after the row/column removals above.
df.info()

In [None]:
def select_correlated_columns_to_remove(df, corr_treshold):
    """Pick the columns that are redundant under Spearman rank correlation.

    A column is selected when, for at least one column positioned *before* it,
    the absolute Spearman correlation exceeds ``corr_treshold`` or is NaN
    (which happens when the correlation is undefined, e.g. a constant column).

    Only numeric/boolean columns take part; other columns (strings, timestamps
    kept as text, ...) are ignored instead of making ``DataFrame.corr`` fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.
    corr_treshold : float
        Absolute correlation above which the later column of a pair is selected.

    Returns
    -------
    list
        Selected column names in column order, each name listed once.
    """
    numeric = df.select_dtypes(include=["number", "bool"])
    if numeric.shape[1] < 2:
        # No pair of columns to compare.
        return []

    cor = numeric.corr(method="spearman")
    cor_values = cor.to_numpy()
    flagged = (np.abs(cor_values) > corr_treshold) | np.isnan(cor_values)
    # Keep only pairs (i, j) with i < j so the earlier column of a pair survives.
    flagged = np.triu(flagged, k=1)
    # A column is removed if any earlier column flags it.
    return [name for name, hit in zip(cor.columns, flagged.any(axis=0)) if hit]

In [None]:
# Drop every feature flagged as redundant at a correlation threshold of 0.95.
# cor_lis stays a module-level name because transform_data reads it later.
cor_lis = select_correlated_columns_to_remove(df, 0.95)
df=df.drop(cor_lis,axis =1 )

In [None]:
def _split_ip_octets(addresses, prefix):
    """Convert dotted IP strings into a frame with one integer column per octet.

    Addresses with fewer than four parts are padded with 0; the resulting
    columns are named ``"<prefix> IP1"`` .. ``"<prefix> IP4"``.
    """
    octets = pd.DataFrame(
        [[int(part) for part in address.split('.')] for address in addresses]
    ).fillna(0)
    octets.columns = [f"{prefix} IP{position}" for position in range(1, 5)]
    return octets


def transform_data(X, y):
    """Apply the cleaning and feature-engineering steps to one data split.

    Steps: attach the target, drop rows with missing values, drop the columns
    listed in the module-level ``col_to_drop`` and a fixed set of ID/zero-only
    columns, expand source/destination IPs into four octet columns each, map
    the class names to integer codes, drop the remaining non-numeric/unused
    columns and the columns listed in the module-level ``cor_lis``, and cast
    everything to float.

    The input frame is copied first, so the caller's ``X`` is left unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature columns of the split.
    y : pandas.Series
        Class names, aligned with ``X`` by index.

    Returns
    -------
    tuple
        ``(features, labels)`` - the transformed feature frame and the encoded
        "Label" series, both with a fresh 0..n-1 index.
    """
    # Copy so that adding "Label" does not leak into the caller's frame.
    df = X.copy()
    df["Label"] = y
    df = df.dropna()
    df = df.drop(columns=col_to_drop)
    df = df.drop(columns=["Unnamed: 0", "Flow ID", " CWE Flag Count", "Fwd Avg Bytes/Bulk"])
    # A fresh positional index lets the octet frames line up row by row in concat.
    df = df.reset_index(drop=True)

    df = pd.concat(
        [
            df,
            _split_ip_octets(df[' Source IP'], 'Source'),
            _split_ip_octets(df[' Destination IP'], 'Destination'),
        ],
        axis=1,
    )

    mapping_dict = {"Benign": 0,
                    "Android_Scareware": 1,
                    "Android_Adware": 2,
                    "Android_SMS_Malware": 3}
    # Class names missing from mapping_dict become NaN here.
    df["Label"] = df["Label"].map(mapping_dict)

    df = df.drop([' Protocol', ' Timestamp', " Source IP", " Destination IP", ' Down/Up Ratio'], axis=1)

    # errors='ignore': cor_lis may name columns that were already removed above.
    df = df.drop(cor_lis, axis=1, errors='ignore').astype('double')
    return df.drop("Label", axis=1), df["Label"]

In [None]:
# NOTE(review): the frames called X_test / y_test from here on are built from the
# *validation* split; the X_test / y_test returned by the second train_test_split
# are overwritten without ever being used - confirm this is intended.
X_test, y_test = transform_data(X_validation, y_validation)
X_train, y_train = transform_data(X_train, y_train)

In [None]:
#le = LabelEncoder()
#df = pd.concat([X_train,X_test], axis = 0)
#len(X_train)

In [None]:
#df[" Source IP"] = le.fit_transform(df[" Source IP"])
#df[" Destination IP"] = le.fit_transform(df[" Destination IP"])
#X_train = df.iloc[0:len(X_train)].astype('double')
#X_test = df.iloc[len(X_train):len(df)].astype('double')

In [None]:
# Sanity check: dtypes and non-null counts of the transformed training features.
X_train.info()

In [None]:
# Same check for the evaluation features; the columns should match X_train.
X_test.info()

In [None]:
# Gradient-boosted tree classifier with a fixed seed so runs are repeatable.
model=XGBClassifier(random_state=1,
                    learning_rate=0.25,
                    booster='gbtree', 
                    max_depth=15,
                    n_estimators=300
                    )
# Fit on the training split, then display the score on the evaluation split.
model.fit(X_train, y_train)
model.score(X_test,y_test)

In [None]:
# Integer code of each traffic class: the position of its name in this list.
mapping_dict = {
    class_name: code
    for code, class_name in enumerate(
        ["Benign", "Android_Scareware", "Android_Adware", "Android_SMS_Malware"]
    )
}

In [None]:
# Confusion matrix on the evaluation split: rows are true classes, columns predictions.
class_names = ['Benign', 'Android_Scareware', 'Android_Adware', 'Android_SMS_Malware']
y_predicted = model.predict(X_test)
# Pin the class order (codes 0..3) so the matrix is always 4x4 and lines up with
# the tick labels, even if a class is missing from this split.
conf = confusion_matrix(y_test, y_predicted, labels=list(range(len(class_names))))
print(conf)
sns.heatmap(conf, annot=True, cmap='Blues', fmt='g',
            yticklabels=class_names,
            xticklabels=class_names)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title("confusion_matrix")

In [None]:
# Confusion matrix annotated with the raw count and its share of the true-class row.
axis_labels = ['Benign', 'Scareware', 'Adware', 'SMS Malware']
y_predicted = model.predict(X_test)
# Pin the class order (codes 0..3) so the matrix always matches axis_labels.
conf = confusion_matrix(y_test, y_predicted, labels=list(range(len(axis_labels))))

# Cell annotations: "<count>\n<row percentage>%".
labels = np.zeros(conf.shape, dtype=object)
for i, row in enumerate(conf):
    row_total = row.sum()
    for j, cell in enumerate(row):
        # A class with no true samples has an all-zero row; report 0.0% instead
        # of dividing by zero.
        share = round(cell * 100 / row_total, 1) if row_total else 0.0
        labels[i][j] = f"{cell}\n{share}%"

sns.heatmap(conf, annot=labels,
            cmap='Blues', fmt='',
            yticklabels=axis_labels,
            xticklabels=axis_labels)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title("confusion_matrix")

In [None]:
# NOTE(review): this import would normally live in the import cell at the top.
from xgboost import plot_importance
# Bar chart of the ten most important features of the trained model.
plot_importance(model, max_num_features=10, height= 0.6)
# Show grid lines on the y axis only.
plt.grid(axis='y')
plt.show()