<a href="https://colab.research.google.com/github/namph009/sampleapp/blob/main/test_CNNv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Library

In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier

import tensorflow as tf
from keras import layers
from keras import backend as K
from keras import regularizers
from keras.constraints import max_norm
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.models import load_model
from keras.models import Model
from keras.initializers import glorot_uniform
from keras.layers import Input,Dense,Activation,ZeroPadding2D,BatchNormalization,Flatten,Conv2D,AveragePooling2D,MaxPooling2D,Dropout

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
df2=pd.read_csv("/content/drive/MyDrive/Dataset/drebin-215-dataset-5560malware-9476-benign.csv", nrows=15005, usecols=range(15, 216))

In [None]:
df2

## Data Processing

In [None]:
# Distinct class labels and how often each occurs (kept as-is for later plots).
classes, count = np.unique(df2['class'], return_counts=True)

# Perform label encoding on the target column only. Replacing the label
# values across the whole frame would also rewrite any feature cell that
# happens to hold the same value as a class label.
lbl_enc = LabelEncoder()
encoded_classes = lbl_enc.fit_transform(classes)
print(encoded_classes, classes)
df2['class'] = lbl_enc.transform(df2['class'])

# Dataset contains special characters like '?' and 'S'. Set them to NaN and
# use dropna() to remove the affected rows.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
# NOTE(review): the character class '[?,S]' also matches a literal comma;
# confirm whether that is intended.
df2 = df2.replace('[?,S]', np.nan, regex=True)
print("Total missing values : ", df2.isna().sum().sum())
df2.dropna(inplace=True)

# Columns that contained the special characters were read as strings;
# convert every column to a numeric dtype.
df2 = df2.apply(pd.to_numeric)
df2

In [None]:
df2.shape

## Exploratory plots


In [None]:
corr_matrix = df2.corr().abs()

In [None]:
plt.figure(figsize = (20,10))

In [None]:
# Size the figure in the same cell that draws on it: with the inline notebook
# backend a figure opened in an earlier cell is rendered and closed when that
# cell finishes, so a figsize set there would not apply to this heatmap.
# plt.gcf() reuses the current figure if one is still open, otherwise it
# creates a new one.
plt.gcf().set_size_inches(20, 10)
sns.heatmap(corr_matrix)
plt.show()

In [None]:
#%% [markdown]
# The heatmap above has too many features to read individual cells at this
# display size, so a distribution plot of all absolute correlation values is
# used to summarise how strongly the features are related to each other.

#%%
# Flatten the (features x features) absolute correlation matrix into one vector.
dist_features = corr_matrix.values.flatten()

# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation draws the equivalent histogram + density curve.
sns.histplot(dist_features, color="Red", label="train", kde=True, stat="density")

#%% [markdown]
# A single sharp spike close to zero would indicate that most pairs of columns
# are only weakly correlated with each other.
# Next step: extract the most significant features from the dataset, e.g. with
# the feature-importance routine of a tree-ensemble classifier.

In [None]:
# Bar chart of the number of samples per class label, using the labels and
# counts computed with np.unique during data processing.
plt.bar(x=classes, height=count)
plt.xlabel("Classes")
plt.ylabel("Count")
plt.title("Class balance")
plt.show()

## Test with the ExtraTreesClassifier model

In [None]:
# Hold out 10% of the rows as the final test set.
# Features are every column except the last; the last column is the target.
# random_state is fixed (same seed as the cross-validation splitter) so the
# split, and every score derived from it, is reproducible between runs.
X, X_test, Y, y_test = train_test_split(
    df2.iloc[:, :-1].to_numpy(),
    df2.iloc[:, -1].to_numpy(),
    test_size=0.1,
    shuffle=True,
    random_state=15,
)

In [None]:
# Split the remaining data again: 10% becomes the validation set, the rest is
# used for training. random_state is fixed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.1,
    shuffle=True,
    random_state=15,
)

In [None]:
# Print the shape of every split, one per line: train, validation, then test
# (features first, labels second for each).
print(
    *(part.shape for part in (X_train, y_train, X_valid, y_valid, X_test, y_test)),
    sep="\n",
)

## Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from xgboost import XGBClassifier

from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit


In [None]:
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)

## Logistic Regression

In [None]:
# Hyper-parameter search for logistic regression: an exhaustive grid search,
# scored by accuracy on the splits produced by the `cv` splitter defined above.

# Search space: both penalty types, with C from 0.2 to 0.9 in steps of 0.1
# followed by the integers 1 to 10.
params = dict(
    penalty=['l1', 'l2'],
    C=[tenth / 10 for tenth in range(2, 10)] + list(range(1, 11)),
)

# Base estimator whose parameters are tuned.
logreg = LogisticRegression(solver='liblinear')

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid = GridSearchCV(
    estimator=logreg,
    param_grid=params,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
grid.fit(X_train, y_train)

In [None]:
print('Best Score:', grid.best_score_)
print('Best Params:', grid.best_params_)
print('Best Estimator:', grid.best_estimator_)

In [None]:

logreg_grid = grid.best_estimator_
y_pred = logreg_grid.predict(X_test)

In [None]:
pd.DataFrame(confusion_matrix(y_test,y_pred), columns=["Predicted A", "Predicted T"], index=["Actual A","Actual T"] )

In [None]:
logreg_grid_score = accuracy_score(y_test, y_pred)
print('Model Accuracy:', logreg_grid_score)
print('Classification Report:\n', classification_report(y_test, y_pred))

## Gradient Boosting Machines - XGBoost

In [None]:
# Hyper-parameter search for XGBoost: an exhaustive grid search, scored by
# accuracy on the splits produced by the `cv` splitter defined above.

# Search space: tree depth 2..9, 60/100/140/180 estimators, three learning rates.
params = dict(
    max_depth=range(2, 10),
    n_estimators=range(60, 220, 40),
    learning_rate=[0.1, 0.01, 0.05],
)

# Base estimator with a binary logistic objective.
xgb = XGBClassifier(objective='binary:logistic')

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
grid.fit(X_train, y_train)

## TEST with RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100)

In [None]:
clf.fit(X_train,y_train)

In [None]:
y_pred=clf.predict(X_valid)

In [None]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_valid, y_pred))

In [None]:
len(clf.estimators_)

## deep learning

In [None]:
def _conv_stage(X, filters, kernel_size, strides, name, dropout_rate=None):
    """Apply Conv2D -> BatchNormalization -> tanh (-> optional Dropout) to X."""
    X = Conv2D(filters, kernel_size=kernel_size, strides=strides, name=name,
               kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization()(X)
    X = Activation("tanh")(X)
    if dropout_rate is not None:
        X = Dropout(rate=dropout_rate)(X)
    return X


def Convnet(input_shape=(10, 10, 2), classes=1):
    """Build the (uncompiled) convolutional classifier.

    Five convolutional stages are stacked one after another, each feeding the
    next, followed by a Flatten layer and a sigmoid Dense output layer.

    Args:
        input_shape: shape of a single sample as (height, width, channels).
            With the default (10, 10, 2) the unpadded convolutions shrink the
            feature map 10 -> 8 -> 7 -> 3 -> 2 -> 1.
        classes: number of units in the sigmoid output layer.

    Returns:
        A keras ``Model`` mapping the input tensor to the sigmoid output.
    """
    X_input = Input(input_shape)

    # Stage 1 input
    X = _conv_stage(X_input, 64, (3, 3), (1, 1), "conv1", dropout_rate=0.2)

    # Stage 2 hidden
    # Each stage consumes the previous stage's output (not X_input), and every
    # convolution has its own layer name: layer names must be unique within a
    # model once the layers are actually connected.
    X = _conv_stage(X, 128, (2, 2), (1, 1), "conv2", dropout_rate=0.1)
    X = _conv_stage(X, 128, (3, 3), (2, 2), "conv3", dropout_rate=0.1)
    X = _conv_stage(X, 256, (2, 2), (1, 1), "conv4", dropout_rate=0.1)

    # Stage 3 output (no dropout directly before the classifier head)
    X = _conv_stage(X, 64, (2, 2), (2, 2), "conv5")

    X = Flatten()(X)
    X = Dense(classes, activation='sigmoid')(X)

    model = Model(inputs=X_input, outputs=X)

    return model


In [None]:
# Print the shapes of the training/validation features and labels, one per line.
print(
    *(part.shape for part in (X_train, X_valid, y_train, y_valid)),
    sep="\n",
)

In [None]:
# Turn every flat feature row into a 10 x 10 x 2 tensor (200 values per sample)
# so it matches the default input shape of Convnet.
X_train = X_train.reshape((X_train.shape[0], 10, 10, 2))
X_valid = X_valid.reshape((X_valid.shape[0], 10, 10, 2))
X_test = X_test.reshape((X_test.shape[0], 10, 10, 2))

In [None]:
X_train

In [None]:
model = Convnet()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'binary_crossentropy'])

In [None]:
history = model.fit(X_train, y_train, epochs=50, batch_size=100, validation_data=(X_valid, y_valid),verbose = 1)

In [None]:
model.evaluate(X_test, y_test)

In [None]:
import matplotlib.pyplot as plt

# Draw one figure per metric recorded by model.fit: the training curve and the
# matching validation curve, first for accuracy and then for loss.
for train_key, valid_key, plot_title in (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[valid_key])
    plt.title(plot_title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
...
# list all data in history
print(history.history.keys())

In [None]:
# summarize history for accuracy
# The 'val_*' curves come from the validation_data passed to model.fit, so
# they are labelled "validation" rather than "test".
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Evaluate on the held-out test split. evaluate() returns the loss first,
# followed by the metrics given to compile() in order; anything after the
# first two values is kept in is_anything_else_being_returned.
test_results = model.evaluate(X_test, y_test, verbose=1)
loss, accuracy = test_results[0], test_results[1]
is_anything_else_being_returned = list(test_results[2:])

# Same evaluation on the validation split (overwrites the extra-values list).
valid_results = model.evaluate(X_valid, y_valid, verbose=1)
loss_v, accuracy_v = valid_results[0], valid_results[1]
is_anything_else_being_returned = list(valid_results[2:])

# Persist the trained model to disk.
model.save("model.h5")

In [None]:
print("Validation: accuracy = %f  ;  loss_v = %f" % (accuracy_v, loss_v))
print("Test: accuracy = %f  ;  loss = %f" % (accuracy, loss))