In [1]:
import os
import pandas as pd
import numpy as np


from joblib import Parallel, delayed

import androguard
from androguard.misc import AnalyzeAPK, APK

from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import LinearSVC

In [2]:
def analyze_APK(filename, path):
    """Extract manifest-level features from a single APK file.

    Parameters
    ----------
    filename : str
        Name of the file inside ``path``; also used as the key of the
        returned dictionary.
    path : str
        Directory that contains ``filename``.

    Returns
    -------
    dict
        Single-entry mapping ``{filename: features}``. ``features`` is the
        list of activities, permissions, services and providers reported by
        the ``APK`` object (in that order), followed by two flag tokens:
        ``'is_multidex'`` or ``'is_not_multidex'``, then ``'is_valid_APK'``
        or ``'is_not_valid_APK'``. If constructing the ``APK`` object raises,
        the filename is printed and ``features`` is an empty list.
    """
    file = os.path.join(path, filename)
    try:
        a = APK(file)
    except Exception:
        # Best effort: an unreadable file is reported by name and kept with no
        # features instead of aborting the whole batch. Only Exception is
        # caught so KeyboardInterrupt / SystemExit still stop the run.
        print(filename)
        return {filename: []}

    # Copy first so that extending the list never touches the parser's data.
    features = list(a.get_activities())
    features.extend(a.get_permissions())
    features.extend(a.get_services())
    features.extend(a.get_providers())
    features.append('is_multidex' if a.is_multidex() else 'is_not_multidex')
    features.append('is_valid_APK' if a.is_valid_APK() else 'is_not_valid_APK')

    return {filename: features}

In [3]:
def get_unique_features(data_dict):
    """Collect every distinct feature token that occurs in ``data_dict``.

    Parameters
    ----------
    data_dict : dict
        Mapping whose values are iterables of hashable, mutually comparable
        feature tokens (strings in this notebook).

    Returns
    -------
    list
        The distinct tokens in ascending sorted order. Sorting makes the
        result independent of set iteration order, which for strings varies
        between interpreter sessions; callers use this order as the column
        order of the feature matrices.
    """
    unique_tags_set = set()
    for feature_list in data_dict.values():
        unique_tags_set.update(feature_list)

    return sorted(unique_tags_set)


def get_features(list_features, unique_features):
    """Encode a list of feature tokens as a 0/1 indicator vector.

    Parameters
    ----------
    list_features : iterable
        Hashable feature tokens present in one sample. Tokens that do not
        occur in ``unique_features`` are ignored.
    unique_features : sequence
        Vocabulary that defines the position of every known token.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(unique_features)`` holding 1 at the
        position of every token found in ``list_features`` and 0 elsewhere.
    """
    # Build the token -> position lookup once instead of scanning the
    # vocabulary list for every token. setdefault keeps the first position of
    # a repeated token, matching list.index().
    position = {}
    for ind, feature in enumerate(unique_features):
        position.setdefault(feature, ind)

    features = np.zeros(len(unique_features))
    for feature in list_features:
        ind = position.get(feature)
        if ind is not None:
            features[ind] = 1
    return features


# Extract features from train data

In [4]:
# Training files are read from ./ch01-train, resolved against the current
# working directory.
path_train = os.path.join(os.getcwd(), "ch01-train")
# os.listdir returns entries in arbitrary order; sort so the row order of
# everything derived from this list is the same on every run.
filenames_train = sorted(os.listdir(path_train))

In [5]:
%%time

# One delayed analyze_APK call per training file, executed by joblib with
# n_jobs=-1; the result is a list of single-entry dicts, one per file.
train_jobs = (delayed(analyze_APK)(filename, path_train) for filename in filenames_train)
data_train_list = Parallel(n_jobs=-1)(train_jobs)

Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not b

CPU times: user 11.7 s, sys: 987 ms, total: 12.7 s
Wall time: 19.6 s


In [6]:
# Flatten the per-file single-entry dicts returned by the workers into one
# filename -> feature-list mapping.
data_train = {}

for apk_entry in data_train_list:
    apk_name = list(apk_entry)[0]
    data_train[apk_name] = apk_entry[apk_name]

In [7]:
# Vocabulary of every distinct feature token found in the training set. It is
# reused as the column list for both the train and the test feature matrices.
# NOTE: this must run while `data_train` still holds the raw feature lists,
# i.e. before the cell that replaces them with indicator vectors.
unique_features = get_unique_features(data_train)

In [8]:
# Turn each file's raw feature list into a 0/1 indicator vector over
# `unique_features`. The vectors are rebuilt from `data_train_list` (the raw
# extraction results) instead of overwriting `data_train` entry by entry, so
# re-running this cell never feeds already-encoded vectors back into
# `get_features`, which would silently produce all-zero rows.
data_train = {
    apk_name: get_features(raw_features, unique_features)
    for apk_entry in data_train_list
    for apk_name, raw_features in apk_entry.items()
}

In [9]:
# One row per file (indexed by filename), one indicator column per feature.
dataframe_train = pd.DataFrame.from_dict(data_train, orient="index", columns=unique_features)
# The label is the last character of each filename, converted to int.
dataframe_train["y"] = [int(apk_name[-1]) for apk_name in dataframe_train.index]

In [10]:
# Persist the training matrix (filenames as index, feature columns plus the
# `y` label column) so the modelling cells can reload it from disk.
dataframe_train.to_csv("train_features.csv")

# Extract features from test data

In [11]:
# Test files are read from ./ch01-test, resolved against the current working
# directory.
path_test = os.path.join(os.getcwd(), "ch01-test")
# os.listdir returns entries in arbitrary order; sort so the row order of the
# test matrix and of the saved predictions is the same on every run.
filenames_test = sorted(os.listdir(path_test))

In [12]:
%%time

# One delayed analyze_APK call per test file, executed by joblib with
# n_jobs=-1; the result is a list of single-entry dicts, one per file.
test_jobs = (delayed(analyze_APK)(filename, path_test) for filename in filenames_test)
data_test_list = Parallel(n_jobs=-1)(test_jobs)

Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not b

CPU times: user 11.3 s, sys: 778 ms, total: 12.1 s
Wall time: 19 s


In [13]:
# Flatten the per-file single-entry dicts returned by the workers into one
# filename -> feature-list mapping.
data_test = {}

for apk_entry in data_test_list:
    apk_name = list(apk_entry)[0]
    data_test[apk_name] = apk_entry[apk_name]

In [14]:
# Encode each test file over the vocabulary built from the training set;
# tokens that never occurred in training are ignored by `get_features`.
# The vectors are rebuilt from `data_test_list` (the raw extraction results)
# instead of overwriting `data_test` entry by entry, so re-running this cell
# never feeds already-encoded vectors back into `get_features`, which would
# silently produce all-zero rows.
data_test = {
    apk_name: get_features(raw_features, unique_features)
    for apk_entry in data_test_list
    for apk_name, raw_features in apk_entry.items()
}

In [None]:
# One row per test file (indexed by filename). The columns come from
# `unique_features`, the same list used for the training frame; no `y` column
# is added here.
dataframe_test = pd.DataFrame.from_dict(data_test, orient='index', columns = unique_features)

In [None]:
# Persist the test matrix (filenames as index) so the modelling cells can
# reload it from disk.
dataframe_test.to_csv("test_features.csv")

# Read train and test data

In [None]:
# Reload the saved training matrix; the first CSV column becomes the index.
data_train1 = pd.read_csv("train_features.csv", index_col=0)

# Separate the label column from the feature columns.
labels = data_train1["y"].values
train = data_train1.drop(columns=["y"]).values

In [None]:
# Reload the saved test matrix; the first CSV column becomes the index.
data_test1 = pd.read_csv("test_features.csv", index_col = 0)

# Select the test columns by name, in the training column order, instead of
# trusting that both CSV files happen to share the same layout. A missing
# column raises KeyError here rather than producing misaligned predictions.
feature_columns = data_train1.columns.drop("y")
test = data_test1[feature_columns].values

# Tune the regularization hyperparameter C

In [None]:
# Candidate values for the regularization strength C.
param_grid = {'C':  [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}

# Cross-validated search scored by balanced accuracy. refit=True refits the
# best candidate on the full training set. LinearSVC's solver uses a random
# number generator, so random_state is fixed to make the scores repeatable.
grid = GridSearchCV(
    LinearSVC(random_state=0),
    param_grid,
    refit=True,
    verbose=3,
    n_jobs=-1,
    scoring=make_scorer(balanced_accuracy_score),
)
grid.fit(train, labels)

In [None]:
# Display the estimator that the grid search refit with its best-scoring C.
grid.best_estimator_

# Fit the final model and predict

In [None]:
# Final classifier, fitted on the full training matrix.
# NOTE(review): C = 0.1 is hard-coded; presumably it is the value selected by
# the grid search — confirm against grid.best_params_ after re-running it.
# random_state is fixed because LinearSVC's solver uses a random number
# generator, so repeated fits would otherwise differ slightly.
linearSVC = LinearSVC(C=0.1, random_state=0)
linearSVC.fit(train, labels)

In [None]:
# Predict a label for every row of the test matrix with the fitted classifier.
preds_linearSVC = linearSVC.predict(test)

In [None]:
# Pair each prediction with its test filename and write them out as a
# two-column CSV without a header row.
predictions = pd.Series(preds_linearSVC, index=data_test1.index)
predictions.to_csv("test_predictions.csv", header=None)