In [1]:
import os
import pandas as pd
import numpy as np


from joblib import Parallel, delayed

import androguard
from androguard.misc import AnalyzeAPK, APK

from sklearn.metrics import make_scorer, balanced_accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import LinearSVC

In [2]:
def analyze_APK(filename, path):
    """Collect manifest-derived feature tokens for a single APK file.

    Parameters
    ----------
    filename : str
        Name of the APK file inside ``path``; it is also the key of the result.
    path : str
        Directory that contains ``filename``.

    Returns
    -------
    dict
        A single-entry mapping ``{filename: features}`` where ``features`` is a
        list of tokens (duplicates are possible). When the file cannot be
        opened by ``APK`` the file name is printed and the list is empty.
    """
    apk_path = os.path.join(path, filename)
    try:
        a = APK(apk_path)
    except Exception:
        # Best effort: report the unreadable file and keep going. Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        print(filename)
        return {filename: []}

    # Copy first so the list owned by the APK object is never mutated below.
    features = list(a.get_permissions())
    features += a.get_services()

    # Each uses_permissions entry is indexed at [0]; only that element is kept.
    features.extend(entry[0] for entry in a.uses_permissions)

    features += a.get_requested_third_party_permissions()
    features += list(a.permission_module.keys())

    # Whatever follows the last dot of each activity name, de-duplicated.
    # Start from an empty set: the previous ``{()}`` seeded the set with an
    # empty tuple, which leaked into every sample as a bogus ``()`` feature.
    suffixes = {os.path.splitext(activity)[1] for activity in a.get_activities()}
    features += sorted(suffixes)  # sorted for a run-independent order

    features.append('is_multidex' if a.is_multidex() else 'is_not_multidex')
    features.append('is_valid_APK' if a.is_valid_APK() else 'is_not_valid_APK')

    return {filename: features}

In [3]:
def get_unique_features(data_dict):
    """Return the vocabulary of distinct feature tokens found in ``data_dict``.

    Parameters
    ----------
    data_dict : dict
        Mapping from sample name to an iterable of hashable feature tokens.

    Returns
    -------
    list
        Every distinct token exactly once, ordered by its string form so the
        result (and any column order derived from it) is the same on every
        run. ``key=str`` keeps the sort well defined even if non-string
        tokens are present.
    """
    unique_tags_set = set()
    for tokens in data_dict.values():
        unique_tags_set.update(tokens)

    return sorted(unique_tags_set, key=str)


def get_features(list_features, unique_features):
    """Encode a list of feature tokens as a 0/1 indicator vector.

    Parameters
    ----------
    list_features : iterable
        Hashable tokens present in one sample; duplicates and tokens that are
        not part of ``unique_features`` are ignored.
    unique_features : list
        Vocabulary that fixes the position of every token in the output.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(unique_features)`` holding 1 at the
        position of every token that occurs in ``list_features``, 0 elsewhere.
    """
    # One dict lookup per token instead of a list scan plus ``list.index``.
    # ``setdefault`` keeps the first position of a repeated vocabulary entry,
    # matching what ``list.index`` would return.
    position = {}
    for ind, feature in enumerate(unique_features):
        position.setdefault(feature, ind)

    features = np.zeros(len(unique_features))
    for feature in list_features:
        ind = position.get(feature)
        if ind is not None:
            features[ind] = 1
    return features


# Extract features from train data

In [4]:
# Training APKs are expected in ./ch01-train relative to the working directory.
path_train = os.path.join(os.getcwd(), "ch01-train")
# os.listdir returns entries in arbitrary order; sort so the row order of the
# extracted feature table does not depend on the file system.
filenames_train = sorted(os.listdir(path_train))

In [5]:
%%time

# Run analyze_APK on every training file through joblib; each call returns a
# single-entry dict keyed by the file name.
data_train_list = Parallel(n_jobs=-1)(
    delayed(analyze_APK)(apk_name, path_train)
    for apk_name in filenames_train
)

Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not b

CPU times: user 12.1 s, sys: 1.11 s, total: 13.2 s
Wall time: 24.7 s


In [6]:
# Merge the per-file single-entry dicts into one {file name: feature tokens} mapping.
data_train = {}

for entry in data_train_list:
    # dict.update copies the entry directly; no need to dig out its first key by hand.
    data_train.update(entry)

In [7]:
# Vocabulary of all distinct feature tokens seen in the training samples.
# It defines the column layout used to encode both the train and the test set.
unique_features = get_unique_features(data_train)

In [8]:
# Swap every training sample's token list for its indicator vector.
# NOTE(review): data_train changes from token lists to vectors here, so
# re-running this cell on its own would encode the vectors themselves;
# re-run from the cell that builds data_train instead.
data_train = {
    sample_name: get_features(tokens, unique_features)
    for sample_name, tokens in data_train.items()
}

In [9]:
# One row per training file, one column per vocabulary token.
dataframe_train = pd.DataFrame.from_dict(
    data_train,
    orient='index',
    columns=unique_features,
)
# The label is the last character of each file name, read as an integer
# (presumably the class digit of the sample — confirm against the data set).
dataframe_train["y"] = [int(sample_name[-1]) for sample_name in dataframe_train.index]

In [10]:
# Persist the training features (plus the "y" label column) so the modelling
# cells can start from the CSV instead of re-parsing the APKs.
dataframe_train.to_csv("train_features_solution2.csv")

# Extract features from test data

In [11]:
# Test APKs are expected in ./ch01-test relative to the working directory.
path_test = os.path.join(os.getcwd(), "ch01-test")
# os.listdir returns entries in arbitrary order; sort so the row order of the
# test feature table and of the prediction file is reproducible.
filenames_test = sorted(os.listdir(path_test))

In [12]:
%%time

# Run analyze_APK on every test file through joblib; each call returns a
# single-entry dict keyed by the file name.
data_test_list = Parallel(n_jobs=-1)(
    delayed(analyze_APK)(apk_name, path_test)
    for apk_name in filenames_test
)

Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 10 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not be found, using 19 instead
Requested API Level could not b

CPU times: user 11.4 s, sys: 906 ms, total: 12.4 s
Wall time: 19.5 s


In [13]:
# Merge the per-file single-entry dicts into one {file name: feature tokens} mapping.
data_test = {}

for entry in data_test_list:
    # dict.update copies the entry directly; no need to dig out its first key by hand.
    data_test.update(entry)

In [14]:
# Encode the test samples against the vocabulary built from the training set;
# tokens that never occurred in training are ignored by get_features.
# NOTE(review): data_test changes from token lists to vectors here, so
# re-running this cell on its own would encode the vectors themselves.
data_test = {
    sample_name: get_features(tokens, unique_features)
    for sample_name, tokens in data_test.items()
}

In [15]:
# One row per test file, same column layout as the training table (no label column).
dataframe_test = pd.DataFrame.from_dict(data_test, orient='index', columns = unique_features)

In [16]:
# Persist the test features so the modelling cells can start from the CSV.
dataframe_test.to_csv("test_features_solution2.csv")

# Read train and test data

In [17]:
# Reload the saved training table; the first CSV column holds the file names.
data_train1 = pd.read_csv("train_features_solution2.csv", index_col=0)

# Split into the label vector and the feature matrix (everything except "y").
labels = data_train1["y"].to_numpy()
train = data_train1.drop(columns=["y"]).to_numpy()

In [18]:
# Reload the saved test table; the first CSV column holds the file names.
data_test1 = pd.read_csv("test_features_solution2.csv", index_col=0)

# Feature matrix for prediction (the test table has no label column).
test = data_test1.to_numpy()

# Calculate model parameters

In [19]:
# Candidate values for the LinearSVC regularisation parameter C.
param_grid = {'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]}

# Micro-averaged F1 as the model-selection score. Passing the keyword through
# make_scorer (instead of wrapping f1_score in a lambda) keeps the scorer
# picklable and gives it a readable repr.
micro_f1 = make_scorer(f1_score, average='micro')

grid = GridSearchCV(
    LinearSVC(max_iter=3000),
    param_grid,
    refit=True,
    verbose=3,
    n_jobs=-1,
    scoring=micro_f1,
)
grid.fit(train, labels)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ........................................C=0.001; total time=   0.3s
[CV 2/5] END ........................................C=0.001; total time=   0.2s
[CV 3/5] END ........................................C=0.001; total time=   0.2s
[CV 4/5] END ........................................C=0.001; total time=   0.2s
[CV 5/5] END ........................................C=0.001; total time=   0.2s
[CV 1/5] END .........................................C=0.01; total time=   0.3s
[CV 2/5] END .........................................C=0.01; total time=   0.3s
[CV 3/5] END .........................................C=0.01; total time=   0.3s
[CV 4/5] END .........................................C=0.01; total time=   0.3s
[CV 5/5] END .........................................C=0.01; total time=   0.3s
[CV 1/5] END .........................................C=0.05; total time=   0.7s
[CV 2/5] END ....................................



[CV 4/5] END ..........................................C=0.5; total time=   2.8s
[CV 5/5] END ..........................................C=0.5; total time=   2.7s




[CV 1/5] END ............................................C=1; total time=   2.8s




[CV 2/5] END ............................................C=1; total time=   2.8s




[CV 3/5] END ............................................C=1; total time=   2.9s




[CV 4/5] END ............................................C=1; total time=   2.9s




[CV 5/5] END ............................................C=1; total time=   3.0s




[CV 1/5] END ............................................C=5; total time=   2.6s




[CV 2/5] END ............................................C=5; total time=   2.6s




[CV 3/5] END ............................................C=5; total time=   2.7s




[CV 4/5] END ............................................C=5; total time=   2.6s




[CV 5/5] END ............................................C=5; total time=   2.7s




[CV 1/5] END ...........................................C=10; total time=   2.6s




[CV 2/5] END ...........................................C=10; total time=   2.6s




[CV 3/5] END ...........................................C=10; total time=   2.5s




[CV 4/5] END ...........................................C=10; total time=   2.6s




[CV 5/5] END ...........................................C=10; total time=   2.9s


GridSearchCV(estimator=LinearSVC(max_iter=3000), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]},
             scoring=make_scorer(<lambda>), verbose=3)

In [20]:
# Estimator with the best-scoring C; with refit=True it has already been
# refitted on the full training set by GridSearchCV.
grid.best_estimator_

LinearSVC(C=0.1, max_iter=3000)

# Models

In [21]:
# Fit the final model with the C chosen by the grid search and the same
# max_iter the search used, rather than a value copied by hand from an earlier
# run (which silently goes stale when the data or the grid changes).
linearSVC = LinearSVC(C=grid.best_params_["C"], max_iter=3000)
linearSVC.fit(train, labels)

LinearSVC(C=0.1)

In [22]:
# Predicted label for every row of the test feature matrix.
preds_linearSVC = linearSVC.predict(test)

In [23]:
# Write "file name,predicted label" rows without a header line.
# header=False is the documented way to omit the header (None is not a
# documented value for this parameter).
pd.Series(preds_linearSVC, index=data_test1.index).to_csv(
    "test_predictions_solution2.csv",
    header=False,
)

In [24]:
# Mean cross-validated score of the best C, using the scorer given to GridSearchCV.
grid.best_score_

0.51

In [25]:
# NOTE(review): same expression as the preceding cell but with a different
# recorded output; the outputs appear to come from different runs. Re-run the
# notebook top to bottom and keep a single copy of this cell.
grid.best_score_

0.623

In [24]:
# NOTE(review): the execution count of this cell repeats an earlier one and
# its recorded score differs from the other grid.best_score_ cells, so this
# looks like a stale output from another run. Confirm and drop the duplicates.
grid.best_score_

0.7360000000000001