In [1]:
from pathlib import Path
import pandas as pd

normal_apps_path = Path("/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apps.csv")
normal_apps = pd.read_csv(normal_apps_path)

malware_paths = [*Path("/run/media/mix/64B2AAFCB2AAD1BA/APP/").glob("**/*.xlsx")]
malware_apps = pd.concat(pd.read_excel(file) for file in malware_paths)

merged = pd.DataFrame(columns=["type", "name", "pkg_name"])

for name, pkg_name, *_ in normal_apps.values:
    merged.loc[len(merged)] = ["normal", name, pkg_name]

for type, name, pkg_name, *_ in malware_apps.values:
    merged.loc[len(merged)] = [
        {"投资理财": "fraud_investment", "贷款诈骗": "fraud_loan"}[type],
        name,
        pkg_name,
    ]

merged

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,type,name,pkg_name
0,normal,QQ空间,com.qzone
1,normal,搜狗输入法,com.sohu.inputmethod.sogou
2,normal,UC浏览器,com.UCMobile
3,normal,QQ同步助手,com.tencent.qqpim
4,normal,京东,com.jingdong.app.mall
...,...,...,...
3009,fraud_investment,强国复兴,uni.khwm
3010,fraud_investment,慈联,com.dwrcsaht.hzcelz
3011,fraud_investment,强国复兴,uni.xhno
3012,fraud_investment,共富工程,uni.bckx


In [2]:
import sqlite3
import json
from tqdm.notebook import tqdm

conn = sqlite3.connect("./malware.db")

# [pkg_name, [{'path': '/', 'sha256': '...'}, ...]]
unique_files_per_app = pd.DataFrame(
    columns=["pkg_name", "unique_files"],
    data=conn.execute(
        "SELECT pkg_name, unique_files FROM unique_files_per_app"
    ).fetchall(),
)

In [3]:
metadata_merged = pd.DataFrame(
    columns=[
        "type",
        "label",
        "pkg_name",
        "permissions",
        "activities",
        "services",
        "receivers",
        "providers",
        "unique_files",
    ]
)

for type, label, pkg_name, *_ in tqdm(merged.values):
    result = conn.execute(
        "SELECT permissions, activities, services, receivers, providers FROM apps WHERE pkg_name = ?",
        (pkg_name,),
    ).fetchone()
    if result is None:
        continue
    permissions, activities, services, receivers, providers = result
    # find the unique files
    unique_files, *_ = unique_files_per_app[
        unique_files_per_app["pkg_name"] == pkg_name
    ]["unique_files"].values
    unique_file_paths = [file["path"] for file in json.loads(unique_files)]
    metadata_merged.loc[len(metadata_merged)] = [
        type,
        label,
        pkg_name,
        json.loads(permissions),
        json.loads(activities),
        json.loads(services),
        json.loads(receivers),
        json.loads(providers),
        unique_file_paths,
    ]

metadata_merged.to_csv("metadata_merged.csv", index=False)

metadata_merged

  0%|          | 0/3014 [00:00<?, ?it/s]

Unnamed: 0,type,label,pkg_name,permissions,activities,services,receivers,providers,unique_files
0,normal,QQ空间,com.qzone,[oicq.wlogin_sdk.permission.WloginProvider.REA...,"[com.tencent.sc.activity.SplashActivity, com.q...","[com.qq.e.comm.DownloadService, com.tencent.up...",[com.qzonex.module.upgrade.service.YYBService$...,[com.tencent.component.app.ServiceManager.Prov...,"[resources/AndroidManifest.xml, resources/META..."
1,normal,搜狗输入法,com.sohu.inputmethod.sogou,"[android.permission.QUERY_ALL_PACKAGES, androi...",[com.sogou.stick.route.HostFileReceiverActivit...,"[com.sohu.inputmethod.sogou.SogouIME, com.sogo...",[com.sogou.imskit.feature.settings.status.Boot...,"[com.sogou.bu.ipc.provider.SKeyboardProvider, ...","[resources/AndroidManifest.xml, resources/MANI..."
2,normal,UC浏览器,com.UCMobile,"[android.permission.ACCESS_COARSE_LOCATION, an...","[com.uc.browser.FavoriteActivity, com.uc.brows...","[com.uc.deployment.UpgradePatchResultService, ...",[com.alipay.mobile.command.trigger.NotifyTrigg...,"[com.UCMobile.main.FileProvider, com.uc.base.p...","[resources/AndroidManifest.xml, resources/META..."
3,normal,QQ同步助手,com.tencent.qqpim,"[com.android.permission.GET_INSTALLED_APPS, an...",[com.tencent.qqpim.ui.autocheck.page.PerReqAct...,[com.tencent.ep.pushleague.impl.PushLeagueServ...,[com.tencent.qqpim.apps.health.news.HealthNews...,[com.tencent.qqpim.common.goldsdk.db.GoldDBPro...,"[resources/AndroidManifest.xml, resources/META..."
4,normal,京东,com.jingdong.app.mall,"[android.permission.REORDER_TASKS, android.per...",[com.jd.lib.developermode.home.DevModeMainActi...,[com.jingdong.app.mall.update.PausableDownload...,"[com.jingdong.app.mall.widget.JdWidget, com.ji...","[androidx.core.content.FileProvider, com.jingd...","[resources/AndroidManifest.xml, resources/META..."
...,...,...,...,...,...,...,...,...,...
2988,fraud_investment,强国复兴,uni.khwm,"[android.permission.INTERNET, android.permissi...","[io.dcloud.PandoraEntry, io.dcloud.PandoraEntr...",[io.dcloud.sdk.base.service.DownloadService],[com.taobao.weex.WXGlobalEventReceiver],"[io.dcloud.common.util.DCloud_FileProvider, io...","[resources/AndroidManifest.xml, resources/META..."
2989,fraud_investment,慈联,com.dwrcsaht.hzcelz,"[android.permission.INTERNET, android.permissi...","[com.pub.zgcishan.MainActivity, com.pub.zgcish...",[androidx.work.impl.background.systemalarm.Sys...,[com.pichillilorenzo.flutter_inappwebview.chro...,[vn.hunghd.flutterdownloader.DownloadedFilePro...,"[resources/AndroidManifest.xml, resources/META..."
2990,fraud_investment,强国复兴,uni.xhno,"[android.permission.INTERNET, android.permissi...","[io.dcloud.PandoraEntry, io.dcloud.PandoraEntr...",[io.dcloud.sdk.base.service.DownloadService],[com.taobao.weex.WXGlobalEventReceiver],"[io.dcloud.common.util.DCloud_FileProvider, io...","[resources/AndroidManifest.xml, resources/META..."
2991,fraud_investment,共富工程,uni.bckx,"[android.permission.INTERNET, android.permissi...","[io.dcloud.PandoraEntry, io.dcloud.PandoraEntr...",[io.dcloud.sdk.base.service.DownloadService],[com.taobao.weex.WXGlobalEventReceiver],"[io.dcloud.common.util.DCloud_FileProvider, io...","[resources/AndroidManifest.xml, resources/META..."


In [4]:
import torch
from sklearn.model_selection import train_test_split

torch.manual_seed(114514)

unique_types = metadata_merged["type"].unique()
num_labels = len(unique_types)

# Create a label mapping
label_dict = {label: i for i, label in enumerate(unique_types)}
id2label = {i: label for label, i in label_dict.items()}

text_label = pd.DataFrame(columns=["text", "label"])
for (
    type_,
    label,
    pkg_name,
    permissions,
    activities,
    services,
    receivers,
    providers,
    unique_files,
) in metadata_merged.values:
    texts = [
        json.dumps(
            {
                "label": label,
                "pkg_name": pkg_name,
            },
            ensure_ascii=False,
            separators=(",", ":"),
        ),
        json.dumps(
            {
                "label": label,
                "pkg_name": pkg_name,
                "manifest": {
                    "permissions": permissions,
                    "activities": activities,
                    "services": services,
                    "receivers": receivers,
                    "providers": providers,
                },
            },
            ensure_ascii=False,
            separators=(",", ":"),
        ),
        json.dumps(
            {
                "label": label,
                "pkg_name": pkg_name,
                "unique_files": unique_files,
            },
            ensure_ascii=False,
            separators=(",", ":"),
        ),
    ]
    for text in texts:
        text_label.loc[len(text_label)] = [
            text,
            label_dict[type_],
        ]


# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    text_label["text"].values,
    text_label["label"].values,
    test_size=0.1,
    random_state=114514,
    stratify=text_label["label"].values,
)


In [5]:
from torch.utils.data import Dataset, DataLoader
from transformers import (
    PreTrainedTokenizerFast,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


# Create a custom dataset
class AppDataset(Dataset):
    def __init__(
        self, texts, labels, tokenizer: PreTrainedTokenizerFast, max_length: int = 512
    ):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [6]:
# ... existing code ...
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# First, we need to convert text to features for SMOTE
# SMOTE can't work directly on text, so we'll use TF-IDF to vectorize
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(train_texts)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=114514)
X_train_resampled, y_train_resampled, *_ = smote.fit_resample(X_train_vec, train_labels)

# Convert resampled indices back to text
# We need to handle the synthetic samples
train_texts_resampled = []
for i, x_vec in enumerate(X_train_resampled):
    if i < len(train_texts):  # Original sample
        train_texts_resampled.append(train_texts[i])
    else:  # Synthetic sample
        # Find nearest neighbors to create a realistic text representation
        # Here we use the original text of the same class
        same_class_indices = [
            j for j, y in enumerate(train_labels) if y == y_train_resampled[i]
        ]
        if same_class_indices:
            # Choose a random original text from the same class
            idx = same_class_indices[
                torch.randint(0, len(same_class_indices), (1,)).item()
            ]
            train_texts_resampled.append(train_texts[idx])

# Create datasets with resampled data
train_dataset: AppDataset = AppDataset(
    train_texts_resampled, y_train_resampled, tokenizer
)
val_dataset: AppDataset = AppDataset(val_texts, val_labels, tokenizer)

# Create DataLoaders
batch_size = 4
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)


print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
# ... existing code ...

Training dataset size: 17943
Validation dataset size: 898


In [7]:
# cleanup memories
import gc

for df in (metadata_merged, merged, normal_apps, malware_apps, unique_files_per_app):
    df.drop(df.index, inplace=True)
    del df


conn.close()
gc.collect()


627

In [8]:
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=num_labels, id2label=id2label, label2id=label_dict
)

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [9]:
from tqdm.notebook import tqdm
from transformers import PreTrainedModel
from transformers.optimization import Optimizer
from torch.optim.lr_scheduler import LRScheduler


# Training function
def train_epoch(
    model: PreTrainedModel,
    dataloader: DataLoader,
    optimizer: Optimizer,
    scheduler: LRScheduler,
    device: torch.device,
):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({"loss": loss.item()})

    return total_loss / len(dataloader)


# Evaluation function
def evaluate(model: PreTrainedModel, dataloader: DataLoader, device: torch.device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(dataloader), all_preds, all_labels

In [None]:
from sklearn.metrics import classification_report

# Training loop
epochs = 5
best_val_loss = float("inf")

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Train
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")

    # Evaluate
    val_loss, val_preds, val_labels = evaluate(model, val_dataloader, device)
    print(f"Validation loss: {val_loss:.4f}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(val_labels, val_preds, target_names=unique_types))

    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_roberta_model_ultimate.pt")
        print("Saved best model!")

# Load the best model
model.load_state_dict(torch.load("best_roberta_model_ultimate.pt"))


Epoch 1/5


Training:   0%|          | 0/4486 [00:00<?, ?it/s]

Training loss: 0.1322


Evaluating:   0%|          | 0/225 [00:00<?, ?it/s]

Validation loss: 0.0288

Classification Report:
                  precision    recall  f1-score   support

          normal       1.00      0.99      0.99       664
fraud_investment       0.98      0.99      0.98       201
      fraud_loan       0.89      0.97      0.93        33

        accuracy                           0.99       898
       macro avg       0.96      0.98      0.97       898
    weighted avg       0.99      0.99      0.99       898

Saved best model!

Epoch 2/5


Training:   0%|          | 0/4486 [00:00<?, ?it/s]