In [1]:
from pathlib import Path
import pandas as pd

normal_apps_path = Path("/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apps.csv")
normal_apps = pd.read_csv(normal_apps_path)

normal_apps

Unnamed: 0,name,pkg_name,path
0,QQ空间,com.qzone,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/995...
1,搜狗输入法,com.sohu.inputmethod.sogou,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/686...
2,UC浏览器,com.UCMobile,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/109...
3,QQ同步助手,com.tencent.qqpim,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/846...
4,京东,com.jingdong.app.mall,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/719...
...,...,...,...
2226,极速数据恢复,com.ansxtech.wx,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/540...
2227,中国好故事,com.chinaso.stories,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/539...
2228,蜂鸟部落,com.fengniaobuluocps,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/539...
2229,云智充,com.dream.zncd.intelligentcharge,/run/media/mix/64B2AAFCB2AAD1BA/ExAPP/apks/423...


In [2]:
malware_paths = [*Path("/run/media/mix/64B2AAFCB2AAD1BA/APP/").glob("**/*.xlsx")]
malware_apps = pd.concat(pd.read_excel(file) for file in malware_paths)

malware_apps

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,诈骗类型,APP名称,APP包名,apk文件名
0,投资理财,共富工程,uni.tlfy,1719354472896
1,投资理财,慈联,com.zjasmdxa.fiyyrs,1719372932233
2,投资理财,国能APP,com.ekatyhlo.mirpwb,1719402160462
3,投资理财,共富工程,uni.xqrh,1719347300879
4,贷款诈骗,还呗,com.example.byaf.byaf,1719398212372
...,...,...,...,...
193,投资理财,强国复兴,uni.khwm,1719713801202
194,投资理财,慈联,com.dwrcsaht.hzcelz,1719701530760
195,投资理财,强国复兴,uni.xhno,1719698243724
196,投资理财,共富工程,uni.bckx,1719699541723


In [3]:
merged = pd.DataFrame(columns=["type", "name", "pkg_name"])

for name, pkg_name, *_ in normal_apps.values:
    merged.loc[len(merged)] = ["normal", name, pkg_name]

for type, name, pkg_name, *_ in malware_apps.values:
    merged.loc[len(merged)] = [
        {"投资理财": "fraud_investment", "贷款诈骗": "fraud_loan"}[type],
        name,
        pkg_name,
    ]

merged

Unnamed: 0,type,name,pkg_name
0,normal,QQ空间,com.qzone
1,normal,搜狗输入法,com.sohu.inputmethod.sogou
2,normal,UC浏览器,com.UCMobile
3,normal,QQ同步助手,com.tencent.qqpim
4,normal,京东,com.jingdong.app.mall
...,...,...,...
3009,fraud_investment,强国复兴,uni.khwm
3010,fraud_investment,慈联,com.dwrcsaht.hzcelz
3011,fraud_investment,强国复兴,uni.xhno
3012,fraud_investment,共富工程,uni.bckx


In [4]:
import torch
from sklearn.model_selection import train_test_split

torch.manual_seed(114514)

unique_types = merged["type"].unique()
num_labels = len(unique_types)

# Create a label mapping
label_dict = {label: i for i, label in enumerate(unique_types)}
id2label = {i: label for label, i in label_dict.items()}

# Prepare the features (name:pkg_name) and labels
merged["text"] = merged["name"] + ":" + merged["pkg_name"]
merged["label"] = merged["type"].map(label_dict)

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    merged["text"].values,
    merged["label"].values,
    test_size=0.1,
    random_state=114514,
    stratify=merged["label"].values,
)


In [5]:
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


# Create a custom dataset
class AppDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [6]:
# ... existing code ...
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# First, we need to convert text to features for SMOTE
# SMOTE can't work directly on text, so we'll use TF-IDF to vectorize
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(train_texts)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=114514)
X_train_resampled, y_train_resampled, *_ = smote.fit_resample(X_train_vec, train_labels)

# Convert resampled indices back to text
# We need to handle the synthetic samples
train_texts_resampled = []
for i, x_vec in enumerate(X_train_resampled):
    if i < len(train_texts):  # Original sample
        train_texts_resampled.append(train_texts[i])
    else:  # Synthetic sample
        # Find nearest neighbors to create a realistic text representation
        # Here we use the original text of the same class
        same_class_indices = [
            j for j, y in enumerate(train_labels) if y == y_train_resampled[i]
        ]
        if same_class_indices:
            # Choose a random original text from the same class
            idx = same_class_indices[
                torch.randint(0, len(same_class_indices), (1,)).item()
            ]
            train_texts_resampled.append(train_texts[idx])

# Create datasets with resampled data
train_dataset: AppDataset = AppDataset(
    train_texts_resampled, y_train_resampled, tokenizer
)
val_dataset: AppDataset = AppDataset(val_texts, val_labels, tokenizer)

# Create DataLoaders (unchanged)
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
# ... existing code ...

In [7]:
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=num_labels, id2label=id2label, label2id=label_dict
)

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [8]:
from tqdm.notebook import tqdm


# Training function
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({"loss": loss.item()})

    return total_loss / len(dataloader)

In [9]:
# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(dataloader), all_preds, all_labels

In [10]:
from sklearn.metrics import classification_report

# Training loop
epochs = 5
best_val_loss = float("inf")

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Train
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")

    # Evaluate
    val_loss, val_preds, val_labels = evaluate(model, val_dataloader, device)
    print(f"Validation loss: {val_loss:.4f}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(val_labels, val_preds, target_names=unique_types))

    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_roberta_model.pt")
        print("Saved best model!")

# Load the best model
model.load_state_dict(torch.load("best_roberta_model.pt"))


Epoch 1/5


Training:   0%|          | 0/377 [00:00<?, ?it/s]

Training loss: 0.2457


Evaluating:   0%|          | 0/19 [00:00<?, ?it/s]

Validation loss: 0.1326

Classification Report:
                  precision    recall  f1-score   support

          normal       0.99      1.00      0.99       224
fraud_investment       0.98      0.91      0.95        67
      fraud_loan       0.67      0.91      0.77        11

        accuracy                           0.97       302
       macro avg       0.88      0.94      0.90       302
    weighted avg       0.98      0.97      0.97       302

Saved best model!

Epoch 2/5


Training:   0%|          | 0/377 [00:00<?, ?it/s]

Training loss: 0.0412


Evaluating:   0%|          | 0/19 [00:00<?, ?it/s]

Validation loss: 0.1417

Classification Report:
                  precision    recall  f1-score   support

          normal       0.99      1.00      0.99       224
fraud_investment       0.95      0.94      0.95        67
      fraud_loan       0.80      0.73      0.76        11

        accuracy                           0.97       302
       macro avg       0.91      0.89      0.90       302
    weighted avg       0.97      0.97      0.97       302


Epoch 3/5


Training:   0%|          | 0/377 [00:00<?, ?it/s]

Training loss: 0.0146


Evaluating:   0%|          | 0/19 [00:00<?, ?it/s]

Validation loss: 0.1762

Classification Report:
                  precision    recall  f1-score   support

          normal       0.98      1.00      0.99       224
fraud_investment       0.95      0.94      0.95        67
      fraud_loan       0.78      0.64      0.70        11

        accuracy                           0.97       302
       macro avg       0.90      0.86      0.88       302
    weighted avg       0.97      0.97      0.97       302


Epoch 4/5


Training:   0%|          | 0/377 [00:00<?, ?it/s]

Training loss: 0.0103


Evaluating:   0%|          | 0/19 [00:00<?, ?it/s]

Validation loss: 0.1762

Classification Report:
                  precision    recall  f1-score   support

          normal       0.98      1.00      0.99       224
fraud_investment       0.95      0.94      0.95        67
      fraud_loan       0.78      0.64      0.70        11

        accuracy                           0.97       302
       macro avg       0.90      0.86      0.88       302
    weighted avg       0.97      0.97      0.97       302


Epoch 5/5


Training:   0%|          | 0/377 [00:00<?, ?it/s]

Training loss: 0.0119


Evaluating:   0%|          | 0/19 [00:00<?, ?it/s]

Validation loss: 0.1762

Classification Report:
                  precision    recall  f1-score   support

          normal       0.98      1.00      0.99       224
fraud_investment       0.95      0.94      0.95        67
      fraud_loan       0.78      0.64      0.70        11

        accuracy                           0.97       302
       macro avg       0.90      0.86      0.88       302
    weighted avg       0.97      0.97      0.97       302



<All keys matched successfully>