# Week 1: Data Preparation & Baseline Model
## ARM Android Malware Detection Project

**Date:** January 26, 2026  
**Goal:** Load data, perform EDA, create baseline Random Forest model

---

## Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn grid theme and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# One seed for NumPy's global RNG; the same constant is handed to
# scikit-learn via random_state further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("‚úÖ Imports successful!")

## 1. Load Data

In [None]:
# Read the feature table and report its size plus the first ten column names.
features_csv = '../data/data_sample_25k.csv'
print("Loading data_sample_25k.csv...")
df_features = pd.read_csv(features_csv)
print(f"Features shape: {df_features.shape}")
first_columns = list(df_features.columns[:10])
print(f"Columns: {first_columns}...")  # First 10 columns

# Read the label table and keep only its first 25,000 rows.
labels_csv = '../data/mh_100k_labels.csv'
print("\nLoading labels...")
df_labels = pd.read_csv(labels_csv)
df_labels_25k = df_labels.iloc[:25000]
print(f"Labels shape: {df_labels_25k.shape}")

# Preview of the feature table (rendered as the cell output).
df_features.head()

## 2. Merge Features with Labels

In [None]:
# Attach the CLASS label to every feature row.
#
# Preferred path: join on the SHA256 column. The join runs against the FULL
# label table (df_labels) rather than the first-25,000-row slice: an inner join
# against the slice silently discards every sample whose hash is not among
# those first rows. When all hashes are inside the slice the result is the same.
if 'SHA256' in df_features.columns and 'SHA256' in df_labels.columns:
    # One label row per hash, so the join cannot multiply feature rows.
    label_lookup = df_labels[['SHA256', 'CLASS']].drop_duplicates(subset='SHA256')
    df = pd.merge(df_features, label_lookup, on='SHA256', how='inner')
    print("‚úÖ Merged on SHA256")

    # Make any loss of samples visible instead of shrinking the dataset quietly.
    n_unmatched = len(df_features) - len(df)
    if n_unmatched > 0:
        print(f"WARNING: {n_unmatched} feature rows had no matching SHA256 label and were dropped")
else:
    # Fallback: positional alignment. This is only correct if row i of the
    # feature file and row i of the label file describe the same sample —
    # NOTE(review): nothing in this notebook verifies that; confirm upstream.
    if len(df_labels_25k) < len(df_features):
        raise ValueError(
            f"Cannot align by position: {len(df_labels_25k)} labels "
            f"for {len(df_features)} feature rows"
        )
    df = df_features.copy()
    df['CLASS'] = df_labels_25k['CLASS'].values[:len(df_features)]
    print("‚úÖ Aligned rows")

print(f"\nMerged dataset shape: {df.shape}")
print(f"Columns: {df.shape[1]}")
print(f"Samples: {df.shape[0]}")

df.head()

## 3. Exploratory Data Analysis (EDA)

### 3.1 Class Distribution

In [None]:
# Count samples per class label and print absolute / relative frequencies.
class_counts = df['CLASS'].value_counts()
print("Class Distribution:")
print(class_counts)
print(f"\nBenign (0): {class_counts.get(0, 0)} ({class_counts.get(0, 0)/len(df)*100:.2f}%)")
print(f"Malware (1): {class_counts.get(1, 0)} ({class_counts.get(1, 0)/len(df)*100:.2f}%)")

# Visualize
plt.figure(figsize=(10, 6))
# value_counts() orders the bars by frequency, not by label, so a positional
# colour list would paint the majority class green even when it is malware.
# Look the colour up per label instead (grey for any unexpected label).
label_colors = {0: '#2ecc71', 1: '#e74c3c'}  # Green for benign, red for malware
colors = [label_colors.get(label, '#7f8c8d') for label in class_counts.index]
class_counts.plot(kind='bar', color=colors)
plt.title('Class Distribution (0=Benign, 1=Malware)', fontsize=16, fontweight='bold')
plt.xlabel('Class', fontsize=13)
plt.ylabel('Count', fontsize=13)
plt.xticks(rotation=0)
# Write each count just above its bar.
for i, v in enumerate(class_counts):
    plt.text(i, v + 100, str(v), ha='center', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

# Check for imbalance: ratio of the rarest to the most common class (1.0 = balanced).
imbalance_ratio = class_counts.min() / class_counts.max()
print(f"\nClass Imbalance Ratio: {imbalance_ratio:.3f}")
if imbalance_ratio < 0.3:
    print("‚ö†Ô∏è Significant class imbalance detected!")
else:
    print("‚úÖ Class distribution is reasonable")

### 3.2 Missing Values

In [None]:
# Per-column NaN counts and the same figures as a percentage of all rows.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

if missing.sum() != 0:
    print(f"‚ö†Ô∏è Found {missing.sum()} missing values:")
    # Only list the columns that actually contain NaNs.
    affected = missing > 0
    missing_df = pd.DataFrame({
        'Column': missing[affected].index,
        'Missing': missing[affected].values,
        'Percentage': missing_pct[affected].values
    })
    print(missing_df)
else:
    print("‚úÖ No missing values found!")

### 3.3 Feature Analysis

In [None]:
# Every column that is not listed as metadata is treated as a model feature.
metadata_cols = ['SHA256', 'NOME', 'PACOTE', 'API_MIN', 'API', 'CLASS']
is_feature = ~df.columns.isin(metadata_cols)
feature_cols = df.columns[is_feature].tolist()

print(f"Total Features: {len(feature_cols)}")
print(f"Feature columns: {feature_cols[:10]}...")  # First 10

# Show the distinct values of the first five features (quick check for binary encoding).
print("\nFeature Value Distribution (first 5 features):")
for col in feature_cols[:5]:
    unique_vals = df[col].unique()
    print(f"  {col}: {unique_vals}")

### 3.4 Feature Frequency

In [None]:
# Column totals over all samples, largest first.
feature_sums = df[feature_cols].sum().sort_values(ascending=False)

print("Top 10 Most Frequent Features:")
print(feature_sums.head(10))

# Bar chart of the 20 largest totals.
fig, ax = plt.subplots(figsize=(14, 6))
feature_sums.head(20).plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Top 20 Feature Frequency', fontsize=16, fontweight='bold')
ax.set_xlabel('Features', fontsize=13)
ax.set_ylabel('Count (Presence in samples)', fontsize=13)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Sparsity = share of feature cells that are exactly zero.
zero_cells = (df[feature_cols] == 0).sum().sum()
total_cells = len(df) * len(feature_cols)
sparsity = zero_cells / total_cells
print(f"\nFeature Sparsity: {sparsity*100:.2f}% (percentage of zeros)")

### 3.5 Feature Statistics by Class

In [None]:
# Column totals computed separately for rows labelled 0 and rows labelled 1.
# These are raw sums, not rates, so they are not normalised by class size.
benign_features = df.loc[df['CLASS'] == 0, feature_cols].sum()
malware_features = df.loc[df['CLASS'] == 1, feature_cols].sum()

# Text summary: ten largest totals per class.
print("Top 10 Features in Benign Apps:")
print(benign_features.sort_values(ascending=False).head(10))

print("\nTop 10 Features in Malware Apps:")
print(malware_features.sort_values(ascending=False).head(10))

# Side-by-side horizontal bar charts of the fifteen largest totals per class.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

panels = (
    (axes[0], benign_features, 'green', 'Top 15 Features in Benign Apps'),
    (axes[1], malware_features, 'red', 'Top 15 Features in Malware Apps'),
)
for panel_ax, totals, bar_color, panel_title in panels:
    totals.sort_values(ascending=False).head(15).plot(
        kind='barh', ax=panel_ax, color=bar_color, alpha=0.7
    )
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Count')

plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# Replace NaNs in the feature columns with 0 and report whether any were present.
has_missing = df[feature_cols].isnull().sum().sum() > 0
if not has_missing:
    print("‚úÖ No missing values to handle")
else:
    print("Filling missing values with 0...")
    df[feature_cols] = df[feature_cols].fillna(0)
    print("‚úÖ Missing values handled")

# Coerce each feature column to a numeric dtype; entries that cannot be parsed
# become NaN under errors='coerce' and are then set to 0.
for feature_name in feature_cols:
    as_numeric = pd.to_numeric(df[feature_name], errors='coerce')
    df[feature_name] = as_numeric.fillna(0)

print("‚úÖ All features are numeric")

## 5. Train/Test Split (80:20)

In [None]:
# Feature matrix and label vector as plain NumPy arrays.
X = df[feature_cols].values
y = df['CLASS'].values

print(f"Feature matrix X: {X.shape}")
print(f"Label vector y: {y.shape}")

# 80/20 split, stratified on y so both parts keep the class proportions.
split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report size and per-class counts for each part.
for split_name, split_X, split_y in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(f"\n‚úÖ {split_name} set: {split_X.shape[0]} samples")
    print(f"   - Benign: {(split_y == 0).sum()}")
    print(f"   - Malware: {(split_y == 1).sum()}")

## 6. Baseline Random Forest Model

In [None]:
# Baseline: a 100-tree random forest with otherwise default hyperparameters,
# trained on all available cores (n_jobs=-1) with a fixed seed.
print("Training Random Forest with 100 trees...")
rf_settings = {
    'n_estimators': 100,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'verbose': 0,
}
rf_baseline = RandomForestClassifier(**rf_settings)

rf_baseline.fit(X_train, y_train)
print("‚úÖ Model trained!")

# Predict on both splits; the training predictions feed the train-accuracy figure.
y_train_pred, y_test_pred = (rf_baseline.predict(part) for part in (X_train, X_test))
print("‚úÖ Predictions complete!")

## 7. Model Evaluation

### 7.1 Performance Metrics

In [None]:
# Headline scores. precision / recall / F1 use scikit-learn's default positive
# label (1), which this notebook treats as the malware class.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

# Confusion matrix. The label order is pinned to [0, 1] so the result is always
# 2x2: without it, a test split or prediction vector containing a single class
# yields a 1x1 matrix and the four-way unpacking below raises ValueError.
cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
# False positive rate = FP / (FP + TN); guarded for the no-negatives case.
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

# Display results
print("="*80)
print("BASELINE MODEL PERFORMANCE")
print("="*80)
print(f"Training Accuracy:   {train_acc*100:.2f}%")
print(f"Testing Accuracy:    {test_acc*100:.2f}%")
print(f"Precision:           {precision*100:.2f}%")
print(f"Recall:              {recall*100:.2f}%")
print(f"F1-Score:            {f1*100:.2f}%")
print(f"False Positive Rate: {fpr*100:.2f}%")
print("="*80)

# Comparison with ARM paper target
print("\nüìä Comparison with ARM Paper:")
print(f"   Target Accuracy: ~98.6%")
print(f"   Our Accuracy: {test_acc*100:.2f}%")
# Qualitative verdict on test accuracy: >=95% great, >=90% good, otherwise flag it.
if test_acc >= 0.95:
    print("   ‚úÖ Great baseline performance!")
elif test_acc >= 0.90:
    print("   ‚úÖ Good baseline performance")
else:
    print("   ‚ö†Ô∏è Room for improvement")

### 7.2 Confusion Matrix

In [None]:
# Annotated heatmap of the confusion matrix (rows = actual, columns = predicted).
class_names = ['Benign', 'Malware']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    xticklabels=class_names,
    yticklabels=class_names,
    annot_kws={'size': 16, 'weight': 'bold'},
    ax=ax,
)
ax.set_title('Confusion Matrix - Baseline Model', fontsize=16, fontweight='bold')
ax.set_xlabel('Predicted', fontsize=13)
ax.set_ylabel('Actual', fontsize=13)
fig.tight_layout()
plt.show()

# The four cells of the matrix, printed individually.
cell_report = (
    ("True Negatives (TN):  ", tn),
    ("False Positives (FP): ", fp),
    ("False Negatives (FN): ", fn),
    ("True Positives (TP):  ", tp),
)
for prefix, count in cell_report:
    print(f"{prefix}{count}")

### 7.3 Classification Report

In [None]:
# Per-class precision / recall / F1 / support on the test split.
class_names = ['Benign', 'Malware']
report_text = classification_report(y_test, y_test_pred, target_names=class_names)
print(report_text)

### 7.4 Feature Importance

In [None]:
# Pair each feature name with the forest's importance score, highest first.
feature_importance = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': rf_baseline.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

print("Top 20 Important Features:")
print(feature_importance.head(20))

# Horizontal bar chart of the 20 highest-scoring features, best at the top.
top_20 = feature_importance.head(20)
bar_positions = range(len(top_20))
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_20['importance'].values, color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['feature'].values)
ax.set_xlabel('Importance', fontsize=13)
ax.set_title('Top 20 Feature Importance - Baseline Model', fontsize=16, fontweight='bold')
ax.invert_yaxis()  # position 0 (highest importance) is drawn at the top
fig.tight_layout()
plt.show()

## 8. Summary & Next Steps

In [None]:
# Closing recap: dataset size, class balance, headline scores, and the plan for week 2.
banner = "=" * 80
n_malware = (y == 1).sum()
n_benign = (y == 0).sum()

print(banner)
print("‚úÖ WEEK 1 COMPLETE!")
print(banner)
print("\nüìä Summary:")
print(f"   - Dataset: {len(df)} samples with {len(feature_cols)} features")
print(f"   - Malware: {n_malware} ({n_malware/len(y)*100:.2f}%)")
print(f"   - Benign: {n_benign} ({n_benign/len(y)*100:.2f}%)")
print(f"   - Baseline Accuracy: {test_acc*100:.2f}%")
print(f"   - F1-Score: {f1*100:.2f}%")

next_steps = (
    "Implement Mutual Information feature selection",
    "Select top 50-80 features based on MI scores",
    "Retrain Random Forest with selected features",
    "Compare performance with baseline",
)
print("\nüéØ Next Steps (Week 2):")
for step_number, step_text in enumerate(next_steps, start=1):
    print(f"   {step_number}. {step_text}")
print(banner)