In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix

np.random.seed(412)  # For reproducibility
from scipy.sparse import csc_matrix, csr_matrix

## Exercise 1)

In [2]:
#Loading data


def load_data(file_path):
    """Loads the data matrix X and target vector y from a CSV file"""
   
    # load data
    csv_data = pd.read_csv(file_path)
    print(f"Data features: \n {csv_data.columns.values} \n")

    # Separate features (X) and target (y)
    X = csv_data.iloc[:, :-1].values
    y = csv_data.iloc[:, -1].values

    print(y[2])

    num_ones_or_zeros_y=len(y[y == 0])+len(y[y==1])
    
    print(f"The # of ones and zeros: {num_ones_or_zeros_y} is the same as length of y {y.shape[0]}")


    # Convert y to ±1 (y is in {0, 1})
    y = np.where(y == 0, -1,1)

    return X, y


# Load the dataset
X, y = load_data('data.csv')
print(X.shape)

Data features: 
 ['android.permission.GET_ACCOUNTS'
 'com.sonyericsson.home.permission.BROADCAST_BADGE'
 'android.permission.READ_PROFILE' 'android.permission.MANAGE_ACCOUNTS'
 'android.permission.WRITE_SYNC_SETTINGS'
 'android.permission.READ_EXTERNAL_STORAGE'
 'android.permission.RECEIVE_SMS'
 'com.android.launcher.permission.READ_SETTINGS'
 'android.permission.WRITE_SETTINGS'
 'com.google.android.providers.gsf.permission.READ_GSERVICES'
 'android.permission.DOWNLOAD_WITHOUT_NOTIFICATION'
 'android.permission.GET_TASKS'
 'android.permission.WRITE_EXTERNAL_STORAGE'
 'android.permission.RECORD_AUDIO'
 'com.huawei.android.launcher.permission.CHANGE_BADGE'
 'com.oppo.launcher.permission.READ_SETTINGS'
 'android.permission.CHANGE_NETWORK_STATE'
 'com.android.launcher.permission.INSTALL_SHORTCUT'
 'android.permission.android.permission.READ_PHONE_STATE'
 'android.permission.CALL_PHONE' 'android.permission.WRITE_CONTACTS'
 'android.permission.READ_PHONE_STATE'
 'com.samsung.android.provider

In [3]:
# Display data details
print(f"Loaded dataset with {X.shape[0]} samples and {X.shape[1]} features.")
print(f"Number of malicious data points: {np.sum(y == 1)}")
print(f"Number of non-malicious data points: {np.sum(y == -1)}")

sparsity = len(X[X == 0]) / X.size * 100
print(f"{sparsity:.2f}% of X's entries are 0")


Loaded dataset with 29332 samples and 86 features.
Number of malicious data points: 14700
Number of non-malicious data points: 14632
89.01% of X's entries are 0


We dont need one hot encoding since all the data is binary. the data features and many others, are binary features. Each permission is either granted (1) or not granted (0).

## Exercise 2)

In [4]:
## Create test and training sets



def split_data(X, y, r=0.5):
    """
    Splits the data into training and test sets.
    X should be in CSC (Compressed Sparse Column) format.
    y will be returned as a dense vector.
    r: Test size ratio (0 < r < 1).
    """
    # Convert X to sparse CSC matrix (if not already sparse)
    if not isinstance(X, csc_matrix):
        X_sparse = csc_matrix(X)
    else:
        X_sparse = X
    
    # y is now expected to be a dense array, so no need to convert it to sparse
    y_dense = np.array(y)

    # Shuffle and split data
    indices = np.random.permutation(X_sparse.shape[0])
    split_index = int(X_sparse.shape[0] * (1 - r))

    # Splitting X
    X_train = X_sparse[indices[:split_index], :]
    X_test = X_sparse[indices[split_index:], :]
    
    # Splitting y (kept as dense)
    y_train = y_dense[indices[:split_index]]
    y_test = y_dense[indices[split_index:]]

    return X_train, X_test, y_train, y_test


# Split the dataset (50/50 split)
X_train, X_test, y_train, y_test = split_data(X, y, r=0.5)

## Exercise 3)

In [28]:
import numpy as np


def classify(X, y, w):
    """Returns the number of correctly classified points using the weight vector w."""
    # Perform matrix-vector multiplication (X.dot(w)) and predict using the sign function
    predictions = np.sign(X.dot(w))
    
    # Convert sparse vector y to dense for comparison
    if isinstance(y, np.ndarray):  # If y is already dense
        y_dense = y
    else:
        y_dense = y.toarray().flatten()  # Convert sparse matrix to dense and flatten it
    
    # Compare predictions with true labels and count correct classifications
    correct = np.sum(predictions == y_dense)
    accuracy = correct / len(y_dense)
    
    return correct, accuracy

# Example: Try random weight vector
w_random = np.random.randn(X_train.shape[1])  # Random weight vector of appropriate size
correct, accuracy = classify(X_test, y_test, w_random)
print(f"Random classification accuracy: {accuracy * 100:.2f}%")


Random classification accuracy: 39.12%


We can verify that the output makes sense for random weight vectors by calculating the  trials over N trials:

In [62]:
avg_arr=[]
for i in range(0,100):
    w_random = np.random.randn(X_train.shape[1])  # Random weight vector of appropriate size
    correct,accuracy=classify(X_test,y_test,w_random)
    avg_arr.append(accuracy)


print(f"average accuracy is: {np.mean(avg_arr)}")
    

average accuracy is: 0.49492976953497886


As expected, we get a value around 50%


## Exercise 4)

$
J(w) = \sum_{i=1}^{n} L(y_i x_i^T w) + \frac{\lambda}{2} \| w \|^2
$

where
$
L(s) = \log(1 + e^{-s})
$

Gradient for logistic regression

The gradient of the logistic regression cost function with respect to \( w \) is:

$
\frac{\partial J(\underline{\omega})}{\partial \omega_j} = \lambda \omega_j + \sum_{i=1}^n \frac{-\exp(-y_i \underline{x}_i^T \underline{\omega})}{1 + \exp(-y_i \underline{x}_i^T \underline{\omega})} y_i \underline{x}_i^T \underline{e}_j
$

Gradient for logistic regression

The gradient of the logistic regression cost function with respect to \( w \) is:

$
\frac{\partial J(w)}{\partial w} = \frac{1}{m} X^T \left( \sigma(Xw) - y \right) + \frac{\lambda}{m} w
$

Where:
- $\sigma(z)$ is the **sigmoid function**:

$
\sigma(z) = \frac{1}{1 + e^{-z}}
$

- $ X $ is the matrix of input features.
- $ y $ is the vector of target labels.
- $ w $ is the weight vector.
- $ \lambda $ is the regularization constant.
- $ m $ is the number of training examples.
## Exercise 5)


$
\frac{\partial J(W)}{\partial w_j} = \sum_{i=1}^{n} L(y_i, x_i^T w)
$


In [14]:
def sigmoid(z):
    """Sigmoid function."""
    return 1 / (1 + np.exp(-z))

X = X_train
y = y_train

num_steps = 1000
alpha = 0.01
reg_lambda = 10
w = np.zeros(X.shape[1])
for step in range(num_steps):
    big_sigma = sigmoid(X.dot(w) * y)
    B = (big_sigma - 1) * y
    grad = B.dot(X)
    grad += reg_lambda * w
    w -= alpha * grad



correct, accuracy = classify(X_test, y_test, w)
print(f"Test set classification accuracy: {accuracy * 100:.2f}%")
cost = -np.sum( np.log(big_sigma) ) + 0.5*reg_lambda * w.dot(w)
cost / len(y)


ValueError: operands could not be broadcast together with shapes (14666,) (86,) (14666,) 

In [None]:

def sigmoid(z):
    """Sigmoid function."""
    return 1 / (1 + np.exp(-z))



def logistic_regression_cost_grad(X, y, w, reg_lambda):
    """Calculates the cost and gradient for logistic regression."""
    m = len(y)
    z = X.dot(w)
    # z= np.dot(X,w)
    h = sigmoid(z)

    # print("debug")
    # print(np.dot(X,w))


    # Cost function with regularization
    cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) + (reg_lambda / (2 * m)) * np.dot(w, w)
    cost = (-1 / m) * np.sum(np.log(sigmoid(y*z)) ) + (reg_lambda / (2 * m)) * np.dot(w, w)
    # print(h[0])
    # Gradient with regularization

    #grad = (1 / m) *  X.transpose().dot(h-y) + (reg_lambda / m) * w


    big_sigma = sigmoid( z * y )
    B = (big_sigma - 1) * y
    grad = (1 / m) *  np.dot(X, B) + (reg_lambda / m) * w
    # grad = (1 / m) * np.dot(X.T, (h - y)) + (reg_lambda / m) * w


    return cost, grad


def logistic_regression(X, y, alpha, reg_lambda, num_steps):
    """Performs gradient descent to find the optimal weight vector for logistic regression."""
    # Initialize weight vector w
    w = np.zeros(X.shape[1])
    # X is sparse and has much less columns than rows,
    X_sparse = csc_matrix(X)

    for step in range(num_steps):
        cost, grad = logistic_regression_cost_grad(X_sparse, y, w, reg_lambda)
        w -= alpha * grad  # Gradient descent update

        # Optional: print progress
        if step % 100 == 0:
            print(f"Step {step}, Cost: {cost}")

    return w


# Train logistic regression with gradient descent
w_trained = logistic_regression(X_train, y_train, alpha=0.01, reg_lambda=10, num_steps=10000)

# Evaluate performance on the test set
correct, accuracy = classify(X_test, y_test, w_trained)
print(f"Test set classification accuracy: {accuracy * 100:.2f}%")

### Exercise 6) 



In [None]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

# Load the dataset with fake points.
csv_data = pd.read_csv("data2.csv")
X = csv_data.iloc[:, :-1].values

# Use PCA with the number of PCA axes the same as our number of features.
# Additionaly whiten the data, meaning dividing each column by its standard deviation.
pca = PCA(whiten=True)
pca.fit(X)
X_pca = pca.transform(X)

# Affirming that each column has a standard deviation of 1.
epsilon = 0.01
assert np.all(np.abs(np.std(X_pca, axis=0) - 1) <= epsilon)

# Calculate the norm of points in the whitened PCA basis.
# Points with a large norm can be identified as potential outliers (refer to Lecture 8).
norm = np.linalg.norm(X_pca, axis=1) 

# Assuming we know that 2000 fake datapoints were added, 
# find the norm threshold that is less than exactly 2000 datapoints
sorted_norm = np.sort(norm)
threshold = sorted_norm[-2000];
outliers = norm > threshold



In [None]:
import matplotlib.pyplot as plt

# One assumption we made is that the PCA coordinates
# should be normally distributed and in fact we can see this is the case.
plt.hist(X_pca[:, 5], bins=30, color='skyblue', edgecolor='black')
plt.title('Whitened PCA Axis 5 Coordinate')
plt.xlabel('PCA Coordinate 5 Value')
plt.ylabel('Frequency')
plt.show()


In [None]:
# This visualization shows the outlier points detected by thresholding the norms 
# of the whitened data, compared to the actual fake points that were added. 
# Most of the true fake points are clustered around (-1, 0) in the
# PCA-1 vs. PCA-2 coordinate space. We can also see that most of fake points were identified.
# Note: We found that the true outliers were the last 2000 points of X in data2.csv.

ax1 = 0
ax2 = 1
plt.scatter(X_pca[:,ax1], X_pca[:,ax2], c=norm)
plt.scatter(X_pca[outliers,ax1], X_pca[outliers,ax2], c="red", label="Detected Outliers")
plt.scatter(X_pca[-2000:,ax1], X_pca[-2000:,ax2],10, c="green", label="True Outliers")
plt.xlabel("PCA Axis 1")
plt.ylabel("PCA Axis 2")
plt.title("Norms of Whitened Data in PCA Basis")
plt.legend()
plt.colorbar()


In [None]:
# Fake points detected vs false positive statistics.

outlier_indices = np.array(np.where(outliers)).T
number_fake_detected = np.sum(outlier_indices > X.shape[0]-2000)
print(f"Number of fake points detected: {number_fake_detected}")
print(f"Number of real points falsely detected as fake: {2000 - number_fake_detected}")
