# COSC 522 Final Project

## Microsoft Malware Prediction Kaggle Competition

### Vineeth Konjeti, Steven Dao, Jonathan Tran

In [4]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/Shareddrives/'522 Final Project'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Shareddrives/522 Final Project


## Unzipping the data

In [5]:
# import zipfile

# with zipfile.ZipFile('microsoft-malware-prediction.zip', 'r') as zip_obj:
#     zip_obj.extractall('microsoft-malware-prediction/')

## Initial Setup

In [6]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif


import warnings
warnings.filterwarnings('ignore')


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [7]:
train_df = pd.read_csv('microsoft-malware-prediction/train.csv', nrows=1000000)
test_df = pd.read_csv('microsoft-malware-prediction/test.csv', nrows=1000000)

print(f'Training data shape: {train_df.shape}')
print(f'Testing data shape: {test_df.shape}')

Training data shape: (1000000, 83)
Testing data shape: (1000000, 82)


In [8]:
train_df.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7.0,0,,53447.0,...,36144.0,0,,0.0,0,0,0.0,0.0,10.0,0
1,000007535c3f730efa9ea0b7ef1bd645,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7.0,0,,53447.0,...,57858.0,0,,0.0,0,0,0.0,0.0,8.0,0
2,000007905a28d863f6d0d597892cd692,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7.0,0,,53447.0,...,52682.0,0,,0.0,0,0,0.0,0.0,3.0,0
3,00000b11598a75ea8ba1beea8459149f,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,0,7.0,0,,53447.0,...,20050.0,0,,0.0,0,0,0.0,0.0,3.0,1
4,000014a5f00daa18e76b81417eeb99fc,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,0,7.0,0,,53447.0,...,19844.0,0,0.0,0.0,0,0,0.0,0.0,1.0,1


## Data Processing

In [9]:
# Function to display missing values
def display_missing(df):
    for col in df.columns.tolist():
        print(f'{col}: {df[col].isnull().sum()} missing values')

# Display missing values in training data
print('Missing values in training data:')
display_missing(train_df)

for col in train_df.columns:
    if col == 'HasDetections':
      continue
    if train_df[col].dtype == 'object':
        train_df[col].fillna(train_df[col].mode()[0], inplace=True)
        test_df[col].fillna(test_df[col].mode()[0], inplace=True)
    else:
        train_df[col].fillna(train_df[col].median(), inplace=True)
        test_df[col].fillna(test_df[col].median(), inplace=True)


Missing values in training data:
MachineIdentifier: 0 missing values
ProductName: 0 missing values
EngineVersion: 0 missing values
AppVersion: 0 missing values
AvSigVersion: 0 missing values
IsBeta: 0 missing values
RtpStateBitfield: 3666 missing values
IsSxsPassiveMode: 0 missing values
DefaultBrowsersIdentifier: 951697 missing values
AVProductStatesIdentifier: 4093 missing values
AVProductsInstalled: 4093 missing values
AVProductsEnabled: 4093 missing values
HasTpm: 0 missing values
CountryIdentifier: 0 missing values
CityIdentifier: 36513 missing values
OrganizationIdentifier: 308936 missing values
GeoNameIdentifier: 19 missing values
LocaleEnglishNameIdentifier: 0 missing values
Platform: 0 missing values
Processor: 0 missing values
OsVer: 0 missing values
OsBuild: 0 missing values
OsSuite: 0 missing values
OsPlatformSubRelease: 0 missing values
OsBuildLab: 3 missing values
SkuEdition: 0 missing values
IsProtected: 4076 missing values
AutoSampleOptIn: 0 missing values
PuaMode: 9997

In [10]:
# Combine train and test data for consistent encoding
combined = pd.concat([train_df.drop('HasDetections', axis=1), test_df], axis=0)

# Identify categorical features
categorical_feats = [col for col in combined.columns if combined[col].dtype == 'object']

# Label encode categorical variables
for col in categorical_feats:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))

# Split back into train and test data
train_df_encoded = combined[:len(train_df)]
test_df_encoded = combined[len(train_df):]

# Add target variable back to train_df_encoded
train_df_encoded['HasDetections'] = train_df['HasDetections'].values


### Reduce Memory Usage

In [11]:
def reduce_mem_usage(df):
    """Function to reduce the memory usage of a DataFrame."""
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Initial memory usage: {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object and str(col_type)[:3] != 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > -128 and c_max < 127:
                    df[col] = df[col].astype('int8')
                elif c_min > -32768 and c_max < 32767:
                    df[col] = df[col].astype('int16')
                elif c_min > -2147483648 and c_max < 2147483647:
                    df[col] = df[col].astype('int32')
                else:
                    df[col] = df[col].astype('int64')
            else:
                df[col] = df[col].astype('float32')
        elif col_type == object:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Final memory usage: {end_mem:.2f} MB')
    print(f'Reduced by {(100 * (start_mem - end_mem) / start_mem):.1f}%')

    return df

# Apply memory reduction
train_df_encoded = reduce_mem_usage(train_df_encoded)
test_df_encoded = reduce_mem_usage(test_df_encoded)


Initial memory usage: 640.87 MB
Final memory usage: 503.54 MB
Reduced by 21.4%
Initial memory usage: 633.24 MB
Final memory usage: 495.91 MB
Reduced by 21.7%


## Feature Selection

In [12]:
# Separate features and target
X = train_df_encoded.drop(['HasDetections', 'MachineIdentifier'], axis=1)
y = train_df_encoded['HasDetections']
X_test = test_df_encoded.drop('MachineIdentifier', axis=1)



In [13]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)


### LDA

In [14]:
# Initialize LDA
lda = LinearDiscriminantAnalysis(n_components=1)

# Fit LDA on training data
lda.fit(X_scaled, y)

# Transform data
X_lda = lda.transform(X_scaled)
X_test_lda = lda.transform(X_test_scaled)



In [15]:
# prompt: compare the shapes of the data before and after lda

print(f"Original training data shape: {X_scaled.shape}")
print(f"LDA transformed training data shape: {X_lda.shape}")
print(f"Original testing data shape: {X_test_scaled.shape}")
print(f"LDA transformed testing data shape: {X_test_lda.shape}")

Original training data shape: (1000000, 81)
LDA transformed training data shape: (1000000, 1)
Original testing data shape: (1000000, 81)
LDA transformed testing data shape: (1000000, 1)


In [16]:
X_test_lda[:10]

array([[ 0.4610647 ],
       [ 0.67963601],
       [-0.36171516],
       [-0.6486339 ],
       [ 0.82799441],
       [ 0.05313107],
       [-0.67153394],
       [-2.08513637],
       [ 0.40112641],
       [-0.20343294]])

## Processing

### Basic LightGBM Model

In [17]:
# Split data into training and validation sets
X_train_scaled, X_val_scaled, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train_lda, X_val_lda, y_train, y_val = train_test_split(X_lda, y, test_size=0.2, random_state=42)

# LightGBM model for standard scaled data
lgb_model_scaled = lgb.LGBMClassifier(objective='binary', random_state=42)
lgb_model_scaled.fit(X_train_scaled, y_train)

# Make predictions on validation set
y_pred_scaled = lgb_model_scaled.predict_proba(X_val_scaled)[:, 1]
auc_scaled = roc_auc_score(y_val, y_pred_scaled)

print(f"---- AUC score for Standard Scaled data: {auc_scaled} ----")

# LightGBM model for LDA transformed data
lgb_model_lda = lgb.LGBMClassifier(objective='binary', random_state=42)
lgb_model_lda.fit(X_train_lda, y_train)

# Make predictions on validation set
y_pred_lda = lgb_model_lda.predict_proba(X_val_lda)[:, 1]
auc_lda = roc_auc_score(y_val, y_pred_lda)

print(f"---- AUC score for LDA transformed data: {auc_lda} ----")

[LightGBM] [Info] Number of positive: 399833, number of negative: 400167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5163
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499791 -> initscore=-0.000835
[LightGBM] [Info] Start training from score -0.000835
---- AUC score for Standard Scaled data: 0.7133531729841269 ----
[LightGBM] [Info] Number of positive: 399833, number of negative: 400167
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010802 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 1
[LightGBM] [I