In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler



In [2]:
import re

def avg_length_string_variables(source_code):
    """Average size of double-quoted string assignments found in ``source_code``.

    Each match of ``=``, optional whitespace, and a double-quoted literal is
    measured as the length of the matched text minus 3 (the ``=`` sign and the
    two quote characters). Whitespace between ``=`` and the opening quote is
    part of the match, so it is still included in the measured size.

    Returns:
        The mean of the measured sizes, or 0 when no assignment matches.
    """
    total_size = 0
    match_count = 0
    for match in re.finditer(r'=\s*"[^"]*"', source_code):
        total_size += len(match.group(0)) - 3
        match_count += 1

    if match_count == 0:
        return 0
    return total_size / match_count

def check_for_real_words(source_code):
    """Flag inputs whose length is exactly one.

    Despite the name, no word analysis happens here: the function returns
    1000 when ``len(source_code) == 1`` and 0 for every other length.
    """
    if len(source_code) != 1:
        return 0
    return 1000

def count_integer_variables(source_code):
    """Density of ``Dim <name> As Integer`` declarations in ``source_code``.

    The number of (case-sensitive) matches is divided by the number of
    whitespace-separated tokens in the input.

    Returns:
        The ratio as a float; 0.0 when the input has no tokens at all.
    """
    token_count = len(source_code.split())
    if token_count == 0:
        # Empty or whitespace-only input has no tokens; dividing by zero here
        # would abort feature extraction for the whole dataset.
        return 0.0

    declarations = re.findall(r'\bDim\s+\w+\s+As\s+Integer\b', source_code)
    return len(declarations) / token_count

def count_string_variables(source_code):
    """Density of ``Dim <name> As String`` declarations in ``source_code``.

    The number of (case-sensitive) matches is divided by the number of
    whitespace-separated tokens in the input.

    Returns:
        The ratio as a float; 0.0 when the input has no tokens at all.
    """
    token_count = len(source_code.split())
    if token_count == 0:
        # Empty or whitespace-only input has no tokens; dividing by zero here
        # would abort feature extraction for the whole dataset.
        return 0.0

    declarations = re.findall(r'\bDim\s+\w+\s+As\s+String\b', source_code)
    return len(declarations) / token_count

def has_macro_keywords(source_code):
    """Report whether any watched keyword occurs in ``source_code``.

    The check is a case-insensitive substring search, so a keyword also
    matches inside longer identifiers, and ``'13'`` matches any occurrence of
    those two digits.

    Returns:
        True if at least one keyword is present, otherwise False.
    """
    lowered = source_code.lower()
    for needle in ('autoopen', 'autoclose', 'documentopen', 'documentclose', '13', 'cells', 'value'):
        if needle in lowered:
            return True
    return False


def max_consecutive_math_operations(source_code):
    """Length of the longest run of adjacent arithmetic operator characters.

    Only runs of two or more of ``+``, ``-``, ``*`` and ``/`` are considered.

    Returns:
        The length of the longest such run, or 0 when there is none.
    """
    run_lengths = [len(run) for run in re.findall(r'[\+\-\*/]{2,}', source_code)]
    return max(run_lengths, default=0)


In [3]:
def extract_features(source_code):
    """Compute the hand-crafted numeric features for one source string.

    Returns:
        A dict mapping each feature name to the value produced by the
        corresponding helper; the keyword flag is converted to 0/1.
    """
    features = {}
    features['avg_var_assignment_length'] = avg_length_string_variables(source_code)
    features['count_int_vars'] = count_integer_variables(source_code)
    features['count_string_vars'] = count_string_variables(source_code)
    # Stored as an int so the column is numeric rather than boolean.
    features['macro_keywords'] = int(has_macro_keywords(source_code))
    features['max_consecutive_math_ops'] = max_consecutive_math_operations(source_code)
    features['one_char'] = check_for_real_words(source_code)
    return features


def _build_feature_frame(code_series):
    """Return the raw ``vba_code`` column joined with its engineered features.

    Args:
        code_series: pandas Series holding one source string per row.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose first column is
        ``vba_code`` and whose remaining columns are the keys produced by
        ``extract_features``.
    """
    # Reset the index so the positional feature rows line up in the concat.
    code_df = code_series.to_frame(name='vba_code').reset_index(drop=True)
    feature_rows = code_df['vba_code'].apply(extract_features).tolist()
    return pd.concat([code_df, pd.DataFrame(feature_rows)], axis=1)


validation_file_path = "validation_dataset.csv"
df_validation = pd.read_csv(validation_file_path, encoding='utf-16-le')

# Encode labels as integers: 'white' -> 1, 'mal' -> 0. Series.map turns any
# other value into NaN, so fail here with a clear message instead of letting
# the metric functions fail later on NaN targets.
label_codes = df_validation['label'].map({'white': 1, 'mal': 0})
if label_codes.isna().any():
    unexpected = sorted(df_validation.loc[label_codes.isna(), 'label'].astype(str).unique())
    raise ValueError("Unexpected values in 'label' column: {}".format(unexpected))
df_validation['label'] = label_codes

x_validation = df_validation["vba_code"]
y_validation = df_validation["label"]

test_file_path = 'test_dataset_without_labels.csv'
df_test = pd.read_csv(test_file_path, encoding='utf-16-le')
x_test = df_test['vba_code']

# Same feature pipeline for both datasets.
x_validation_combined = _build_feature_frame(x_validation)
x_test_combined = _build_feature_frame(x_test)

print(x_validation_combined.columns)
print(x_test_combined.columns)

Index(['vba_code', 'avg_var_assignment_length', 'count_int_vars',
       'count_string_vars', 'macro_keywords', 'max_consecutive_math_ops',
       'one_char'],
      dtype='object')
Index(['vba_code', 'avg_var_assignment_length', 'count_int_vars',
       'count_string_vars', 'macro_keywords', 'max_consecutive_math_ops',
       'one_char'],
      dtype='object')


In [4]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, classification_report


# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'train.pkl' when it comes from a trusted source.
with open('train.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Let prediction errors propagate. Catching ValueError/KeyError and merely
# printing them left `validation_predictions` undefined (NameError below) or,
# worse, silently reused predictions from an earlier run of this cell.
validation_predictions = loaded_model.predict(x_validation_combined)

# precision_score / recall_score are called with their default positive class
# (label 1), which this notebook maps to 'white'.
precision = precision_score(y_validation, validation_predictions)
recall = recall_score(y_validation, validation_predictions)
accuracy = accuracy_score(y_validation, validation_predictions)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
      round(precision, 3), round(recall, 3), round(accuracy, 3)))


print(loaded_model.named_steps)
print(classification_report(y_validation, validation_predictions))
# Rows/columns follow the sorted unique labels found in y_validation.
print(confusion_matrix(y_validation, validation_predictions, labels=np.unique(y_validation)))

Precision: 0.993 / Recall: 0.998 / Accuracy: 0.995
{'preprocessor': ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['avg_var_assignment_length', 'count_int_vars',
                                  'count_string_vars', 'macro_keywords',
                                  'max_consecutive_math_ops', 'one_char']),
                                ('text', TfidfVectorizer(), 'vba_code')]), 'feature_selection': SelectFromModel(estimator=RandomForestClassifier(max_depth=10, random_state=42)), 'classification': RandomForestClassifier(random_state=42)}
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5320
           1       0.99      1.00      1.00      5309

    accuracy                           1.00     10629
   macro avg       1.00      1.00      1.00     10629
weighted avg       1.00      1.00      1.00     10629

[[5285   35]
 [  13 5296]]


In [5]:

# Score the unlabeled test set with the loaded model.
test_prediction = loaded_model.predict(x_test_combined)

# Numeric class -> text label written to the output file.
dictionary = {1: 'white', 0: 'mal'}

predictions = list(map(dictionary.__getitem__, test_prediction))

# Single-column frame, written without the index column.
predictions_df = pd.DataFrame({'prediction': predictions})
predictions_df.to_csv('test_prediction.csv', index=False)