In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


In [2]:
import re

def avg_length_string_variables(source_code):
    """Return the mean length of double-quoted string literals that follow ``=``.

    Args:
        source_code: Text to scan.

    Returns:
        The average number of characters between the quotes over all
        ``= "..."`` occurrences, or 0 when there are none.
    """
    # Capture only the text between the quotes. Measuring the whole match and
    # subtracting a fixed 3 over-counts whenever whitespace separates ``=``
    # from the opening quote (e.g. ``x = "abc"``).
    literals = re.findall(r'=\s*"([^"]*)"', source_code)
    if not literals:
        return 0
    return sum(len(literal) for literal in literals) / len(literals)

def check_for_real_words(source_code):
    """Return 1000 when ``source_code`` has a length of exactly one, else 0.

    NOTE(review): despite the name, no word lookup happens here; the only
    thing inspected is ``len(source_code)``.
    """
    if len(source_code) == 1:
        return 1000
    return 0

def count_integer_variables(source_code):
    """Return the number of ``Dim <name> As Integer`` declarations per token.

    Args:
        source_code: Text to scan.

    Returns:
        Count of matching declarations divided by the number of
        whitespace-separated tokens, or 0 when the text has no tokens.
    """
    tokens = source_code.split()
    # Empty or whitespace-only input has no tokens; return 0 rather than
    # raising ZeroDivisionError.
    if not tokens:
        return 0
    declarations = re.findall(r'\bDim\s+\w+\s+As\s+Integer\b', source_code)
    return len(declarations) / len(tokens)

def count_string_variables(source_code):
    """Return the number of ``Dim <name> As String`` declarations per token.

    Args:
        source_code: Text to scan.

    Returns:
        Count of matching declarations divided by the number of
        whitespace-separated tokens, or 0 when the text has no tokens.
    """
    tokens = source_code.split()
    # Empty or whitespace-only input has no tokens; return 0 rather than
    # raising ZeroDivisionError.
    if not tokens:
        return 0
    declarations = re.findall(r'\bDim\s+\w+\s+As\s+String\b', source_code)
    return len(declarations) / len(tokens)

def has_macro_keywords(source_code):
    """Report whether any of a fixed set of keywords occurs in the text.

    The check is a case-insensitive plain substring test, so a keyword also
    matches inside longer tokens (e.g. ``'13'`` matches within ``213``).

    Args:
        source_code: Text to scan.

    Returns:
        True if at least one keyword is found, otherwise False.
    """
    keywords = ['autoopen', 'autoclose', 'documentopen', 'documentclose', '13', 'cells', 'value']
    # Lower-case the text once instead of once per keyword.
    lowered = source_code.lower()
    return any(keyword in lowered for keyword in keywords)


def max_consecutive_math_operations(source_code):
    """Return the length of the longest run of adjacent ``+ - * /`` characters.

    Only runs of two or more operator characters are considered; when the
    text contains no such run the result is 0.
    """
    run_lengths = [len(run) for run in re.findall(r'[\+\-\*/]{2,}', source_code)]
    return max(run_lengths, default=0)


In [3]:
def extract_features(source_code):
    """Compute the hand-crafted features for one source string.

    Returns a dict with one entry per feature; keys are inserted in a fixed
    order, which determines the column order when the dicts are turned into
    a DataFrame.
    """
    features = {}
    features['avg_var_assignment_length'] = avg_length_string_variables(source_code)
    features['count_int_vars'] = count_integer_variables(source_code)
    features['count_string_vars'] = count_string_variables(source_code)
    # Stored as 0/1 rather than a bool.
    features['macro_keywords'] = int(has_macro_keywords(source_code))
    features['max_consecutive_math_ops'] = max_consecutive_math_operations(source_code)
    features['one_char'] = check_for_real_words(source_code)
    return features


# Load the dataset
train_file_path = "train_dataset.csv"
df = pd.read_csv(train_file_path, encoding='utf-16-le')

# Encode the textual labels as integers ('white' -> 1, 'mal' -> 0).
# Series.map turns every value that is missing from the mapping into NaN, so
# check for that here instead of letting NaN targets surface later during fit.
label_mapping = {'white': 1, 'mal': 0}
encoded_labels = df['label'].map(label_mapping)
if encoded_labels.isna().any():
    unexpected_labels = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
    raise ValueError(
        f"Unexpected label values in {train_file_path}: {unexpected_labels}"
    )
df['label'] = encoded_labels

# Separate the features and the target variable
x_train = df['vba_code']
y_train = df['label']

# Convert x_train to a DataFrame
x_train_df = x_train.to_frame()

# Compute one feature dict per source string. Mapping over the Series avoids
# the row-wise DataFrame.apply(axis=1) round trip and yields the same dicts.
new_features = x_train.map(extract_features)

# Convert the result into a DataFrame, reusing the source index so the
# column-wise concat below aligns rows explicitly rather than by coincidence.
new_features_df = pd.DataFrame(new_features.tolist(), index=x_train.index)

# Combine original df with new features
x_train_combined = pd.concat([df, new_features_df], axis=1)

# The target must not be among the model inputs.
x_train_ready = x_train_combined.drop(['label'], axis=1)

print(type(x_train_ready))
print(x_train_ready.shape)
print(x_train_ready.columns)


<class 'pandas.core.frame.DataFrame'>
(31888, 7)
Index(['vba_code', 'avg_var_assignment_length', 'count_int_vars',
       'count_string_vars', 'macro_keywords', 'max_consecutive_math_ops',
       'one_char'],
      dtype='object')


In [4]:
# Hand-crafted numeric columns that are fed to the scaler.
numeric_features = [
    'avg_var_assignment_length',
    'count_int_vars',
    'count_string_vars',
    'macro_keywords',
    'max_consecutive_math_ops',
    'one_char',
]

# Scale the numeric columns and TF-IDF vectorize the raw text column.
# 'vba_code' is given as a bare string (not a list) for the text transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('text', TfidfVectorizer(), 'vba_code'),
])

# Preprocess -> select features with a depth-limited forest -> classify with
# a second forest. Both forests use a fixed random_state.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    (
        'feature_selection',
        SelectFromModel(
            estimator=RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
        ),
    ),
    ('classification', RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [5]:

# Fit the pipeline on the training data and persist it to 'train.pkl'.
try:
    pipeline.fit(x_train_ready, y_train)

    # Only reached when fitting succeeded.
    with open('train.pkl', 'wb') as file:
        pickle.dump(pipeline, file)

except ValueError as e:
    print("ValueError:", e)
    # Re-raise so the run stops here: otherwise later cells would continue
    # with an unfitted pipeline and whatever 'train.pkl' was already on disk.
    raise
except KeyError as e:
    print("KeyError:", e)
    raise

print(pipeline.named_steps['preprocessor'].transformers)

[('num', StandardScaler(), ['avg_var_assignment_length', 'count_int_vars', 'count_string_vars', 'macro_keywords', 'max_consecutive_math_ops', 'one_char']), ('text', TfidfVectorizer(), 'vba_code')]
