# SVM and Random Forest Classification of Function Calls from Text
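This notebook trains a linear SVM and a Random Forest on TF-IDF features (scikit-learn) to predict which function a natural-language request should call (e.g. `ZoomOut`, `AddWMS`). Both models are evaluated on a held-out test set and then exported to ONNX.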

In [9]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline

# Helper that reads a JSON-lines file into parallel text / label lists
def load_data(filepath):
    """Read a JSON-lines file and return ``(texts, labels)``.

    Each non-blank line is parsed as a JSON object. The value under
    ``'input'`` becomes the text, and the part of ``'output'`` that precedes
    the first ``'('`` becomes the label (the whole string when there is no
    parenthesis).

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(texts, labels)`` of two lists of equal length, in file
        order.
    """
    pairs = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Whitespace-only lines carry no record.
            if raw_line.strip():
                record = json.loads(raw_line)
                text = record['input']
                # Keep only what comes before the first '(' of the call string.
                name, _, _ = record['output'].partition('(')
                pairs.append((text, name))
    texts = [text for text, _ in pairs]
    labels = [name for _, name in pairs]
    return texts, labels

# Read both splits; load_data returns a (texts, labels) pair for each file.
train_split = load_data('../../data/train_data.txt')
test_split = load_data('../../data/test_data.txt')
X_train, y_train = train_split
X_test, y_test = test_split


In [10]:
# SVM pipeline: TF-IDF features feeding a linear SVM.
# random_state is pinned (same seed as the Random Forest cell) so that any
# solver randomness is reproducible across Restart & Run All.
svm_pipeline = make_pipeline(
    TfidfVectorizer(),
    LinearSVC(random_state=42)
)
svm_pipeline.fit(X_train, y_train)
# Predicted function names for the held-out test inputs.
y_pred_svm = svm_pipeline.predict(X_test)

In [11]:
# Random Forest pipeline: TF-IDF features feeding a seeded 100-tree forest.
rf_vectorizer = TfidfVectorizer()
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# fit() returns the fitted pipeline itself, so the call can be chained.
rf_pipeline = make_pipeline(rf_vectorizer, rf_classifier).fit(X_train, y_train)
# Predicted function names for the held-out test inputs.
y_pred_rf = rf_pipeline.predict(X_test)

In [12]:
# View a few predictions from each model.
# Slicing + zip stops at the shortest sequence, so this prints up to five
# rows and does not raise IndexError when the test set is smaller than that.
for text, svm_label, rf_label, true_label in zip(
    X_test[:5], y_pred_svm[:5], y_pred_rf[:5], y_test[:5]
):
    print(f"Input: {text}")
    print(f"SVM Prediction: {svm_label}, RF Prediction: {rf_label}, True: {true_label}")
    print("-" * 40)

Input: Decrease the zoom by 6 levels.
SVM Prediction: ZoomOut, RF Prediction: ZoomOut, True: ZoomOut
----------------------------------------
Input: Move map view to 51.3595, 25.5972!
SVM Prediction: Move, RF Prediction: Move, True: Move
----------------------------------------
Input: Load the polyline vector using polyline_rivers_Asia_rf9.shp
SVM Prediction: AddVector, RF Prediction: AddVector, True: AddVector
----------------------------------------
Input: Connect to WMS using https://tiles.io/geoserver/wms!
SVM Prediction: AddWMS, RF Prediction: AddWMS, True: AddWMS
----------------------------------------
Input: I prefer the OpenMuseumMap layer for cultural sites.
SVM Prediction: AddLayer, RF Prediction: AddLayer, True: AddLayer
----------------------------------------


In [15]:
# Test-set metrics for the SVM pipeline. accuracy_score and
# classification_report come from the notebook's top import cell, so they
# are not re-imported here.
print("SVM Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, digits=3))

SVM Evaluation:
Accuracy: 1.0
              precision    recall  f1-score   support

    AddLayer      1.000     1.000     1.000        32
   AddMarker      1.000     1.000     1.000        27
   AddVector      1.000     1.000     1.000        39
      AddWMS      1.000     1.000     1.000        34
 Cartography      1.000     1.000     1.000        24
        Draw      1.000     1.000     1.000        29
        Move      1.000     1.000     1.000        42
MoveToExtent      1.000     1.000     1.000        51
      ZoomIn      1.000     1.000     1.000        41
     ZoomOut      1.000     1.000     1.000        35

    accuracy                          1.000       354
   macro avg      1.000     1.000     1.000       354
weighted avg      1.000     1.000     1.000       354



In [16]:
# Test-set metrics for the Random Forest pipeline: overall accuracy first,
# then the per-class precision / recall / F1 table with three decimals.
print("Random Forest Evaluation:")
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", rf_accuracy)
rf_report = classification_report(y_test, y_pred_rf, digits=3)
print(rf_report)

Random Forest Evaluation:
Accuracy: 0.9774011299435028
              precision    recall  f1-score   support

    AddLayer      0.865     1.000     0.928        32
   AddMarker      1.000     1.000     1.000        27
   AddVector      1.000     1.000     1.000        39
      AddWMS      1.000     1.000     1.000        34
 Cartography      1.000     1.000     1.000        24
        Draw      1.000     1.000     1.000        29
        Move      0.925     0.881     0.902        42
MoveToExtent      1.000     0.941     0.970        51
      ZoomIn      1.000     1.000     1.000        41
     ZoomOut      1.000     1.000     1.000        35

    accuracy                          0.977       354
   macro avg      0.979     0.982     0.980       354
weighted avg      0.979     0.977     0.978       354



In [17]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

# Both pipelines start with a TfidfVectorizer, so the exported graphs take a
# string tensor named 'input' with shape [None, 1].
initial_type = [('input', StringTensorType([None, 1]))]

# --- SVM: convert, then write the serialized model to disk ---
onnx_model = convert_sklearn(
    svm_pipeline,
    initial_types=initial_type,
    target_opset=12,
)
with open("svm_function_classifier.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

# --- Random Forest: same conversion, plus the zipmap option ---
# zipmap=False is meant to disable the dict-style output so the model
# returns simple arrays.
rf_options = {id(rf_pipeline): {'zipmap': False}}
onnx_model = convert_sklearn(
    rf_pipeline,
    initial_types=initial_type,
    options=rf_options,
    target_opset=12,
)
with open("rf_function_classifier.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())


In [18]:
import os

# On-disk size of each exported ONNX file, in bytes.
svm_bytes = os.path.getsize("svm_function_classifier.onnx")
rf_bytes = os.path.getsize("rf_function_classifier.onnx")

# Convert to KB (1 KB = 1024 bytes) for reporting.
svm_size = svm_bytes / 1024
rf_size = rf_bytes / 1024

print(f"SVM model file size: {svm_size:.2f} KB")
print(f"Random Forest model file size: {rf_size:.2f} KB")

SVM model file size: 148.39 KB
Random Forest model file size: 4116.88 KB
