In [None]:
# Mount Google Drive at /content/drive. The final cell of this notebook writes
# the pickled model to a path under this mount point, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import json

file_path = 'DiverseVul.VulnerableCode.json'

# Load the JSON dataset.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's locale encoding, which is not UTF-8 on every system and can
# fail or corrupt text on non-ASCII characters.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build a DataFrame from the parsed JSON content.
dv = pd.DataFrame(data)


In [2]:
# Number of rows in the DataFrame (reported below as entries / functions).
num_entries = len(dv)

# Frequency of each value in the 'target' column.
target_counts = dv['target'].value_counts()

# explode() expands list-like 'cwe' cells into one row per element, so every
# CWE identifier attached to an entry is counted individually.
cwe_counts = dv['cwe'].explode().value_counts()

# Report the size and the two distributions.
print("Numero totale di entry (funzioni):", num_entries)
print("Distribuzione delle classi target:")
print(target_counts)
print("\nDistribuzione delle vulnerabilità CWE:")
print(cwe_counts)


Numero totale di entry (funzioni): 330492
Distribuzione delle classi target:
target
0    311547
1     18945
Name: count, dtype: int64

Distribuzione delle vulnerabilità CWE:
cwe
CWE-787    39343
CWE-125    28180
CWE-703    25208
CWE-119    25036
CWE-20     22870
           ...  
CWE-672       11
CWE-91         9
CWE-565        8
CWE-825        8
CWE-805        2
Name: count, Length: 150, dtype: int64


In [None]:
# NOTE(review): `df` is only created in a later cell (`df = dv.copy()`), so this
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
# Since `df` is a plain copy of `dv`, both lines print the same text even when
# the cell is run afterwards. Presumably a leftover from a removed preprocessing
# step — confirm, then delete it or move it below the cell that builds `df`.
print("Before: ",dv['func'].iloc[0])
print("After:  ",df['func'].iloc[0])

In [4]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Work on a copy so the loaded frame `dv` itself is never modified.
df = dv.copy()

# Features are the 'func' column; labels are the 0/1 'target' column.
X, y = df['func'], df['target']

# Stratified 80/20 split of the original data into training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Build the oversampler; it is applied to the training partition only, so the
# test partition keeps its original class distribution.
ros = RandomOverSampler(random_state=42)

# The sampler is given a 2-D array, hence the single-column reshape.
X_train_2d = X_train.values.reshape(-1, 1)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_2d, y_train)

# Collapse the single-column array back into a 1-D pandas Series.
X_train_resampled = pd.Series(X_train_resampled.flatten())


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Two-step pipeline: TF-IDF turns the text into features, which are then fed
# to a 10-tree random forest with a fixed seed.
tfidf_step = ('tfidf', TfidfVectorizer())
rf_step = ('rf', RandomForestClassifier(n_estimators=10, random_state=42))

pipeline = Pipeline(steps=[tfidf_step, rf_step])

In [None]:
# Train on the oversampled training partition only.
pipeline.fit(X=X_train_resampled, y=y_train_resampled)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on the held-out test partition.
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
report = classification_report(y_test, y_pred)
print(report)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of test-set predictions against the true labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Label the axes with the classes the fitted pipeline knows about.
disp = ConfusionMatrixDisplay(cm, display_labels=pipeline.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
import pickle
# Destination on the mounted Google Drive for the serialized model.
# Note: this rebinds `file_path`, which earlier held the dataset path.
file_path = "/content/drive/My Drive/random_forest_model_OVER.pkl"

# Serialize the whole pipeline object to the destination file.
with open(file_path, mode='wb') as file:
    pickle.dump(pipeline, file)