In [None]:
import glob
import json
import os
import re

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

In [437]:
# Run configuration: where errors are logged and the minimum cosine
# similarity a prompt must reach to be dispatched to a handler.
LOG_FILE = "Output.json"
threshold = 0.4

# The input JSON supplies the dataset location, the target column and the
# numbered prompts ("prompt_1", "prompt_2", ...) read by later cells.
with open("sampleinput.json") as config_file:
    data = json.load(config_file)

dataset_path, target_column = data["path"], data["target"]


In [438]:
def log_error(e, log_file=None):
    """Append one error record to the JSON-lines error log.

    Parameters
    ----------
    e : Exception or str
        The error to record; it is stored as ``str(e)`` under the
        ``"error"`` key.
    log_file : str, optional
        Path of the file to append to. Defaults to the module-level
        ``LOG_FILE``.

    Notes
    -----
    Logging stays best-effort: an ordinary failure while writing the record
    is reported on stdout instead of being raised, so a broken log file does
    not stop the pipeline. ``KeyboardInterrupt`` and ``SystemExit`` are no
    longer swallowed.
    """
    try:
        # Serialise before opening the file so a failing str(e) cannot leave
        # a half-written line behind.
        record = json.dumps({"error": str(e)})
        target = LOG_FILE if log_file is None else log_file
        with open(target, "a") as f:
            f.write(record + "\n")
    except Exception as write_error:
        print(f"Could not write to error log: {write_error}")

In [439]:
# Load the raw dataset. A failure is recorded in the error log and then
# re-raised: every later cell needs `Dataset`, so continuing would only
# surface as an unrelated NameError further down.
try:
    Dataset = pd.read_csv(dataset_path)
except Exception as e:
    log_error(e)
    raise

In [440]:
# Snapshot of the column names as loaded; the prompt handlers match prompt
# text against this list.
columns = Dataset.columns.to_list()

In [441]:
# Count the consecutively numbered prompts ("prompt_1", "prompt_2", ...)
# actually present in the input, instead of assuming every key other than
# two is a prompt. Extra or missing keys can then no longer make the
# dispatch loop look up a prompt that does not exist.
numprompt = 0
while f"prompt_{numprompt + 1}" in data:
    numprompt += 1


In [442]:
# Sentence-embedding model used to compare prompts with the intent labels.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [443]:
# Natural-language descriptions of the supported preprocessing intents.
# Each string is embedded and compared with the user's prompt, and the same
# strings are the keys of `intent_function_mapping`, so the two must stay
# in sync character for character.
candidate_labels = [
    "drop columns from dataset",
    "fill missing values with mean imputation",
    "fill missing values with median imputation",
    "fill missing values with mode imputation",
    "remove duplicate rows",
    "convert or change data types of columns",
    "standardize numeric columns using z-score scaling",
    "normalize numeric columns to range 0 to 1",
    "encode categorical columns using label encoding",
    "reduce dataset dimensions with PCA",
    "filter dataset rows based on conditions"
]



In [444]:
# Embed the intent labels once; `classifier` reuses these embeddings whenever
# it is called with the same labels.
label_embs = embedder.encode(candidate_labels, convert_to_tensor=True)
# Copy of the labels the cached embeddings were computed from, so a caller
# passing a different list is detected instead of being scored against the
# wrong embeddings.
_label_embs_source = list(candidate_labels)


def classifier(user_input, candidate_labels):
    """Rank ``candidate_labels`` by cosine similarity to ``user_input``.

    Parameters
    ----------
    user_input : str
        The prompt to classify.
    candidate_labels : sequence of str
        Labels to score. When they equal the labels embedded at module
        level, the cached ``label_embs`` are reused; otherwise the given
        labels are embedded for this call.

    Returns
    -------
    dict
        ``{"sequence": user_input, "labels": [...], "scores": [...]}`` with
        labels and scores sorted from most to least similar. Both lists are
        empty when no labels are given.
    """
    labels = list(candidate_labels)
    if not labels:
        return {"sequence": user_input, "labels": [], "scores": []}

    if labels == _label_embs_source:
        embeddings = label_embs
    else:
        embeddings = embedder.encode(labels, convert_to_tensor=True)

    input_emb = embedder.encode(user_input, convert_to_tensor=True)
    scores = util.cos_sim(input_emb, embeddings)[0].tolist()

    # Stable sort: labels with equal scores keep their original order.
    ranked = sorted(zip(labels, scores), key=lambda pair: pair[1], reverse=True)

    return {
        "sequence": user_input,
        "labels": [label for label, _ in ranked],
        "scores": [score for _, score in ranked],
    }


In [445]:

def extract_columns_from_text(user_input, available_columns=None):
    """Return the column names that are mentioned in ``user_input``.

    Matching is case-insensitive and requires the name to appear as a whole
    token: it must not be directly preceded or followed by a letter, digit
    or underscore. This avoids false hits such as column ``id`` matching the
    word "width".

    Parameters
    ----------
    user_input : str
        The prompt text to scan.
    available_columns : iterable, optional
        Column names to look for. Defaults to the module-level ``columns``
        list.

    Returns
    -------
    list
        The matching entries of ``available_columns``, in their original
        order and with their original spelling.
    """
    if available_columns is None:
        available_columns = columns

    lower_input = user_input.lower()
    matched = []
    for col in available_columns:
        # str() tolerates non-string labels; strip() lets names that carry
        # stray surrounding spaces still match.
        name = str(col).strip().lower()
        if not name:
            # An empty name would match every prompt.
            continue
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        if re.search(pattern, lower_input):
            matched.append(col)
    return matched

def drop_columns(user_input):
    """Drop the columns named in ``user_input`` from ``Dataset`` in place.

    The target column is never dropped. Names that are no longer present in
    ``Dataset`` (for example because an earlier prompt already dropped them)
    are skipped, so they cannot abort the removal of the remaining columns.
    Any exception is written to the error log instead of being raised.
    """
    global Dataset, target_column
    try:
        requested = extract_columns_from_text(user_input)
        droppable = [
            c for c in requested
            if c != target_column and c in Dataset.columns
        ]
        if droppable:
            Dataset.drop(columns=droppable, inplace=True)
    except Exception as e:
        log_error(e)

def _fill_missing_numeric(user_input, statistic):
    """Fill missing values with a per-column statistic ("mean" or "median").

    Columns named in ``user_input`` are used when there are any; otherwise
    every numeric column except the target is used. Names no longer present
    in ``Dataset`` are skipped, and non-numeric columns are left untouched
    because the statistic is computed with ``numeric_only=True``.
    """
    cols = extract_columns_from_text(user_input)
    if not cols:
        cols = Dataset.select_dtypes(include="number").columns.tolist()
        if target_column in cols:
            cols.remove(target_column)
    cols = [c for c in cols if c in Dataset.columns]
    if not cols:
        return

    # numeric_only=True keeps one non-numeric column from raising and thereby
    # cancelling the imputation of every other column.
    fill_values = getattr(Dataset[cols], statistic)(numeric_only=True)
    Dataset[cols] = Dataset[cols].fillna(fill_values)


def fill_missing_mean(user_input):
    """Impute missing values with each column's mean.

    Uses the columns named in ``user_input``, or all numeric non-target
    columns when none are named. Exceptions are written to the error log.
    """
    try:
        _fill_missing_numeric(user_input, "mean")
    except Exception as e:
        log_error(e)


def fill_missing_median(user_input):
    """Impute missing values with each column's median.

    Uses the columns named in ``user_input``, or all numeric non-target
    columns when none are named. Exceptions are written to the error log.
    """
    try:
        _fill_missing_numeric(user_input, "median")
    except Exception as e:
        log_error(e)

def fill_missing_mode(user_input):
    """Impute missing values with each column's most frequent value.

    Uses the columns named in ``user_input``; when none are named, every
    object/category/string column except the target is used. Names no longer
    present in ``Dataset`` are skipped, as are columns with no non-missing
    value (they have no mode). Exceptions are written to the error log.
    """
    global Dataset, target_column
    try:
        cols = extract_columns_from_text(user_input)
        if not cols:
            cols = Dataset.select_dtypes(include=["object", "category", "string"]).columns.tolist()
            if target_column in cols:
                cols.remove(target_column)
        cols = [c for c in cols if c in Dataset.columns]

        for col in cols:
            modes = Dataset[col].mode()
            if modes.empty:
                # All values missing: nothing to fill with.
                continue
            # Assign the result back rather than calling fillna(inplace=True)
            # on Dataset[col]: the chained in-place form acts on a temporary
            # object and does not update the frame under Copy-on-Write.
            Dataset[col] = Dataset[col].fillna(modes.iloc[0])
    except Exception as e:
        log_error(e)

def remove_duplicates(user_input):
    """Remove fully duplicated rows from ``Dataset`` in place.

    ``user_input`` is accepted so the function has the same signature as the
    other prompt handlers; it is not used. The first occurrence of each
    duplicated row is kept. Exceptions are written to the error log.
    """
    global Dataset
    try:
        Dataset.drop_duplicates(subset=None, keep="first", inplace=True)
    except Exception as e:
        log_error(e)

def fix_data_types(user_input):
    """Convert column dtypes as requested by phrases like "<column> to int".

    Recognised targets are ``int``, ``float``, ``str`` and ``date`` /
    ``datetime``. Matching is case-insensitive on both the prompt and the
    column name. Columns that are no longer present in ``Dataset`` are
    ignored. Each conversion is attempted independently: a failure is
    written to the error log and the remaining conversions still run.
    """
    global Dataset
    conversions = {}
    try:
        user_lower = user_input.lower()
        for col in columns:
            if col not in Dataset.columns:
                continue
            # The prompt is lower-cased, so the column name must be too;
            # otherwise names with capital letters can never match.
            name = str(col).lower()
            if f"{name} to int" in user_lower:
                conversions[col] = int
            elif f"{name} to float" in user_lower:
                conversions[col] = float
            elif f"{name} to str" in user_lower:
                conversions[col] = str
            elif f"{name} to date" in user_lower:
                # "to date" is a prefix of "to datetime", so both match here.
                conversions[col] = "datetime"
    except Exception as e:
        log_error(e)
        return

    for col, dtype in conversions.items():
        try:
            if dtype == "datetime":
                # Unparseable values become NaT instead of raising.
                Dataset[col] = pd.to_datetime(Dataset[col], errors="coerce")
            else:
                Dataset[col] = Dataset[col].astype(dtype)
        except Exception as e:
            log_error(e)

def standardize_columns(user_input):
    """Rescale columns of ``Dataset`` with ``StandardScaler``.

    The columns named in ``user_input`` are scaled; when none are named,
    every numeric column except the target is scaled. Exceptions are written
    to the error log.
    """
    global Dataset, target_column
    try:
        selected = extract_columns_from_text(user_input)
        if len(selected) == 0:
            selected = Dataset.select_dtypes(include="number").columns.tolist()
            if target_column in selected:
                selected.remove(target_column)
        Dataset[selected] = StandardScaler().fit_transform(Dataset[selected])
    except Exception as e:
        log_error(e)

def normalize_columns(user_input):
    """Rescale columns of ``Dataset`` with ``MinMaxScaler``.

    The columns named in ``user_input`` are scaled; when none are named,
    every numeric column except the target is scaled. Exceptions are written
    to the error log.
    """
    global Dataset, target_column
    try:
        selected = extract_columns_from_text(user_input)
        if len(selected) == 0:
            selected = Dataset.select_dtypes(include="number").columns.tolist()
            if target_column in selected:
                selected.remove(target_column)
        Dataset[selected] = MinMaxScaler().fit_transform(Dataset[selected])
    except Exception as e:
        log_error(e)

def encode_categorical(user_input):
    """Label-encode columns of ``Dataset``.

    The columns named in ``user_input`` are encoded; when none are named,
    every object/category/string column except the target is encoded. Names
    not present in ``Dataset`` are skipped. Values are cast to ``str`` before
    a fresh ``LabelEncoder`` is fitted per column. Exceptions are written to
    the error log.
    """
    global Dataset, target_column
    try:
        selected = extract_columns_from_text(user_input)
        if len(selected) == 0:
            categorical = Dataset.select_dtypes(include=["object", "category", "string"]).columns
            selected = categorical.tolist()
            if target_column in selected:
                selected.remove(target_column)
        for name in selected:
            if name not in Dataset.columns:
                continue
            Dataset[name] = LabelEncoder().fit_transform(Dataset[name].astype(str))
    except Exception as e:
        log_error(e)

def reduce_dimensions(user_input):
    """Add two PCA component columns (``PCA_1``, ``PCA_2``) to ``Dataset``.

    PCA is fitted on the numeric columns named in ``user_input``; when none
    of the named columns are numeric (or none are named), all numeric columns
    except the target are used. The source columns are kept. Exceptions are
    written to the error log.
    """
    global Dataset, target_column
    try:
        mentioned = extract_columns_from_text(user_input)
        numeric_index = Dataset.select_dtypes(include="number").columns
        chosen = [name for name in mentioned if name in numeric_index]

        if len(chosen) == 0:
            chosen = Dataset.select_dtypes(include="number").columns.tolist()
            if target_column in chosen:
                chosen.remove(target_column)

        n_components = 2
        projected = PCA(n_components=n_components).fit_transform(Dataset[chosen])

        for position in range(n_components):
            Dataset[f"PCA_{position + 1}"] = projected[:, position]
    except Exception as e:
        log_error(e)

def filter_rows(user_input):
    """Keep only the rows of ``Dataset`` that satisfy ``user_input``.

    ``user_input`` is passed unchanged to ``DataFrame.query``. If the query
    fails, the error is written to the error log and ``Dataset`` is left
    untouched.

    The global ``Dataset`` is rebound to the filtered frame in one step. The
    rows are not deleted first and rebuilt column by column, so a failure can
    no longer leave ``Dataset`` emptied or half-populated.
    """
    global Dataset
    try:
        # NOTE(security): `query` evaluates the text as an expression. The
        # prompt comes straight from the input file, so only run this on
        # input you trust.
        filtered_df = Dataset.query(user_input)
    except Exception as e:
        log_error(e)
        return
    Dataset = filtered_df


In [446]:
# Dispatch table: intent label (as listed in `candidate_labels`) -> handler
# that applies the corresponding preprocessing step to `Dataset`.
intent_function_mapping = dict([
    ("drop columns from dataset", drop_columns),
    ("fill missing values with mean imputation", fill_missing_mean),
    ("fill missing values with median imputation", fill_missing_median),
    ("fill missing values with mode imputation", fill_missing_mode),
    ("remove duplicate rows", remove_duplicates),
    ("convert or change data types of columns", fix_data_types),
    ("standardize numeric columns using z-score scaling", standardize_columns),
    ("normalize numeric columns to range 0 to 1", normalize_columns),
    ("encode categorical columns using label encoding", encode_categorical),
    ("reduce dataset dimensions with PCA", reduce_dimensions),
    ("filter dataset rows based on conditions", filter_rows),
])


In [447]:
# Classify each numbered prompt and run the best-matching handler. Prompts
# whose best similarity score is below `threshold` are logged and skipped.
for prompt_number in range(1, numprompt + 1):
    user_input = data[f"prompt_{prompt_number}"]
    result = classifier(user_input, candidate_labels)
    action = result["labels"][0]
    if result["scores"][0] < threshold:
        log_error(f"Unrecognized prompt: {user_input}")
        continue
    intent_function_mapping[action](user_input)



In [None]:

# Replace the target column with one indicator column per class, named
# "<target>_<class>", appended after the feature columns.
target_dummies = pd.get_dummies(Dataset[target_column], prefix=target_column)
features_only = Dataset.drop(columns=[target_column])
Dataset = pd.concat([features_only, target_dummies], axis=1)

In [451]:
# Preview the first rows of the processed frame.
Dataset.head(n=5)

Unnamed: 0,frame.time,frame.len,frame.protocols,eth.src,eth.dst,ip.dst,ip.src,ip.flags,ip.ttl,ip.proto,...,tcp.checksum,tcp.options,tcp.pdu.size,udp.srcport,udp.dstport,label_Benign,label_Ingress Tool Transfer,label_TCP Scan,label_Telnet Brute Force,label_Unknown
0,0,67,4,1,3,1,6,1,64,17,...,34084,37750,311.646226,48322.0,53.0,True,False,False,False,False
1,1,83,4,3,1,6,1,1,61,17,...,34084,37750,311.646226,53.0,48322.0,True,False,False,False,False
2,2,90,5,1,3,2,6,1,64,17,...,34084,37750,311.646226,46343.0,123.0,True,False,False,False,False
3,3,90,5,3,1,6,2,1,61,17,...,34084,37750,311.646226,123.0,46343.0,True,False,False,False,False
4,4,76,4,1,3,1,6,1,64,17,...,34084,37750,311.646226,36848.0,53.0,True,False,False,False,False


In [452]:
# Write the processed frame to disk without the index column.
PROCESSED_DATASET_PATH = "Processed_Dataset.csv"
Dataset.to_csv(PROCESSED_DATASET_PATH, index=False)

In [None]:
# Read the error log back and attach the path of the processed dataset.
#
# `log_error` appends one JSON object per line, so the file is a single JSON
# document only when exactly one error was logged. It may also be missing
# (nothing was logged) or hold several lines; both cases made a plain
# `json.load` raise.
try:
    with open(LOG_FILE, "r") as f:
        raw_log = f.read()
except FileNotFoundError:
    raw_log = ""

try:
    # Single logged error: same dict as before, e.g. {"error": "..."}.
    data = json.loads(raw_log)
except json.JSONDecodeError:
    # Zero or several records: parse line by line and collect the messages.
    entries = [json.loads(line) for line in raw_log.split("\n") if line.strip()]
    data = {"errors": [entry.get("error") for entry in entries]}

data["Processed_dataset_path"]="Processed_Dataset.csv"
# NOTE(review): nothing visible here writes `data` back to disk -- confirm
# whether a later cell is expected to persist it.