In [130]:
import glob
import json
import os
import re
import sys

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from transformers import pipeline

In [131]:
# Read the run configuration from disk. Besides the two fixed settings below,
# the same JSON object carries the numbered "prompt_<n>" entries used later.
with open("sampleinput.json") as f:
    data = json.load(f)

# Fixed settings: where the CSV lives and which column is the target.
dataset_path, target_column = data["path"], data["target"]

# File that log_error appends its JSON records to.
LOG_FILE = "Output.json"


In [132]:
def log_error(e, log_file=None):
    """Append one JSON record describing ``e`` to the error log.

    Parameters
    ----------
    e : Exception or str
        The error to record; it is stored as ``str(e)`` under the "error" key.
    log_file : str, optional
        Path of the log to append to. Defaults to the module-level LOG_FILE.

    Returns
    -------
    None

    Logging is best-effort: a failure to write never raises, but it is
    reported on stderr so the original error is not lost without a trace.
    """
    try:
        destination = LOG_FILE if log_file is None else log_file
        # Serialise first and write once so a record is never left half-written.
        record = json.dumps({"error": str(e)}) + "\n"
        with open(destination, "a", encoding="utf-8") as f:
            f.write(record)
    except Exception as write_err:
        print(
            f"log_error: could not record {e!r}: {write_err!r}",
            file=sys.stderr,
        )

In [133]:
# Load the CSV named in the configuration into the global working frame.
# A failed read is logged and then re-raised: every later cell needs Dataset,
# so continuing would only surface as an unrelated NameError further down.
try:
    Dataset = pd.read_csv(dataset_path)
except Exception as e:
    log_error(e)
    raise

In [134]:
# Snapshot of the column names as a plain list, taken right after loading.
columns = Dataset.columns.to_list()

In [135]:
# Count the prompts by probing for consecutive "prompt_1", "prompt_2", ... keys.
# This matches how the prompts are looked up by number and does not depend on
# how many other keys the configuration happens to contain.
numprompt = 0
while f"prompt_{numprompt + 1}" in data:
    numprompt += 1


In [136]:
# Zero-shot classifier used to map each free-text prompt onto one of the
# candidate intent labels.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)


Device set to use cpu


In [137]:
# Intent descriptions offered to the zero-shot classifier. The exact strings
# also serve as the keys of intent_function_mapping, so they must stay in sync.
candidate_labels = [
    # column removal
    "drop or remove columns from dataset",
    # missing-value imputation
    "fill missing values with mean",
    "fill missing values with median",
    "fill missing values with mode",
    # row-level clean-up
    "remove duplicate rows from dataset",
    # type conversion
    "convert or fix data types of columns",
    # scaling
    "standardize or scale numeric features",
    "normalize numeric features to range [0,1]",
    # encoding and dimensionality reduction
    "encode categorical features (one-hot or label)",
    "reduce dimensions using PCA or similar methods",
    # row filtering
    "filter rows based on conditions or criteria",
]


In [138]:

def extract_columns_from_text(user_input, available_columns=None):
    """Return the column names that are mentioned in ``user_input``.

    Matching is case-insensitive and requires the name to appear as a whole
    token: it must not be glued to surrounding word characters or be one
    segment of a longer dotted name. This stops a column such as "id" from
    matching inside "valid", or "ip" from matching inside "ip.src".

    Parameters
    ----------
    user_input : str
        Free-text instruction to scan.
    available_columns : iterable, optional
        Names to look for. Defaults to the current columns of the global
        Dataset, so columns dropped or added by earlier steps are respected.

    Returns
    -------
    list
        Matching names with their original spelling, in column order.
    """
    if available_columns is None:
        available_columns = Dataset.columns.tolist()

    lowered = user_input.lower()
    mentioned = []
    for col in available_columns:
        name = str(col).lower()
        if not name:
            continue
        # Left: not preceded by a word character or a dot.
        # Right: not followed by a word character, nor by ".<word char>"
        # (a sentence-ending period after the name is still accepted).
        pattern = r"(?<![\w.])" + re.escape(name) + r"(?!\w|\.\w)"
        if re.search(pattern, lowered):
            mentioned.append(col)
    return mentioned

def drop_columns(user_input):
    """Delete from the global Dataset the columns named in ``user_input``.

    The frame is modified in place. Any exception is passed to log_error
    rather than raised.
    """
    global Dataset
    try:
        to_remove = extract_columns_from_text(user_input)
        Dataset.drop(labels=to_remove, axis=1, inplace=True)
    except Exception as err:
        log_error(err)

def fill_missing_mean(user_input):
    """Replace missing values with the column mean for columns named in ``user_input``.

    Any exception is passed to log_error rather than raised.
    """
    global Dataset
    try:
        selected = extract_columns_from_text(user_input)
        subset = Dataset[selected]
        # fillna with a Series fills each column from the entry of the same name.
        Dataset[selected] = subset.fillna(subset.mean())
    except Exception as err:
        log_error(err)

def fill_missing_median(user_input):
    """Replace missing values with the column median for columns named in ``user_input``.

    Any exception is passed to log_error rather than raised.
    """
    global Dataset
    try:
        selected = extract_columns_from_text(user_input)
        subset = Dataset[selected]
        # fillna with a Series fills each column from the entry of the same name.
        Dataset[selected] = subset.fillna(subset.median())
    except Exception as err:
        log_error(err)

def fill_missing_mode(user_input):
    """Replace missing values with the most frequent value for columns named in ``user_input``.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used. Columns with no non-missing values have no
    mode and are left unchanged. Any exception is passed to log_error
    rather than raised.
    """
    global Dataset
    try:
        for col in extract_columns_from_text(user_input):
            modes = Dataset[col].mode()
            if modes.empty:
                # Entirely missing column: nothing to fill with, keep going.
                continue
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selected column: the in-place form acts on an
            # intermediate object and is not guaranteed to update Dataset.
            Dataset[col] = Dataset[col].fillna(modes.iloc[0])
    except Exception as e:
        log_error(e)

def remove_duplicates(user_input):
    """Drop repeated rows from the global Dataset, keeping the first occurrence.

    ``user_input`` is accepted so the function has the same signature as the
    other handlers; it is not read. Any exception is passed to log_error.
    """
    global Dataset
    try:
        # Spelled-out defaults: compare all columns, keep the first of each group.
        Dataset.drop_duplicates(subset=None, keep="first", inplace=True)
    except Exception as err:
        log_error(err)

def fix_data_types(user_input):
    """Convert columns to the types requested in ``user_input``.

    A request is recognised by the phrase "<column> to <type>", matched
    case-insensitively, where <type> starts with one of: int, float, str,
    date. "date" (which also covers "datetime") parses the column with
    ``pd.to_datetime`` and turns unparseable values into NaT.

    Each column is converted independently: a failed cast is passed to
    log_error and the remaining columns are still processed.
    """
    global Dataset
    # Checked in this order; the first phrase found for a column wins.
    conversions = (
        ("int", int),
        ("float", float),
        ("str", str),
        ("date", "datetime"),
    )
    try:
        user_lower = user_input.lower()
        # Use the live column list so columns dropped earlier are not attempted.
        for col in Dataset.columns.tolist():
            # Lower-case the name too; the text it is compared to is lower-cased.
            prefix = f"{str(col).lower()} to "
            requested = next(
                (dtype for word, dtype in conversions if prefix + word in user_lower),
                None,
            )
            if requested is None:
                continue
            try:
                if requested == "datetime":
                    Dataset[col] = pd.to_datetime(Dataset[col], errors="coerce")
                else:
                    Dataset[col] = Dataset[col].astype(requested)
            except Exception as e:
                log_error(e)
    except Exception as e:
        log_error(e)


def standardize_columns(user_input):
    """Standardize columns of the global Dataset with ``StandardScaler``.

    Columns named in ``user_input`` are scaled. When none are named, every
    numeric column except the configured ``target_column`` is scaled. If
    that leaves nothing to scale, a message is logged and the frame is left
    unchanged. Any exception is passed to log_error rather than raised.
    """
    global Dataset
    try:
        cols = extract_columns_from_text(user_input)
        if not cols:
            # Exclude the configured target, not a column literally named "target".
            cols = [
                col
                for col in Dataset.select_dtypes(include='number').columns
                if col != target_column
            ]
        if not cols:
            log_error("standardize_columns: no numeric columns to scale")
            return
        Dataset[cols] = StandardScaler().fit_transform(Dataset[cols])
    except Exception as e:
        log_error(e)


def normalize_columns(user_input):
    """Rescale columns of the global Dataset with ``MinMaxScaler``.

    Columns named in ``user_input`` are rescaled. When none are named, every
    numeric column except the configured ``target_column`` is rescaled. If
    that leaves nothing to rescale, a message is logged and the frame is
    left unchanged. Any exception is passed to log_error rather than raised.
    """
    global Dataset
    try:
        cols = extract_columns_from_text(user_input)
        if not cols:
            # Exclude the configured target, not a column literally named "target".
            cols = [
                col
                for col in Dataset.select_dtypes(include='number').columns
                if col != target_column
            ]
        if not cols:
            log_error("normalize_columns: no numeric columns to scale")
            return
        Dataset[cols] = MinMaxScaler().fit_transform(Dataset[cols])
    except Exception as e:
        log_error(e)


def encode_categorical(user_input):
    """Label-encode the columns of the global Dataset named in ``user_input``.

    Each column is fitted and transformed on its own, so the integer codes
    of one column are independent of the others. Any exception is passed to
    log_error rather than raised.
    """
    global Dataset
    try:
        encoder = LabelEncoder()
        for name in extract_columns_from_text(user_input):
            # fit_transform refits the encoder on this column's values.
            encoded = encoder.fit_transform(Dataset[name])
            Dataset[name] = encoded
    except Exception as err:
        log_error(err)

def reduce_dimensions(user_input, n_components=2):
    """Add PCA component columns to the global Dataset.

    PCA is fitted on the numeric columns named in ``user_input``. When none
    are named, or none of the named ones are numeric, every numeric column
    except the configured ``target_column`` is used. The components are
    stored as new columns "PCA_1", "PCA_2", ...; the input columns are kept.

    Parameters
    ----------
    user_input : str
        Free-text instruction, scanned for column names.
    n_components : int, default 2
        Number of components requested. It is capped at the number of
        selected columns and at the number of rows so that PCA can be fitted.

    Any exception is passed to log_error rather than raised.
    """
    global Dataset
    try:
        # Compute the numeric column set once rather than per candidate column.
        numeric_available = set(Dataset.select_dtypes(include='number').columns)
        numeric_cols = [
            col
            for col in extract_columns_from_text(user_input)
            if col in numeric_available
        ]

        if not numeric_cols:
            # Exclude the configured target, not a column literally named "target".
            numeric_cols = [
                col
                for col in Dataset.columns
                if col in numeric_available and col != target_column
            ]

        usable_components = min(n_components, len(numeric_cols), len(Dataset))
        if usable_components < 1:
            log_error("reduce_dimensions: no numeric data available for PCA")
            return

        pca = PCA(n_components=usable_components)
        reduced = pca.fit_transform(Dataset[numeric_cols])
        for i in range(usable_components):
            Dataset[f"PCA_{i+1}"] = reduced[:, i]
    except Exception as e:
        log_error(e)


def filter_rows(user_input):
    """Keep only the rows of the global Dataset selected by ``user_input``.

    ``user_input`` is passed unchanged to ``DataFrame.query``, so it has to
    be a valid query expression. The frame object is emptied and refilled in
    place rather than rebound. Any exception is passed to log_error.
    """
    global Dataset
    try:
        # NOTE(security): the prompt text is evaluated as an expression by
        # DataFrame.query; only run this on trusted configuration files.
        kept_rows = Dataset.query(user_input)
        # Remove every existing row, leaving the same frame object behind.
        Dataset.drop(index=Dataset.index, inplace=True)
        # NOTE(review): refilling appears to rely on pandas adopting the index
        # of the assigned Series when the frame has no rows; confirm before
        # changing the order of these statements.
        for name in kept_rows.columns:
            Dataset[name] = kept_rows[name]
    except Exception as err:
        log_error(err)


In [139]:
# Dispatch table: each intent label string routes to the handler that carries
# it out. Every handler takes the raw prompt text as its only argument.
intent_function_mapping = dict([
    ("drop or remove columns from dataset", drop_columns),
    ("fill missing values with mean", fill_missing_mean),
    ("fill missing values with median", fill_missing_median),
    ("fill missing values with mode", fill_missing_mode),
    ("remove duplicate rows from dataset", remove_duplicates),
    ("convert or fix data types of columns", fix_data_types),
    ("standardize or scale numeric features", standardize_columns),
    ("normalize numeric features to range [0,1]", normalize_columns),
    ("encode categorical features (one-hot or label)", encode_categorical),
    ("reduce dimensions using PCA or similar methods", reduce_dimensions),
    ("filter rows based on conditions or criteria", filter_rows),
])

In [140]:
# Classify each configured prompt and run the handler for its first-listed label.
for i in range(numprompt):
    # Prompts are stored under 1-based keys: prompt_1, prompt_2, ...
    user_input = data["prompt_" + str(i + 1)]
    result = classifier(user_input, candidate_labels)
    print(result)
    action = result["labels"][0]
    handler = intent_function_mapping.get(action)
    if handler is None:
        log_error(f"Unrecognized action: {action}")
    else:
        handler(user_input)



{'sequence': 'label all columns', 'labels': ['convert or fix data types of columns', 'standardize or scale numeric features', 'encode categorical features (one-hot or label)', 'filter rows based on conditions or criteria', 'fill missing values with mean', 'fill missing values with median', 'remove duplicate rows from dataset', 'fill missing values with mode', 'normalize numeric features to range [0,1]', 'drop or remove columns from dataset', 'reduce dimensions using PCA or similar methods'], 'scores': [0.3298444151878357, 0.2231069803237915, 0.17421609163284302, 0.06572872400283813, 0.0439617782831192, 0.04132398962974548, 0.03972584754228592, 0.036816686391830444, 0.024962954223155975, 0.012689097784459591, 0.007623337674885988]}


In [141]:
# Display the classifier output left over from the last prompt that was processed.
result

{'sequence': 'label all columns',
 'labels': ['convert or fix data types of columns',
  'standardize or scale numeric features',
  'encode categorical features (one-hot or label)',
  'filter rows based on conditions or criteria',
  'fill missing values with mean',
  'fill missing values with median',
  'remove duplicate rows from dataset',
  'fill missing values with mode',
  'normalize numeric features to range [0,1]',
  'drop or remove columns from dataset',
  'reduce dimensions using PCA or similar methods'],
 'scores': [0.3298444151878357,
  0.2231069803237915,
  0.17421609163284302,
  0.06572872400283813,
  0.0439617782831192,
  0.04132398962974548,
  0.03972584754228592,
  0.036816686391830444,
  0.024962954223155975,
  0.012689097784459591,
  0.007623337674885988]}

In [142]:
# Preview the first rows of Dataset as it stands after the prompt-driven steps.
Dataset.head()

Unnamed: 0,frame.time,frame.len,frame.protocols,eth.src,eth.dst,ip.dst,ip.src,ip.flags,ip.ttl,ip.proto,...,tcp.dstport,tcp.flags,tcp.window_size_value,tcp.window_size_scalefactor,tcp.checksum,tcp.options,tcp.pdu.size,udp.srcport,udp.dstport,label
0,"Jan 14, 2025 18:40:22.447710000 GMT",67,eth:ethertype:ip:udp:dns,02:42:52:d7:fa:00,0c:6e:9c:16:00:00,192.168.0.2,192.168.18.17,0x02,64,17,...,,,,,,,,48322.0,53.0,Benign
1,"Jan 14, 2025 18:40:22.453402000 GMT",83,eth:ethertype:ip:udp:dns,0c:6e:9c:16:00:00,02:42:52:d7:fa:00,192.168.18.17,192.168.0.2,0x02,61,17,...,,,,,,,,53.0,48322.0,Benign
2,"Jan 14, 2025 18:40:22.453507000 GMT",90,eth:ethertype:ip:udp:ntp,02:42:52:d7:fa:00,0c:6e:9c:16:00:00,192.168.0.3,192.168.18.17,0x02,64,17,...,,,,,,,,46343.0,123.0,Benign
3,"Jan 14, 2025 18:40:22.458119000 GMT",90,eth:ethertype:ip:udp:ntp,0c:6e:9c:16:00:00,02:42:52:d7:fa:00,192.168.18.17,192.168.0.3,0x02,61,17,...,,,,,,,,123.0,46343.0,Benign
4,"Jan 14, 2025 18:40:22.560013000 GMT",76,eth:ethertype:ip:udp:dns,02:42:52:d7:fa:00,0c:6e:9c:16:00:00,192.168.0.2,192.168.18.17,0x02,64,17,...,,,,,,,,36848.0,53.0,Benign
