In [None]:
#Data Preparation: Preprocess the historical log of executed queries to serve as training
#data for the machine learning model. This may involve feature extraction, normalization,
#and splitting the data into training and test sets.
#Model Training: Select a suitable machine learning algorithm (e.g., LSTM,
#Transformer-based models for sequence prediction) to train a model capable of
#predicting future queries based on the historical data.
#Evaluation: Evaluate the model's performance using appropriate metrics (e.g., accuracy,
#precision, recall) on the test set. Adjust the model as necessary to improve performance.
#Prediction: Demonstrate the model's capability by using it to predict a set of upcoming
#queries. Analyze the predictions to understand the model's strengths and weaknesses.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
# Location of the historical query log (CSV); the path looks like a mounted
# Colab Drive folder -- adjust it when running elsewhere.
QUERY_LOG_CSV = '/content/drive/MyDrive/data/query_log.csv'

# Parse the log into the DataFrame the feature-extraction cell reads from.
df = pd.read_csv(QUERY_LOG_CSV)

In [6]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,query,query-type,execution_time,outcome,error_message
0,UPDATE productlines SET image = 'VBWJH616LC' W...,update,0.046847,True,
1,UPDATE customers SET postalCode = '2AX7KH5N4D'...,update,0.0,False,1292 (22007): Truncated incorrect DECIMAL valu...
2,"INSERT INTO customers (customerNumber, custome...",insert,0.0,False,1366 (HY000): Incorrect decimal value: 'Unknow...
3,"INSERT INTO products (productCode, productName...",insert,0.0,False,1366 (HY000): Incorrect integer value: 'Unknow...
4,"INSERT INTO offices (officeCode, city, phone, ...",insert,0.093958,True,


In [8]:
# Model inputs: the query text and its 'query-type' label.
# Selecting with a list of labels keeps the result two-dimensional (a DataFrame).
feature_columns = ['query', 'query-type']
X = df[feature_columns]

# Prediction target: the 'outcome' column, selected as a Series.
y = df['outcome']

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [10]:
# Fit a Keras Tokenizer (default settings) on the query strings, then map
# every query to its list of integer token ids.
query_texts = X['query']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(query_texts)
X_query_seq = tokenizer.texts_to_sequences(query_texts)



In [11]:
# Bring every token sequence to the length of the longest one, padding at
# the end ('post') so all rows share the same width.
max_length = max(map(len, X_query_seq))
X_query_padded = pad_sequences(
    X_query_seq,
    maxlen=max_length,
    padding='post',
)



In [13]:
# Integer-encode the categorical 'query-type' column, then shape the codes as
# a single column so they can be stacked next to the padded sequences.
label_encoder = LabelEncoder()
query_type_codes = label_encoder.fit_transform(X['query-type'])
X_query_type_encoded = query_type_codes.reshape(-1, 1)



In [15]:
import numpy as np
# Join the padded token ids and the query-type code column-wise into a single
# feature matrix. Adjust this depending on how you want to combine features.
X_processed = np.concatenate([X_query_padded, X_query_type_encoded], axis=1)


In [16]:
# Encode the target labels as integers with a dedicated encoder.
# Re-fitting the shared `label_encoder` here would overwrite the query-type
# mapping it learned earlier, leaving `label_encoder.classes_` and
# `label_encoder.inverse_transform` describing outcomes instead of query types.
outcome_encoder = LabelEncoder()
y_encoded = outcome_encoder.fit_transform(y)



In [17]:
# Train-test split

from sklearn.model_selection import train_test_split

# Fixed seed so a Restart & Run All reproduces the same partition.
SPLIT_SEED = 42

# Hold out 20% of the rows for evaluation. Stratifying on the encoded outcome
# keeps the success/failure ratio the same in both partitions, which matters
# when one outcome is much more frequent than the other.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y_encoded,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=y_encoded,
)


In [36]:
# Use the same `tensorflow.keras` namespace as the preprocessing imports;
# mixing it with the standalone `keras` package can pull in a different
# Keras build than the one the Tokenizer/pad_sequences helpers came from.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Training hyper-parameters (values unchanged).
# NOTE(review): a batch size of 1 makes each epoch very slow -- consider raising it.
BATCH_SIZE = 1
EPOCHS = 1
DECISION_THRESHOLD = 0.5

# The LSTM below declares input_shape=(timesteps, 1). Add the trailing feature
# axis and cast to float32 explicitly instead of relying on Keras to coerce
# the 2-D integer feature matrix implicitly.
X_train_seq = np.asarray(X_train, dtype='float32')[..., np.newaxis]
X_test_seq = np.asarray(X_test, dtype='float32')[..., np.newaxis]

# Build the LSTM model: two stacked LSTM layers followed by a small dense head.
# NOTE(review): token ids are fed as raw numeric magnitudes; an Embedding
# layer is the usual way to represent them -- confirm whether this is intended.
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train_seq.shape[1], 1)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model; binary_crossentropy matches the single sigmoid output.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_seq, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS)

# Make predictions: probabilities of shape (n_samples, 1), then thresholded
# into 0/1 labels (shape kept as in the original for downstream cells).
predictions = model.predict(X_test_seq)
predictions_binary = (predictions > DECISION_THRESHOLD).astype(int)

# Evaluate the model. The metrics expect 1-D label arrays, so flatten the
# column vector; zero_division=0 yields 0 (without a warning) when a metric's
# denominator is empty, e.g. the model predicts no positives.
predicted_labels = predictions_binary.ravel()
accuracy = accuracy_score(y_test, predicted_labels)
precision = precision_score(y_test, predicted_labels, zero_division=0)
recall = recall_score(y_test, predicted_labels, zero_division=0)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")


Accuracy: 0.9359920610308256, Precision: 0.9208042360524902, Recall: 1.0
