In [2]:
import pandas as pd
import numpy as np

In [5]:
query_logs = pd.read_csv("synthetic_query_logs.csv")


In [7]:
query_logs.head(5)


Unnamed: 0,query_id,user_name,execution_time,bytes_scanned,rows_produced,query_text,efficiency_score,complexity_score
0,1,user_c,4.775598,264.219506,812.012261,JOIN,3.073249,2
1,2,user_a,1.10492,5169.721277,868.401595,SELECT,0.167978,1
2,3,user_c,16.493767,2342.308174,914.108147,AGGREGATE,0.39026,1
3,4,user_c,14.43029,6487.210625,516.228975,JOIN,0.079576,2
4,5,user_a,14.851136,1826.227647,506.501132,AGGREGATE,0.277348,1


Normalizing features

In [8]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the data
query_logs = pd.read_csv("synthetic_query_logs.csv")

# Select the columns to normalize
features_to_normalize = ['execution_time', 'bytes_scanned', 'complexity_score']

# Initialize the scaler
scaler = StandardScaler()

# Normalize the features
query_logs[features_to_normalize] = scaler.fit_transform(query_logs[features_to_normalize])

# View the normalized data
print(query_logs.head())


   query_id user_name  execution_time  bytes_scanned  rows_produced  \
0         1    user_c       -0.971688      -1.739819     812.012261   
1         2    user_a       -1.644859      -0.059589     868.401595   
2         3    user_c        1.177324      -1.028033     914.108147   
3         4    user_c        0.798900       0.391677     516.228975   
4         5    user_a        0.876079      -1.204801     506.501132   

  query_text  efficiency_score  complexity_score  
0       JOIN          3.073249          1.644294  
1     SELECT          0.167978         -0.608164  
2  AGGREGATE          0.390260         -0.608164  
3       JOIN          0.079576          1.644294  
4  AGGREGATE          0.277348         -0.608164  


## Building the AI Model
* Label Data for Suboptimal Queries

* Defining of suboptimal queries as those with:
execution_time > 10 seconds
efficiency_score < 0.1

In [13]:
query_logs['label'] = ((query_logs['execution_time'] > 10) | (query_logs['efficiency_score'] < 0.1)).astype(int)

Training ML Model

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Define features and target
X = query_logs[['execution_time', 'bytes_scanned', 'complexity_score']]
y = query_logs['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.83      0.83        12
           1       0.75      0.75      0.75         8

    accuracy                           0.80        20
   macro avg       0.79      0.79      0.79        20
weighted avg       0.80      0.80      0.80        20



What does it mean?

Precision tells you how many of the predicted positives were actually positive.
Recall tells you how many of the actual positives were correctly identified.
F1-Score gives a balanced measure considering both precision and recall.
Accuracy measures overall correct predictions.
Macro avg and Weighted avg provide averages for all classes, with the weighted average considering class imbalances.


In this case, the model has slightly better performance for class 0 (precision and recall of 0.83) compared to class 1 (precision and recall of 0.75), which is reflected in the overall accuracy and averages.