In [1]:
import sqlite3
import pandas as pd
from CONSTANTS import GLOBAL_CONNECTION

In [2]:
conn = GLOBAL_CONNECTION
cursor = conn.cursor()

In [46]:
# Correct the data definition and complete the CSV generation process
data = {
    "question": [
        "How many tracks are there in the database?",
        "What is the total revenue from all invoices?",
        "What is the name of the most popular genre by number of tracks?",
        "What is the highest amount ever billed to a single customer for an invoice?",
        "What is the name of the track that has generated the most revenue?",
        "Which customer has spent the most money in total?",
        "Which artist's tracks are the most purchased?",
        "Which employee has generated the most revenue from their customers?",
        "What is the name of the most popular playlist by number of tracks included?",
        "Which genre has generated the highest total revenue?",
        "What is the duration in minutes of the longest track recorded by the artist 'AC/DC'?",
        "Which employee's customers have the highest average invoice total?",
        "Who is the customer that has purchased tracks from the highest number of different genres?",
        "On which day of the week does the store earn the most revenue on average?",
        "Which track appears in the most playlists?",
        "Which composer's tracks have generated the most revenue?"
    ],
    "sql_query": [
        "SELECT COUNT(*) FROM Track;",
        "SELECT SUM(Total) FROM Invoice;",
        "SELECT Name FROM Genre WHERE GenreId = (SELECT GenreId FROM Track GROUP BY GenreId ORDER BY COUNT(*) DESC LIMIT 1);",
        "SELECT MAX(Total) FROM Invoice;",
        "SELECT T.Name FROM Track T JOIN InvoiceLine IL ON T.TrackId = IL.TrackId GROUP BY T.TrackId ORDER BY SUM(IL.UnitPrice * IL.Quantity) DESC LIMIT 1;",
        "SELECT FirstName|| ' ' ||LastName FROM Customer WHERE CustomerId = (SELECT CustomerId FROM Invoice GROUP BY CustomerId ORDER BY SUM(Total) DESC LIMIT 1);",
        "SELECT A.Name FROM Artist A JOIN Album Al ON A.ArtistId = Al.ArtistId JOIN Track T ON Al.AlbumId = T.AlbumId JOIN InvoiceLine IL ON T.TrackId = IL.TrackId GROUP BY A.ArtistId ORDER BY COUNT(*) DESC LIMIT 1;",
        "SELECT E.FirstName|| ' ' ||E.LastName FROM Employee E JOIN Customer C ON E.EmployeeId = C.SupportRepId JOIN Invoice I ON C.CustomerId = I.CustomerId GROUP BY E.EmployeeId ORDER BY SUM(I.Total) DESC LIMIT 1;",
        "SELECT P.Name FROM Playlist P JOIN PlaylistTrack PT ON P.PlaylistId = PT.PlaylistId GROUP BY P.PlaylistId ORDER BY COUNT(*) DESC LIMIT 1;",
        "SELECT G.Name FROM Genre G JOIN Track T ON G.GenreId = T.GenreId JOIN InvoiceLine IL ON T.TrackId = IL.TrackId GROUP BY G.GenreId ORDER BY SUM(IL.UnitPrice * IL.Quantity) DESC LIMIT 1;",
        "SELECT MAX(T.Milliseconds) / 60000 AS LongestTrackMinutes FROM Track T JOIN Album Al ON T.AlbumId = Al.AlbumId JOIN Artist A ON Al.ArtistId = A.ArtistId WHERE A.Name = 'AC/DC';",
        "SELECT E.FirstName|| ' ' || E.LastName FROM Employee E JOIN Customer C ON E.EmployeeId = C.SupportRepId JOIN Invoice I ON C.CustomerId = I.CustomerId GROUP BY E.EmployeeId ORDER BY AVG(I.Total) DESC LIMIT 1;",
        "SELECT FirstName|| ' ' ||LastName FROM Customer WHERE CustomerId = ( SELECT C.CustomerId FROM Customer C JOIN Invoice I ON C.CustomerId = I.CustomerId JOIN InvoiceLine IL ON I.InvoiceId = IL.InvoiceId JOIN Track T ON IL.TrackId = T.TrackId JOIN Genre G ON T.GenreId = G.GenreId GROUP BY C.CustomerId ORDER BY COUNT(DISTINCT G.GenreId) DESC LIMIT 1 );",
        "SELECT strftime('%w', InvoiceDate) AS DayOfWeek, AVG(Total) AS AvgRevenue FROM Invoice GROUP BY DayOfWeek ORDER BY AvgRevenue DESC LIMIT 1;",
        "SELECT T.Name FROM Track T JOIN PlaylistTrack PT ON T.TrackId = PT.TrackId GROUP BY T.TrackId ORDER BY COUNT(*) DESC LIMIT 1;",
        "SELECT Composer FROM Track JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId WHERE Composer IS NOT NULL GROUP BY Composer ORDER BY SUM(InvoiceLine.UnitPrice * InvoiceLine.Quantity) DESC LIMIT 1;"
    ],
    "tables_needed": [
        "Track",
        "Invoice",
        "Genre, Track",
        "Invoice",
        "Track, InvoiceLine",
        "Customer, Invoice",
        "Artist, Album, Track, InvoiceLine",
        "Employee, Customer, Invoice",
        "Playlist, PlaylistTrack",
        "Genre, Track, InvoiceLine",
        "Track, Album, Artist",
        "Employee, Customer, Invoice",
        "Customer, Invoice, InvoiceLine, Track, Genre",
        "Invoice",
        "Track, PlaylistTrack",
        "Track, InvoiceLine"
    ],
    "sql_query_soft_eval": [
        "SELECT COUNT(*) FROM Track;",
        "SELECT SUM(Total) FROM Invoice;",
        "SELECT Genre.Name, COUNT(*) AS TrackCount FROM Genre JOIN Track ON Genre.GenreId = Track.GenreId GROUP BY Genre.GenreId ORDER BY TrackCount DESC LIMIT 1;",
        "SELECT MAX(Total) AS HighestTotal, InvoiceId FROM Invoice GROUP BY CustomerId ORDER BY HighestTotal DESC LIMIT 1;",
        "SELECT Track.Name, SUM(InvoiceLine.UnitPrice * InvoiceLine.Quantity) AS TotalRevenue FROM Track JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId GROUP BY Track.TrackId ORDER BY TotalRevenue DESC LIMIT 1;",
        "SELECT Customer.FirstName || ' ' || Customer.LastName AS FullName, SUM(Total) AS TotalSpent FROM Customer JOIN Invoice ON Customer.CustomerId = Invoice.CustomerId GROUP BY Customer.CustomerId ORDER BY TotalSpent DESC LIMIT 1;",
        "SELECT Artist.Name, COUNT(*) AS TotalPurchases FROM Artist JOIN Album ON Artist.ArtistId = Album.ArtistId JOIN Track ON Album.AlbumId = Track.AlbumId JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId GROUP BY Artist.ArtistId ORDER BY TotalPurchases DESC LIMIT 1;",
        "SELECT Customer.FirstName || ' ' || Customer.LastName AS FullName, SUM(Invoice.Total) AS TotalRevenue FROM Employee JOIN Customer ON Employee.EmployeeId = Customer.SupportRepId JOIN Invoice ON Customer.CustomerId = Invoice.CustomerId GROUP BY Employee.EmployeeId ORDER BY TotalRevenue DESC LIMIT 1;",
        "SELECT Playlist.Name, COUNT(*) AS TrackCount FROM Playlist JOIN PlaylistTrack ON Playlist.PlaylistId = PlaylistTrack.PlaylistId GROUP BY Playlist.PlaylistId ORDER BY TrackCount DESC LIMIT 1;",
        "SELECT Genre.Name, SUM(InvoiceLine.UnitPrice * InvoiceLine.Quantity) AS TotalRevenue FROM Genre JOIN Track ON Genre.GenreId = Track.GenreId JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId GROUP BY Genre.GenreId ORDER BY TotalRevenue DESC LIMIT 1;",
        "SELECT Track.Name, MAX(Track.Milliseconds) / 60000 AS DurationMinutes FROM Track JOIN Album ON Track.AlbumId = Album.AlbumId JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'AC/DC' GROUP BY Artist.ArtistId ORDER BY DurationMinutes DESC LIMIT 1;",
        "SELECT Employee.FirstName || ' ' || Employee.LastName AS FullName, AVG(Invoice.Total) AS AverageInvoiceTotal FROM Employee JOIN Customer ON Employee.EmployeeId = Customer.SupportRepId JOIN Invoice ON Customer.CustomerId = Invoice.CustomerId GROUP BY Employee.EmployeeId ORDER BY AverageInvoiceTotal DESC LIMIT 1;",
        "SELECT Customer.FirstName || ' ' || Customer.LastName AS FullName, COUNT(DISTINCT Genre.GenreId) AS GenreCount FROM Customer JOIN Invoice ON Customer.CustomerId = Invoice.CustomerId JOIN InvoiceLine ON Invoice.InvoiceId = InvoiceLine.InvoiceId JOIN Track ON InvoiceLine.TrackId = Track.TrackId JOIN Genre ON Track.GenreId = Genre.GenreId GROUP BY Customer.CustomerId ORDER BY GenreCount DESC LIMIT 1;",
        "SELECT strftime('%w', InvoiceDate) AS DayOfWeek, AVG(Total) AS AverageRevenue FROM Invoice GROUP BY DayOfWeek ORDER BY AverageRevenue DESC LIMIT 1;",
        "SELECT Track.Name, COUNT(*) AS PlaylistCount FROM Track JOIN PlaylistTrack ON Track.TrackId = PlaylistTrack.TrackId GROUP BY Track.TrackId ORDER BY PlaylistCount DESC LIMIT 1;",
        "SELECT Composer, SUM(InvoiceLine.UnitPrice * InvoiceLine.Quantity) AS TotalRevenue FROM Track JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId WHERE Composer IS NOT NULL GROUP BY Composer ORDER BY TotalRevenue DESC LIMIT 1;"
    ],
    "answer_soft_eval": [
        "There are 3,503 tracks in the database.",
        "Total revenue from all invoices is approximately $2,328.60.",
        "The most popular genre is Rock with 1,297 tracks.",
        "The highest invoice total is $25.86 for invoice ID 404.",
        "The track 'The Woman King' generated the most revenue at $3.98.",
        "Helena Holý spent the most money, totaling approximately $49.62.",
        "Iron Maiden's tracks were purchased 140 times.",
        "Fynn Zimmermann generated the most revenue at $833.04.",
        "The 'Music' playlist contains 3,290 tracks.",
        "Rock generated the highest total revenue at approximately $826.65.",
        "The track 'Overdose' by AC/DC has a duration of 6 minutes.",
        "Steve Johnson's customers have the highest average invoice total at approximately $5.72.",
        "Luis Rojas purchased tracks from 12 different genres.",
        "Wednesday is the most lucrative day, averaging about $5.91 per invoice.",
        "The track 'Intoitus: Adorate Deum' appears in 5 playlists.",
        "Tracks composed by Steve Harris generated revenue totaling $57.42."
    ]
}

In [50]:
ans = []
for query in data['sql_query']:
    answer = cursor.execute(query).fetchall()[0][0]
    ans.append(answer)
print(ans)
data['answer'] = ans

[3503, 2328.600000000004, 'Rock', 25.86, 'The Woman King', 'Helena Holý', 'Iron Maiden', 'Jane Peacock', 'Music', 'Rock', 6, 'Steve Johnson', 'Luis Rojas', '4', 'Intoitus: Adorate Deum', 'Steve Harris']


In [51]:
detailed_ans = []
for query in data['sql_query_soft_eval']:
    answer = cursor.execute(query).fetchall()[0]
    detailed_ans.append(answer)
print(detailed_ans)
data['detailed_answer'] = detailed_ans

[(3503,), (2328.600000000004,), ('Rock', 1297), (25.86, 404), ('The Woman King', 3.98), ('Helena Holý', 49.620000000000005), ('Iron Maiden', 140), ('Fynn Zimmermann', 833.0400000000016), ('Music', 3290), ('Rock', 826.6500000000061), ('Overdose', 6), ('Steve Johnson', 5.715555555555564), ('Luis Rojas', 12), ('4', 5.911525423728816), ('Intoitus: Adorate Deum', 5), ('Steve Harris', 57.42000000000003)]


In [52]:
df = pd.DataFrame(data)
df

Unnamed: 0,question,sql_query,tables_needed,sql_query_soft_eval,answer_soft_eval,answer,detailed_answer
0,How many tracks are there in the database?,SELECT COUNT(*) FROM Track;,Track,SELECT COUNT(*) FROM Track;,"There are 3,503 tracks in the database.",3503,"(3503,)"
1,What is the total revenue from all invoices?,SELECT SUM(Total) FROM Invoice;,Invoice,SELECT SUM(Total) FROM Invoice;,Total revenue from all invoices is approximate...,2328.6,"(2328.600000000004,)"
2,What is the name of the most popular genre by ...,SELECT Name FROM Genre WHERE GenreId = (SELECT...,"Genre, Track","SELECT Genre.Name, COUNT(*) AS TrackCount FROM...","The most popular genre is Rock with 1,297 tracks.",Rock,"(Rock, 1297)"
3,What is the highest amount ever billed to a si...,SELECT MAX(Total) FROM Invoice;,Invoice,"SELECT MAX(Total) AS HighestTotal, InvoiceId F...",The highest invoice total is $25.86 for invoic...,25.86,"(25.86, 404)"
4,What is the name of the track that has generat...,SELECT T.Name FROM Track T JOIN InvoiceLine IL...,"Track, InvoiceLine","SELECT Track.Name, SUM(InvoiceLine.UnitPrice *...",The track 'The Woman King' generated the most ...,The Woman King,"(The Woman King, 3.98)"
5,Which customer has spent the most money in total?,SELECT FirstName|| ' ' ||LastName FROM Custome...,"Customer, Invoice",SELECT Customer.FirstName || ' ' || Customer.L...,"Helena Holý spent the most money, totaling app...",Helena Holý,"(Helena Holý, 49.620000000000005)"
6,Which artist's tracks are the most purchased?,SELECT A.Name FROM Artist A JOIN Album Al ON A...,"Artist, Album, Track, InvoiceLine","SELECT Artist.Name, COUNT(*) AS TotalPurchases...",Iron Maiden's tracks were purchased 140 times.,Iron Maiden,"(Iron Maiden, 140)"
7,Which employee has generated the most revenue ...,SELECT E.FirstName|| ' ' ||E.LastName FROM Emp...,"Employee, Customer, Invoice",SELECT Customer.FirstName || ' ' || Customer.L...,Fynn Zimmermann generated the most revenue at ...,Jane Peacock,"(Fynn Zimmermann, 833.0400000000016)"
8,What is the name of the most popular playlist ...,SELECT P.Name FROM Playlist P JOIN PlaylistTra...,"Playlist, PlaylistTrack","SELECT Playlist.Name, COUNT(*) AS TrackCount F...","The 'Music' playlist contains 3,290 tracks.",Music,"(Music, 3290)"
9,Which genre has generated the highest total re...,SELECT G.Name FROM Genre G JOIN Track T ON G.G...,"Genre, Track, InvoiceLine","SELECT Genre.Name, SUM(InvoiceLine.UnitPrice *...",Rock generated the highest total revenue at ap...,Rock,"(Rock, 826.6500000000061)"


In [53]:
df.to_csv("../data/evaluation_dataset.csv", index=False)