In [5]:
import os
from urllib.parse import quote

import pandas as pd
from mysql.connector import connect
from sqlalchemy import create_engine

In [3]:
# Open a MySQL connection (no default schema selected); the user, password
# and host are all read from environment variables rather than hardcoded.
connection = connect(
    user=os.environ['mysqluser'],
    password=os.environ['mysqlpass'],
    host=os.environ['mysqlhost'],
)

In [4]:
# Sanity check: list the databases this MySQL account can see.
cursor = connection.cursor()
try:
    query = "SHOW DATABASES"
    cursor.execute(query)
    print(cursor.fetchall())
finally:
    # Always release the cursor, even when the statement raises, so it does
    # not stay open on the connection.
    cursor.close()

[('belajar_mysql',), ('information_schema',), ('mysql',), ('new_schema',), ('ntx-de',), ('performance_schema',), ('sys',), ('tinkerlust',), ('w3resource',)]


In [9]:
# Build the SQLAlchemy engine (PyMySQL driver) for the `ntx-de` schema.
#
# The user name and password are percent-encoded before being placed in the
# URI: characters such as '@', ':' or '/' would otherwise be read as URI
# delimiters and the connection string would be parsed incorrectly.
# NOTE(review): this assumes the environment variables hold the raw values,
# not values that were already percent-encoded -- confirm.
db_user = quote(os.environ['mysqluser'], safe='')
db_password = quote(os.environ['mysqlpass'], safe='')
engine_uri = f"mysql+pymysql://{db_user}:{db_password}@{os.environ['mysqlhost']}/ntx-de"

# From here on `connection` refers to this Engine, which is what the
# `pd.read_sql` calls below receive.
connection = create_engine(engine_uri)

In [10]:
# Load the `ecommerce` table into a DataFrame and preview its first rows.
query = "SELECT * FROM ecommerce "
df = pd.read_sql(sql=query, con=connection)
print(df.head(n=5))

         fullVisitorId channelGrouping     time        country  \
0  2515546493837534633  Organic Search   966564         Taiwan   
1  7313828956068851679        Referral   228279  United States   
2  6036794406403793540  Organic Search  1615618  United States   
3  7847280609739507227  Organic Search    37832         Canada   
4  1371078687799372953        Referral   478901  United States   

                            city totalTransactionRevenue transactions  \
0                      (not set)                                        
1                  San Francisco                                        
2                        Boulder                                        
3  not available in demo dataset                                        
4                      Ann Arbor                67570000            1   

   timeOnSite  pageviews sessionQualityDim  ...  itemQuantity  itemRevenue  \
0        1567         82                17  ...                              
1       

### Identify top products based on the total transaction revenue per day.

prompt : write me sql query to Identify top products based on the total transaction revenue per day. the column you need consider is  date, productSKU, totalTransactionRevenue 

response : 
SELECT date, productSKU, SUM(totalTransactionRevenue) AS TotalRevenue
FROM yourTableName
GROUP BY date, productSKU
ORDER BY date, TotalRevenue DESC;

prompt : why the result not show the top sum of totalTransactionRevenue per day/date?
answer : if you need to retrieve only the top products for each day based on the total revenue, you can use a query that ranks the products for each day. Depending on the database system you're using, the exact query might vary.

For instance, using window function for ranking (if supported), the query might look something like this for databases such as PostgreSQL or SQL Server:

SELECT date, productSKU, TotalRevenue
FROM (SELECT date, productSKU, SUM(totalTransactionRevenue) AS TotalRevenue,
           RANK() OVER(PARTITION BY date ORDER BY SUM(totalTransactionRevenue) DESC) AS revenueRank
    FROM ecommerce
    GROUP BY date, productSKU)
WHERE revenueRank = 1
ORDER BY date;


improvement :
1. Use a CTE instead of a subquery. This is a matter of style, but CTEs are generally considered to be more readable and maintainable.
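2. Replace RANK() with ROW_NUMBER(): when several products tie for the highest revenue on a date, RANK() gives all of them rank 1 and returns every tied row, whereas ROW_NUMBER() assigns distinct numbers so exactly one row per date is returned.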
3. Add a comment to the query to explain what it does.
4. Indentation and formatting: Proper indentation and formatting can make the query easier to read.

WITH RankedProducts AS (
    SELECT date, productSKU, SUM(totalTransactionRevenue) AS TotalRevenue,
           ROW_NUMBER() OVER(PARTITION BY date ORDER BY SUM(totalTransactionRevenue) DESC) AS revenueRank
    FROM ecommerce
    GROUP BY date, productSKU
)
SELECT date, productSKU, TotalRevenue
FROM RankedProducts
WHERE revenueRank = 1;

In [30]:
# Top product per day by summed totalTransactionRevenue.
#
# Two ordering guarantees are made explicit in the SQL:
#   * ROW_NUMBER() breaks revenue ties on productSKU, so the "winner" on a day
#     where several products share the same total (e.g. all zero) is
#     deterministic instead of arbitrary.
#   * The outer SELECT is ordered by date; without an ORDER BY the server may
#     return rows in any order, so head(10) would not reliably show the first
#     ten days.
query = ('''WITH RankedProducts AS (
        -- Calculate the total revenue for each product on a specific date and rank them
        SELECT 
            date, 
            productSKU, 
            SUM(totalTransactionRevenue) AS TotalRevenue,
            ROW_NUMBER() OVER(
                PARTITION BY date 
                ORDER BY SUM(totalTransactionRevenue) DESC, productSKU
            ) AS revenueRank
        FROM 
            ecommerce
        GROUP BY 
            date, productSKU
        )
        -- Select the product with the highest revenue for each date
        SELECT 
            date, 
            productSKU, 
            TotalRevenue
        FROM 
            RankedProducts
        WHERE 
            revenueRank = 1
        ORDER BY 
            date;''')
df = pd.read_sql(query, connection)
print(df.head(10))

       date      productSKU  TotalRevenue
0  20160801  GGOEGBJR018199  1.447900e+08
1  20160802  GGOEGBJR018199  1.912000e+08
2  20160803    GGOEGAAX0037  0.000000e+00
3  20160804  GGOEGBCR024399  1.138500e+08
4  20160805  GGOEGBJR018199  2.000600e+08
5  20160806  GGOEGBMC056599  1.036100e+09
6  20160807  GGOEGBJR018199  1.405900e+08
7  20160808  GGOEGBJC019999  1.840000e+08
8  20160809  GGOEGBJL013999  4.019200e+08
9  20160810  GGOEGBJL013999  6.575100e+08


### Detect any anomalies, such as a sharp decrease or increase in the number of transactions for a specific product.

### Identify the most profitable city or province based on the total transaction revenue.

prompt : write me sql query to Identify the most profitable city or province based on the total transaction revenue.

response : Certainly! To identify the most profitable city or province based on the total transaction revenue, you'd need to have a table containing transaction data with information such as city, province, and revenue. Here's an example SQL query assuming you have a table named transactions:

SELECT 
    city,
    province,
    SUM(revenue) AS total_revenue
FROM 
    transactions
GROUP BY 
    city, province
ORDER BY 
    total_revenue DESC
LIMIT 1;

However, the top result is a placeholder value rather than a real city:

Row	| city                         |   total_revenue

1	| not available in demo dataset |  240550358070000


So we need to exclude that placeholder value from the results.

improvement :
1. Filter out rows whose city is the placeholder value 'not available in demo dataset'.


In [34]:
# Five most profitable cities by summed totalTransactionRevenue.
#
# Rows whose city is a placeholder rather than a real city are excluded:
# 'not available in demo dataset' and '(not set)' both appear in the city
# column of the table preview. Ties on revenue are broken by city name so the
# ranking is stable between runs.
query = ('''
        SELECT 
            city,
            SUM(totalTransactionRevenue) AS total_revenue
        FROM 
            ecommerce
        WHERE 
            city NOT IN ('not available in demo dataset', '(not set)')
        GROUP BY 
            city
        ORDER BY 
            total_revenue DESC, city
        LIMIT 5;
''')
df = pd.read_sql(query, connection)
print(df)

            city  total_revenue
0       New York   1.690465e+10
1  San Francisco   6.650790e+09
2  Mountain View   5.506000e+09
3        Toronto   5.427980e+09
4        Chicago   4.539850e+09
