# Import Library

In [2]:
import pandas as pd
import sqlite3
import requests
import io

conn = sqlite3.connect("./data/chinook.db")

In [326]:
country = pd.read_sql_query(
    """
    SELECT customers.Country FROM customers
    """, conn)
country["Country"] = pd.DataFrame(country["Country"].unique())
country.dropna()

Unnamed: 0,Country
0,Brazil
1,Germany
2,Canada
3,Norway
4,Czech Republic
5,Austria
6,Belgium
7,Denmark
8,USA
9,Portugal


In [3]:
chinook = pd.read_sql_query(
    """
    SELECT (C.FirstName||' '||C.LastName) as FullName , I.BillingCountry as Country, I.InvoiceDate, I.Total
    FROM customers as C
    LEFT JOIN invoices as I
    ON C.CustomerId = I.CustomerId
    """, conn, parse_dates="InvoiceDate")

chinook.pivot_table(index="FullName", values="Total", aggfunc="sum").sort_values("Total", ascending=False).head()

Unnamed: 0_level_0,Total
FullName,Unnamed: 1_level_1
Helena Holý,49.62
Richard Cunningham,47.62
Luis Rojas,46.62
Ladislav Kovács,45.62
Hugh O'Reilly,45.62


In [334]:
albums = pd.read_sql_query(
        """
            SELECT 
            albums.AlbumId, albums.Title as Album, artists.Name as Artist, tracks.Composer, genres.Name as Genre, tracks.UnitPrice
            FROM tracks
            LEFT JOIN albums ON albums.AlbumId = tracks.AlbumId
            LEFT JOIN artists ON artists.ArtistId = albums.AlbumId
            LEFT JOIN genres ON genres.GenreId = tracks.GenreId
        """
        , conn, index_col="AlbumId")
albums.drop_duplicates(subset=["Album"], keep="first")

Unnamed: 0_level_0,Album,Artist,Composer,Genre,UnitPrice
AlbumId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,For Those About To Rock We Salute You,AC/DC,"Angus Young, Malcolm Young, Brian Johnson",Rock,0.99
2,Balls to the Wall,Accept,,Rock,0.99
3,Restless and Wild,Aerosmith,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",Rock,0.99
4,Let There Be Rock,Alanis Morissette,AC/DC,Rock,0.99
5,Big Ones,Alice In Chains,"Steven Tyler, Joe Perry, Jack Blades, Tommy Shaw",Rock,0.99
...,...,...,...,...,...
343,Respighi:Pines of Rome,,,Classical,0.99
344,Schubert: The Late String Quartets & String Qu...,,Franz Schubert,Classical,0.99
345,Monteverdi: L'Orfeo,,Claudio Monteverdi,Classical,0.99
346,Mozart: Chamber Music,,Wolfgang Amadeus Mozart,Classical,0.99


## Fetch Song Tracks and Convert to CSV file

- [X] Take data from joining minimum of 4 table

In [4]:
tracks = pd.read_sql_query(
 '''
 SELECT 
 tracks.TrackId, tracks.Name as Song, artists.Name as Artist, albums.Title as Album, tracks.Composer, genres.Name as Genre, tracks.UnitPrice
 FROM tracks
 LEFT JOIN albums ON albums.AlbumId = tracks.AlbumId
 LEFT JOIN artists ON artists.ArtistId = albums.AlbumId
 LEFT JOIN genres ON genres.GenreId = tracks.GenreId
 ''', conn)

tracks.to_csv("./data/tracks.csv", index=False)
tracks.head()

Unnamed: 0,TrackId,Song,Artist,Album,Composer,Genre,UnitPrice
0,1,For Those About To Rock (We Salute You),AC/DC,For Those About To Rock We Salute You,"Angus Young, Malcolm Young, Brian Johnson",Rock,0.99
1,2,Balls to the Wall,Accept,Balls to the Wall,,Rock,0.99
2,3,Fast As a Shark,Aerosmith,Restless and Wild,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",Rock,0.99
3,4,Restless and Wild,Aerosmith,Restless and Wild,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...",Rock,0.99
4,5,Princess of the Dawn,Aerosmith,Restless and Wild,Deaffy & R.A. Smith-Diesel,Rock,0.99


## Top 5 Country

- [X] Datetime operation
- [X] Categorical operation
- [X] Frequencies analysis
- [ ] Missing Value and Duplicates operation

In [5]:
customers = pd.read_sql_query(
    """
    SELECT (C.FirstName||' '||C.LastName) as FullName , I.BillingCountry as Country, I.InvoiceDate, I.Total
    FROM customers as C
    LEFT JOIN invoices as I
    ON C.CustomerId = I.CustomerId
    """, conn, parse_dates="InvoiceDate")

# customers.pivot_table(index="FullName", values="Total", aggfunc="sum").sort_values("Total", ascending=False).head()
customers



Unnamed: 0,FullName,Country,InvoiceDate,Total
0,Luís Gonçalves,Brazil,2010-03-11,3.98
1,Luís Gonçalves,Brazil,2010-06-13,3.96
2,Luís Gonçalves,Brazil,2010-09-15,5.94
3,Luís Gonçalves,Brazil,2011-05-06,0.99
4,Luís Gonçalves,Brazil,2012-10-27,1.98
...,...,...,...,...
407,Puja Srivastava,India,2009-07-08,5.94
408,Puja Srivastava,India,2010-02-26,1.99
409,Puja Srivastava,India,2011-08-20,1.98
410,Puja Srivastava,India,2011-09-30,13.86


In [6]:
top5 = customers.groupby('Country').Total.sum().sort_values(ascending=False).head().index.to_list()
top5_data = customers[customers['Country'].isin(top5)].copy()

dayorder = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


In [7]:
top5_data['InvoiceDOW'] = top5_data['InvoiceDate'].dt.day_name()
top5_data['InvoiceDOW'] = pd.Categorical(top5_data['InvoiceDOW'],
                                         categories=dayorder,
                                         ordered=True)
top5_data

Unnamed: 0,FullName,Country,InvoiceDate,Total,InvoiceDOW
0,Luís Gonçalves,Brazil,2010-03-11,3.98,Thursday
1,Luís Gonçalves,Brazil,2010-06-13,3.96,Sunday
2,Luís Gonçalves,Brazil,2010-09-15,5.94,Wednesday
3,Luís Gonçalves,Brazil,2011-05-06,0.99,Friday
4,Luís Gonçalves,Brazil,2012-10-27,1.98,Saturday
...,...,...,...,...,...
296,Isabelle Mercier,France,2010-07-15,5.94,Thursday
297,Isabelle Mercier,France,2011-03-05,0.99,Saturday
298,Isabelle Mercier,France,2012-08-26,1.98,Sunday
299,Isabelle Mercier,France,2012-10-06,16.86,Saturday


In [8]:
albums = pd.read_sql_query(
    """
     SELECT 
     albums.AlbumId, albums.Title as Album, artists.Name as Artist, tracks.Composer, genres.Name as Genre, tracks.UnitPrice
     FROM tracks
     LEFT JOIN albums ON albums.AlbumId = tracks.AlbumId
     LEFT JOIN artists ON artists.ArtistId = albums.AlbumId
     LEFT JOIN genres ON genres.GenreId = tracks.GenreId
    """, conn)

albums[albums["Artist"] != "None"]

Unnamed: 0,AlbumId,Album,Artist,Composer,Genre,UnitPrice
0,1,For Those About To Rock We Salute You,AC/DC,"Angus Young, Malcolm Young, Brian Johnson",Rock,0.99
1,2,Balls to the Wall,Accept,,Rock,0.99
2,3,Restless and Wild,Aerosmith,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",Rock,0.99
3,3,Restless and Wild,Aerosmith,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...",Rock,0.99
4,3,Restless and Wild,Aerosmith,Deaffy & R.A. Smith-Diesel,Rock,0.99
...,...,...,...,...,...,...
3498,343,Respighi:Pines of Rome,,,Classical,0.99
3499,344,Schubert: The Late String Quartets & String Qu...,,Franz Schubert,Classical,0.99
3500,345,Monteverdi: L'Orfeo,,Claudio Monteverdi,Classical,0.99
3501,346,Mozart: Chamber Music,,Wolfgang Amadeus Mozart,Classical,0.99


In [305]:
top_albums = pd.read_sql_query(
        """
            SELECT 
            albums.AlbumId, albums.Title as Album, artists.Name as Artist, tracks.Composer, invoices.BillingCountry as Country,
            genres.Name as Genre, tracks.UnitPrice, invoices.Total
            FROM tracks
            LEFT JOIN albums ON albums.AlbumId = tracks.AlbumId
            LEFT JOIN artists ON artists.ArtistId = albums.AlbumId
            LEFT JOIN genres ON genres.GenreId = tracks.GenreId
            LEFT JOIN invoice_items ON invoice_items.InvoiceLineId = tracks.TrackId
            LEFT JOIN invoices ON invoices.InvoiceId = invoice_items.InvoiceLineId
        """, conn)
# is_null = top_albums.loc[:,["Total"]].isnull().fillna(0)
# is_null
# top_albums[is_null].fillna(0, inplace=True)
# top_albums
# top_albums.loc[:,["Total"]] = top_albums.loc[:,["Total"]].isnull().fillna(value="fsdf")
top_albums[["Country", "Genre"]] = top_albums[["Country", "Genre"]].astype("category", errors="raise")
top_albums["Country"].str.lower()
# top_albums
# top_albums

0       germany
1        norway
2       belgium
3        canada
4           usa
         ...   
3498        NaN
3499        NaN
3500        NaN
3501        NaN
3502        NaN
Name: Country, Length: 3503, dtype: object

In [313]:
top_albums = top_albums.groupby(["Country", "Album"])[["Total"]].agg("count").sort_values("Total", ascending=False).reset_index([0,1])
# ct.loc[["USA"],:]
# top_albums.melt(id_vars=["Country", "Album"], value_vars="Total", value_name="Total").drop(columns="variable")
# ct
# ct[ct["Country"] == "USA"]
# top_albums.reset_index(level="Country")
top_albums[top_albums["Country"] == "Argentina"]

Unnamed: 0,Country,Album,Total
0,Argentina,...And Justice For All,1
521,Argentina,20th Century Masters - The Millennium Collecti...,1
5217,Argentina,[1997] Black Light Syndrome,1
5218,Argentina,Zooropa,1
5219,Argentina,Worlds,1
...,...,...,...
5720,Argentina,Live On Two Legs [Live],1
5721,Argentina,Live At Donington 1992 (Disc 2),1
5722,Argentina,Live At Donington 1992 (Disc 1),1
5723,Argentina,Live After Death,1


In [87]:
invoice_total = pd.read_sql_query(
                """
                    SELECT InvoiceId, InvoiceDate, (customers.FirstName||' '||customers.LastName) as CustomerName, Country, City, Total
                    FROM invoices
                    LEFT JOIN customers ON customers.CustomerID = invoices.CustomerID
                """, conn)
invoice_total['InvoiceDate'] = pd.to_datetime(invoice_total['InvoiceDate'])
year_order = ["2009", "2010", "2011", "2012", "2013"]
invoice_total['Year'] = invoice_total['InvoiceDate'].dt.year

In [88]:

invoice_total['Year'] = pd.Categorical(invoice_total['Year'], categories=year_order)
invoice_total['Year'] = invoice_total['InvoiceDate'].dt.year
invoice_total = invoice_total.groupby(by=["InvoiceDate", "CustomerName", "Country", "Year"])["Total"].agg("sum").reset_index()

invoice_total.head()

Unnamed: 0,InvoiceDate,CustomerName,Country,Year,Total
0,2009-01-01,Leonie Köhler,Germany,2009,1.98
1,2009-01-02,Bjørn Hansen,Norway,2009,3.96
2,2009-01-03,Daan Peeters,Belgium,2009,5.94
3,2009-01-06,Mark Philips,Canada,2009,8.91
4,2009-01-11,John Gordon,USA,2009,13.86


In [86]:
mask_year = invoice_total["Year"] == 2010
inv_year = invoice_total[mask_year].melt(id_vars=["Year", "CustomerName"], value_vars="Total", value_name="Total").drop(columns="variable")
inv_year

Unnamed: 0,Year,CustomerName,Total
0,2010,Isabelle Mercier,1.98
1,2010,Ladislav Kovács,1.98
2,2010,Lucas Mancini,3.96
3,2010,Joakim Johansson,6.94
4,2010,Luis Rojas,17.91
...,...,...,...
78,2010,Johannes Van der Berg,1.98
79,2010,Emma Jones,3.96
80,2010,Diego Gutiérrez,5.94
81,2010,François Tremblay,8.91


In [10]:
# invoice_total['Year'].unique()

In [44]:

top_customer = pd.read_sql_query(
                """
                   SELECT
                   artists.ArtistId, artists.Name as ArtistName, albums.AlbumId, albums.Title as albumTitle,
                   customers.CustomerId
                   COUNT(customers.CustomerId) AS TotalInvoice
                   FROM invoice_items
                   LEFT JOIN invoices on invoices.InvoiceId = invoice_items.InvoiceId
                   LEFT JOIN customers on customers.CustomerId = invoices.CustomerId
                   LEFT JOIN tracks on tracks.TrackId = invoice_items.TrackId
                   LEFT JOIN albums on albums.AlbumId = tracks.AlbumId
                   LEFT JOIN artists on artists.ArtistId = albums.ArtistId
                   GROUP BY artists.ArtistId
                   ORDER BY TotalInvoice DESC
               """, conn )
# top_customer.head()

DatabaseError: Execution failed on sql '
                   SELECT
                   artists.ArtistId, artists.Name as ArtistName, albums.AlbumId, albums.Title as albumTitle,
                   customers.CustomerId
                   COUNT(customers.CustomerId) AS TotalInvoice
                   FROM invoice_items
                   LEFT JOIN invoices on invoices.InvoiceId = invoice_items.InvoiceId
                   LEFT JOIN customers on customers.CustomerId = invoices.CustomerId
                   LEFT JOIN tracks on tracks.TrackId = invoice_items.TrackId
                   LEFT JOIN albums on albums.AlbumId = tracks.AlbumId
                   LEFT JOIN artists on artists.ArtistId = albums.ArtistId
                   GROUP BY artists.ArtistId
                   ORDER BY TotalInvoice DESC
               ': near "(": syntax error

In [11]:
# book_url = 'https://algo-capstone.herokuapp.com/data/get/books_c.csv'
# s = requests.get(book_url)
# s_df = pd.DataFrame(s.json())
# s_df

In [12]:
pd.read_csv('./data/pulsar_stars.csv').head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [13]:
# pd.read_csv('https://algo-capstone.herokuapp.com/data/get/equal/books_c.csv/isbn/0439785960').head()

In [94]:
# heroku_url = 'https://algo-capstone.herokuapp.com/data/get/books_c.csv'
# theurl = 'https://algoritma-api-capstone.herokuapp.com/'
# r = requests.get(theurl)
# r_pd = pd.DataFrame(r.json())
# r_pd.head()


url2 = 'https://algoritma-api-capstone.herokuapp.com/invoice/total/2010'
r = requests.get(url2)
r_pd = pd.DataFrame(r.json())
r_pd.head()

Unnamed: 0,InvoiceDate,Country,City,Total,year
83,1262908800000,France,Dijon,1.98,2010
84,1262908800000,Hungary,Budapest,1.98,2010
85,1262995200000,Italy,Rome,3.96,2010
86,1263081600000,Sweden,Stockholm,6.94,2010
87,1263340800000,Chile,Santiago,17.91,2010
