# Welcome to Dataskillet!

In [2]:
import os
import modin.pandas as pd

Let's download some CSV files.

In [3]:
# Create the directory the CSVs are downloaded into.
# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError when the directory is already there.
os.makedirs('testdrive_csvs', exist_ok=True)

In [6]:
!curl https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/master/google-play-store-apps/googleplaystore.csv -o testdrive_csvs/googleplaystore.csv
!curl https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/master/google-play-store-apps/googleplaystore_user_reviews.csv -o testdrive_csvs/googleplaystore_user_reviews.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1328k  100 1328k    0     0  3432k      0 --:--:-- --:--:-- --:--:-- 3432k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7489k  100 7489k    0     0  12.3M      0 --:--:-- --:--:-- --:--:-- 12.3M


# The dataset contains Google Play apps and user reviews of those apps

In [7]:
# Load the apps CSV downloaded above (pd is modin.pandas in this notebook)
# and preview the first rows.
googleplaystore = pd.read_csv('testdrive_csvs/googleplaystore.csv')
googleplaystore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [8]:
# Load the user-reviews CSV downloaded above and preview the first rows.
# Some review rows are empty apart from the app name (see row 2 in the saved output).
googleplaystore_user_reviews = pd.read_csv('testdrive_csvs/googleplaystore_user_reviews.csv')
googleplaystore_user_reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


# Creating the DataSource

A `DataSource` is analogous to a database: it stores information about tables and accepts queries against them.

On creation, we need to specify a directory to store table metadata for the DataSource.

In [9]:
from dataskillet import DataSource

In [10]:
# Directory in which the DataSource stores its table metadata.
metadata_dir = 'testdrive_metadata'
# exist_ok=True keeps this cell re-runnable (os.mkdir raises FileExistsError on
# a second run). Note that an already-existing directory keeps whatever table
# metadata was stored in it earlier.
os.makedirs(metadata_dir, exist_ok=True)

In [11]:
# Create the DataSource, pointing it at the directory that holds its table metadata.
ds = DataSource(metadata_dir=metadata_dir)

The metadata dir now stores information about tables. However, we currently have no tables. Let's create some.

In [13]:
# List the tables known to the DataSource; the saved output has only the
# name/fpath header because nothing has been registered yet.
ds.query('SHOW TABLES')



Unnamed: 0,name,fpath


# Creating tables

A `Table` is an abstraction over a dataframe. It loads the dataframe on demand. On creation it sets up some simple preprocessing steps. These steps are created once and stored in the metadata, so they are always applied the same way whenever the dataframe is loaded.

In [19]:
# Register the apps CSV as a table (it is listed as `googleplaystore` by SHOW TABLES).
# A plain string is used: the original f-string had no placeholders.
# NOTE(review): the saved output for this cell is an "already exists" exception
# from a later re-run (In [19]); on a fresh metadata dir it presumably returns
# 'OK' like the next cell — confirm by re-running from a clean state.
ds.query('CREATE TABLE ("testdrive_csvs/googleplaystore.csv")')

Exception: Table googleplaystore already exists in data source, use DROP TABLE to remove it if you want to recreate it.

In [15]:
# Register the user-reviews CSV as a table; the saved output shows 'OK' on success.
# A plain string is used: the original f-string had no placeholders.
ds.query('CREATE TABLE ("testdrive_csvs/googleplaystore_user_reviews.csv")')

'OK'

In [16]:
# Both tables are now listed, each with the path of the CSV it was created from.
ds.query('SHOW TABLES')



Unnamed: 0,name,fpath
0,googleplaystore,testdrive_csvs/googleplaystore.csv
1,googleplaystore_user_reviews,testdrive_csvs/googleplaystore_user_reviews.csv


Now that we have some tables, the information about them is stored in metadata. 

If we recreate the datasource using the same `metadata_dir`, we don't need to add the tables again.

In [17]:
# Build a new DataSource over the same metadata_dir; the tables registered
# earlier are expected to come back from the stored metadata.
ds = DataSource(metadata_dir=metadata_dir)

In [18]:
# Confirm that the re-created DataSource still lists both tables.
ds.query('SHOW TABLES')



Unnamed: 0,name,fpath
0,googleplaystore,testdrive_csvs/googleplaystore.csv
1,googleplaystore_user_reviews,testdrive_csvs/googleplaystore_user_reviews.csv


If we need it, we can clear the metadata and recreate the DataSource using `DataSource.create_new(metadata_dir)`

# Querying

In [20]:
# Select every row and column of the apps table. In the saved output the column
# names are lower-case snake_case versions of the CSV headers
# (e.g. "Content Rating" -> content_rating).
# NOTE(review): the dict printed before the result looks like the parsed SQL
# statement — presumably debug output from dataskillet; confirm.
ds.query('SELECT * FROM googleplaystore')

{'SelectStmt': {'targetList': [{'ResTarget': {'val': {'ColumnRef': {'fields': [{'A_Star': {}}], 'location': 7}}, 'location': 7}}], 'fromClause': [{'RangeVar': {'relname': 'googleplaystore', 'inh': True, 'relpersistence': 'p', 'location': 14}}], 'op': 0}}




Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [30]:
# Project a single column with a numeric equality filter.
# NOTE(review): the saved output shows this cell raising
# "AttributeError: type object 'DataFrame' has no attribute '__constructor__'",
# so it does not currently produce a result — needs investigation.
ds.query('SELECT app FROM googleplaystore WHERE price = 0')

{'SelectStmt': {'targetList': [{'ResTarget': {'val': {'ColumnRef': {'fields': [{'String': {'str': 'app'}}], 'location': 7}}, 'location': 7}}], 'fromClause': [{'RangeVar': {'relname': 'googleplaystore', 'inh': True, 'relpersistence': 'p', 'location': 16}}], 'whereClause': {'A_Expr': {'kind': 0, 'name': [{'String': {'str': '='}}], 'lexpr': {'ColumnRef': {'fields': [{'String': {'str': 'price'}}], 'location': 38}}, 'rexpr': {'A_Const': {'val': {'Integer': {'ival': 0}}, 'location': 46}}, 'location': 44}}, 'op': 0}}


AttributeError: type object 'DataFrame' has no attribute '__constructor__'

In [33]:
# Filter on two conditions combined with AND.
# NOTE(review): the saved output shows "Exception: Unsupported operation: AND",
# i.e. compound WHERE clauses were not supported when this notebook was run.
ds.query("SELECT * FROM googleplaystore WHERE category = 'FAMILY' AND price = 0")

{'SelectStmt': {'targetList': [{'ResTarget': {'val': {'ColumnRef': {'fields': [{'A_Star': {}}], 'location': 7}}, 'location': 7}}], 'fromClause': [{'RangeVar': {'relname': 'googleplaystore', 'inh': True, 'relpersistence': 'p', 'location': 14}}], 'whereClause': {'BoolExpr': {'boolop': 0, 'args': [{'A_Expr': {'kind': 0, 'name': [{'String': {'str': '='}}], 'lexpr': {'ColumnRef': {'fields': [{'String': {'str': 'category'}}], 'location': 36}}, 'rexpr': {'A_Const': {'val': {'String': {'str': 'FAMILY'}}, 'location': 47}}, 'location': 45}}, {'A_Expr': {'kind': 0, 'name': [{'String': {'str': '='}}], 'lexpr': {'ColumnRef': {'fields': [{'String': {'str': 'price'}}], 'location': 60}}, 'rexpr': {'A_Const': {'val': {'Integer': {'ival': 0}}, 'location': 68}}, 'location': 66}}], 'location': 56}}, 'op': 0}}


Exception: Unsupported operation: AND

In [34]:
# Inner-join apps with their user reviews on the app name, limited to 10 rows.
# NOTE(review): the saved output shows
# "ValueError: labels ['app_y', 'app_x'] not contained in axis",
# so this join does not currently succeed — needs investigation.
ds.query("SELECT * FROM googleplaystore INNER JOIN googleplaystore_user_reviews ON googleplaystore.app = googleplaystore_user_reviews.app LIMIT 10")

{'SelectStmt': {'targetList': [{'ResTarget': {'val': {'ColumnRef': {'fields': [{'A_Star': {}}], 'location': 7}}, 'location': 7}}], 'fromClause': [{'JoinExpr': {'jointype': 0, 'larg': {'RangeVar': {'relname': 'googleplaystore', 'inh': True, 'relpersistence': 'p', 'location': 14}}, 'rarg': {'RangeVar': {'relname': 'googleplaystore_user_reviews', 'inh': True, 'relpersistence': 'p', 'location': 41}}, 'quals': {'A_Expr': {'kind': 0, 'name': [{'String': {'str': '='}}], 'lexpr': {'ColumnRef': {'fields': [{'String': {'str': 'googleplaystore'}}, {'String': {'str': 'app'}}], 'location': 73}}, 'rexpr': {'ColumnRef': {'fields': [{'String': {'str': 'googleplaystore_user_reviews'}}, {'String': {'str': 'app'}}], 'location': 95}}, 'location': 93}}}}], 'limitCount': {'A_Const': {'val': {'Integer': {'ival': 10}}, 'location': 134}}, 'op': 0}}


ValueError: labels ['app_y', 'app_x'] not contained in axis