In [1]:
import os

In [2]:
os.makedirs('export', exist_ok = True)

In [3]:
import gdown

gdown.download('https://drive.google.com/uc?id=1BqHzpcPg72yUJMpTzIEdQUs-TR2oIWaw', 'export/gp.zip', quiet = True)

'export/gp.zip'

In [4]:
import zipfile

with zipfile.ZipFile('export/gp.zip', 'r') as zip_ref:
    zip_ref.extractall('export')

In [5]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [6]:
os.environ['PYSPARK_PYTHON'] = 'python3'

In [7]:
spark = SparkSession.builder.appName('GooglePlay').getOrCreate()

In [8]:
df = spark.read.json('export/gp.jsonl')

In [9]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[author: string, author_link: string, compatibility: string, content_rating: string, description: string, developer_badge: string, developer_id: string, downloads: string, genre: string, item_name: string, link: string, physical_address: string, price: string, rating_value: string, review_number: string, updated: string, url: string, version: string, video_url: string]>

In [10]:
df.createOrReplaceTempView('apps')

In [11]:
best_photographies = spark.sql(
    '''SELECT item_name, CAST(rating_value as FLOAT) as rating FROM apps
    WHERE genre = 'Photography' AND CAST(rating_value as FLOAT) > 4.5
    ORDER BY rating
    '''
)

best_photographies.show()

+--------------------+------+
|           item_name|rating|
+--------------------+------+
|Photo Collage Editor|   4.6|
|แต่งรูป สติ๊กเกอร...|   4.6|
|FotoTool - Photog...|   4.6|
|Hoarding Pic Coll...|   4.6|
|Name Birthday Wishes|   4.6|
|Free Salento Sfon...|   4.6|
|Ramzyat HD - رمزي...|   4.6|
| Photo Effect Eraser|   4.6|
|         Photo Mixer|   4.6|
|         Love Frames|   4.6|
|        Jaguar Theme|   4.6|
|Bridal Suit Photo...|   4.6|
|PhotoMemo - Preci...|   4.6|
|Navratri Photo Fr...|   4.7|
|Women Day Photo C...|   4.7|
|New Year Photo Fr...|   4.7|
|    Photo To Art Pro|   4.7|
| Christmas Wallpaper|   4.7|
|      Rainbow Frames|   4.7|
|Hijab Beauty Wedding|   4.7|
+--------------------+------+
only showing top 20 rows



In [12]:
good_descriptions = spark.sql(
    '''SELECT item_name, description FROM apps
    WHERE LENGTH(description) > 100
    ORDER BY LENGTH(description)
    '''
)

good_descriptions.show()

+--------------------+-------------------------------------+
|           item_name|                          description|
+--------------------+-------------------------------------+
| Pregancy Calculator|                 This is a pregnan...|
|Elvis Presley Mus...|                 Songs,lyrics,phot...|
|    STB Smart Remote|可以通过手机连接并遥控机顶盒，手机...|
|      Test Pitagoras|                 Test de 20 pregun...|
|Ukrainian Fairy T...|                 Many generations ...|
|        Hair Clipper|                 This app allows y...|
|Percentage Calcul...|                 Basic Percentage ...|
|      通話ランキング|通話回数、通話時間単位でランキング...|
|     Revenue Monitor|                 beta versionCurre...|
|Secret Love Notif...|                 Help you, show yo...|
|   Lahourcade Basket|                 Appli officielle ...|
|Rádio Pentecostal...|                 Aplicativo da Rád...|
|       This is Ninja|                 Help Ninja to jum...|
|      Basketballaris|                 A site about Aris...|
|Cli

In [13]:
most_expansive = spark.sql(
    '''SELECT item_name, CAST(TRIM('£ Buy' FROM price) AS FLOAT) AS price FROM apps
    WHERE price <> 'Install'
    ORDER BY price DESC
    '''
)

most_expansive.show()

+----------------------------------------+------+
|                               item_name| price|
+----------------------------------------+------+
|                    All is Well; Be H...| 300.0|
|                                 DCKoin6|299.99|
|                    Apple and Ham Fla...|139.99|
|                             Robot Horse| 63.99|
|                              Calculator| 50.99|
|                               Skypier 7| 49.99|
|                    TRBOnet™ Bluetoot...| 47.99|
|                    ViperOne ProKey (...| 43.63|
|                    The White Belt Bible| 41.99|
|                    Corbin Champion U...| 40.99|
|                    Drugs in Anaesth&...| 34.99|
|                    Gujarati For Kids...| 31.33|
|                    Hungarian For Kid...| 30.95|
|                    Mini Sale for Min...| 27.99|
|                    French-Slovak Dic...| 26.99|
|                    Lernerfolg Grunds...| 23.99|
|                      吉祥易經卜卦正式版| 21.68|
|圣经．普通话聆听

In [14]:
best_downloads = df.select(
    F.col('description'),
    F.col('downloads')
).sort(
    F.regexp_extract(F.regexp_replace(F.col('downloads'), r',', ''), r'(\d+)', 1).cast('int'), ascending=False
)

best_downloads.show()

+--------------------+--------------------+
|         description|           downloads|
+--------------------+--------------------+
|Come join Talking...|100,000,000 - 500...|
|GO Keyboard - Goo...|100,000,000 - 500...|
|The Kindle app pu...|100,000,000 - 500...|
|GO Keyboard - Goo...|100,000,000 - 500...|
|Tired of the bori...|100,000,000 - 500...|
|Build secret word...|10,000,000 - 50,0...|
|Water Drop live w...|10,000,000 - 50,0...|
|Watch live video ...|10,000,000 - 50,0...|
|Go go go! You are...|10,000,000 - 50,0...|
|This is the perfe...|10,000,000 - 50,0...|
|Welcome to the va...|10,000,000 - 50,0...|
|Hourly weather fo...|10,000,000 - 50,0...|
|Do you like build...|10,000,000 - 50,0...|
|YAHTZEE® - the cl...|10,000,000 - 50,0...|
|Labyrinth is the ...|10,000,000 - 50,0...|
|• Expand your net...|10,000,000 - 50,0...|
|In this kitchen c...|10,000,000 - 50,0...|
|Santa Biblia Rein...|10,000,000 - 50,0...|
|"Dragon Mania Leg...|10,000,000 - 50,0...|
|• Effortlessly tr...|10,000,000