<h2 style='color:blue'>Loading libraries</h2>

In [19]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import*

<h2 style='color:blue'>Creating DataFrame</h2>

In [23]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("GooglePlayStoreAnalysis").getOrCreate()

In [25]:
spark

In [27]:
df=spark.read.load('googleplaystore.csv',format='csv',sep=',',header=True,escape='"',inferschema='true')

In [29]:
df.count()

10841

<h2 style='color:blue'>Checking Schema</h2>

In [31]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [34]:
df.head()

Row(App='Photo Editor & Candy Camera & Grid & ScrapBook', Category='ART_AND_DESIGN', Rating=4.1, Reviews='159', Size='19M', Installs='10,000+', Type='Free', Price='0', Content Rating='Everyone', Genres='Art & Design', Last Updated='January 7, 2018', Current Ver='1.0.0', Android Ver='4.0.3 and up')

In [36]:
df1=df.drop("size","Content Rating","Last Updated","Android Ver")

In [38]:
df1.show()

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|       Current Ver|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design|             1.0.0|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|             2.0.0|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|             1.2.4|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|Varies with device|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|    0|Art & Design;Crea...|               1.1|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    50,000+|Free|   

<h2 style='color:blue'>Data Cleaning</h2>
<h3>Removing'+' sympol using Regular Expression<br><br>
Coverting Installs(string)to IntegerType<br><br>
Removing'$' sympol using regular Expression<br><br>
Coverting Price(string)to IntegerType</h3>

In [100]:
from pyspark.sql.functions import regexp_replace,col
df2=df1.withColumn("Reviews",col("Reviews").cast(IntegerType()))\
.withColumn("Installs",regexp_replace(col("Installs"),"[^0-9]",""))\
.withColumn("Installs",col("Installs").cast(IntegerType()))\
.withColumn("Price",regexp_replace(col("Price"),"[$]",""))\
.withColumn("Price",col("Price").cast(IntegerType()))



<h2 style='color:blue'>Data Analysis</h2>

In [72]:
df2.createOrReplaceTempView("view1")

In [74]:
r0=spark.sql("select*from view1")
r0.show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|       Current Ver|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|             1.0.0|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|             2.0.0|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|             1.2.4|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|Varies with device|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|               1.1|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|    0|        Art & Design|   

<h3>Top 10 apps with high reviews</h3>

In [80]:
r1=spark.sql("select App,sum(Reviews) from view1 group by 1 order by 2 desc")
r1.show(10)

+--------------------+------------+
|                 App|sum(Reviews)|
+--------------------+------------+
|           Instagram|   266241989|
|  WhatsApp Messenger|   207348304|
|      Clash of Clans|   179558781|
|Messenger – Text ...|   169932272|
|      Subway Surfers|   166331958|
|    Candy Crush Saga|   156993136|
|            Facebook|   156286514|
|         8 Ball Pool|    99386198|
|        Clash Royale|    92530298|
|            Snapchat|    68045010|
+--------------------+------------+
only showing top 10 rows



<h3>Top 10 installed apps and distribution of type</h3>

In [87]:
r2=spark.sql("select App,Type,sum(Installs) from view1 group by 1,2 order by 3 desc")
r2.show(10)

+------------------+----+-------------+
|               App|Type|sum(Installs)|
+------------------+----+-------------+
|    Subway Surfers|Free|   6000000000|
|         Instagram|Free|   4000000000|
|      Google Drive|Free|   4000000000|
|          Hangouts|Free|   4000000000|
|     Google Photos|Free|   4000000000|
|       Google News|Free|   4000000000|
|  Candy Crush Saga|Free|   3500000000|
|WhatsApp Messenger|Free|   3000000000|
|             Gmail|Free|   3000000000|
|      Temple Run 2|Free|   3000000000|
+------------------+----+-------------+
only showing top 10 rows



<h3>Top 10 Paid Apps</h3>

In [93]:
r3=spark.sql("select App,sum(Price) from view1  where Type='Paid' group by 1 order by 2 desc")
r3.show(10)

+--------------------+----------+
|                 App|sum(Price)|
+--------------------+----------+
|I'm Rich - Trump ...|       400|
|most expensive ap...|       399|
|           I am Rich|       399|
|  I AM RICH PRO PLUS|       399|
|  I am rich(premium)|       399|
|      I am Rich Plus|       399|
|I'm Rich/Eu sou R...|       399|
|   I Am Rich Premium|       399|
|       I Am Rich Pro|       399|
|          I am Rich!|       399|
+--------------------+----------+
only showing top 10 rows

