# Data Analysis in Spark

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from ts.flint import FlintContext, summarizers 

In [6]:
# Bind the context to `sc`: the SQLContext constructor below and the final
# `sc.stop()` cell both refer to it by that name, so binding it only to
# `spark` left `sc` undefined on a fresh kernel.
sc = SparkContext("local", "sqlContext")
spark = sc  # keep the original variable name available as an alias
# SQL entry point used by the following cells to read data.
sql = SQLContext(sc)

In [9]:
# Load the sales CSV, using its first line as column names and letting Spark
# infer each column's type, then preview the loaded rows.
df = (
    sql.read
    .options(header=True, inferSchema=True)
    .csv('D:/data/csv/vgsales.csv')
)
df.show()

+----+--------------------+--------+----+------------+--------------------+--------+--------+--------+-----------+------------+
|Rank|                Name|Platform|Year|       Genre|           Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+--------------------+--------+----+------------+--------------------+--------+--------+--------+-----------+------------+
|   1|          Wii Sports|     Wii|2006|      Sports|            Nintendo|   41.49|   29.02|    3.77|       8.46|       82.74|
|   2|   Super Mario Bros.|     NES|1985|    Platform|            Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|
|   3|      Mario Kart Wii|     Wii|2008|      Racing|            Nintendo|   15.85|   12.88|    3.79|       3.31|       35.82|
|   4|   Wii Sports Resort|     Wii|2009|      Sports|            Nintendo|   15.75|   11.01|    3.28|       2.96|        33.0|
|   5|Pokemon Red/Pokem...|      GB|1996|Role-Playing|            Nintendo|   11.27|    8.89|   10.22|  

#### Shape

In [22]:
def dfShape(df):
    """Return the dimensions of ``df`` as a ``(rows, columns)`` tuple.

    The row count is obtained from ``df.count()`` and the column count from
    the length of ``df.columns``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return n_rows, n_cols

# Display the (row count, column count) pair for the loaded DataFrame.
dfShape(df)

(16598, 11)

#### Head & schema

In [20]:
# Preview the first five rows of the frame.
df.show(n=5)

+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+
|Rank|                Name|Platform|Year|       Genre|Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+
|   1|          Wii Sports|     Wii|2006|      Sports| Nintendo|   41.49|   29.02|    3.77|       8.46|       82.74|
|   2|   Super Mario Bros.|     NES|1985|    Platform| Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|
|   3|      Mario Kart Wii|     Wii|2008|      Racing| Nintendo|   15.85|   12.88|    3.79|       3.31|       35.82|
|   4|   Wii Sports Resort|     Wii|2009|      Sports| Nintendo|   15.75|   11.01|    3.28|       2.96|        33.0|
|   5|Pokemon Red/Pokem...|      GB|1996|Role-Playing| Nintendo|   11.27|    8.89|   10.22|        1.0|       31.37|
+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+

In [21]:
# Convert only a small sample to pandas: toPandas() collects every row onto
# the driver, and rendering the whole frame would flood the notebook output.
# NOTE(review): the AttributeError recorded under this cell presumably came
# from running it without an active SparkContext — confirm by re-running the
# notebook from a fresh kernel.
df.limit(5).toPandas()

AttributeError: 'NoneType' object has no attribute 'setCallSite'

In [14]:
# Print the inferred column names and types. In the recorded output, Year is
# inferred as string rather than integer.
df.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)



#### Statistics

In [19]:
# Summary statistics (count, mean, stddev, min, max) for the two sales columns.
df.describe('JP_Sales', 'Global_Sales').show()

+-------+-------------------+------------------+
|summary|           JP_Sales|      Global_Sales|
+-------+-------------------+------------------+
|  count|              16598|             16598|
|   mean|0.07778166044101108|  0.53744065550074|
| stddev| 0.3092906480822022|1.5550279355699066|
|    min|                0.0|              0.01|
|    max|              10.22|             82.74|
+-------+-------------------+------------------+



#### Filtering

In [18]:
# Project three columns and print the first 15 rows without truncating
# long cell values.
df.select(['Name', 'Genre', 'JP_Sales']).show(n=15, truncate=False)

+---------------------------+------------+--------+
|Name                       |Genre       |JP_Sales|
+---------------------------+------------+--------+
|Wii Sports                 |Sports      |3.77    |
|Super Mario Bros.          |Platform    |6.81    |
|Mario Kart Wii             |Racing      |3.79    |
|Wii Sports Resort          |Sports      |3.28    |
|Pokemon Red/Pokemon Blue   |Role-Playing|10.22   |
|Tetris                     |Puzzle      |4.22    |
|New Super Mario Bros.      |Platform    |6.5     |
|Wii Play                   |Misc        |2.93    |
|New Super Mario Bros. Wii  |Platform    |4.7     |
|Duck Hunt                  |Shooter     |0.28    |
|Nintendogs                 |Simulation  |1.93    |
|Mario Kart DS              |Racing      |4.13    |
|Pokemon Gold/Pokemon Silver|Role-Playing|7.2     |
|Wii Fit                    |Sports      |3.6     |
|Wii Fit Plus               |Sports      |2.53    |
+---------------------------+------------+--------+
only showing top 15 rows

In [20]:
# Count rows per platform and list the ten platforms with the most rows.
(
    df.groupBy("Platform")
    .count()
    .sort("count", ascending=False)
    .show(10)
)

+--------+-----+
|Platform|count|
+--------+-----+
|      DS| 2163|
|     PS2| 2161|
|     PS3| 1329|
|     Wii| 1325|
|    X360| 1265|
|     PSP| 1213|
|      PS| 1196|
|      PC|  960|
|      XB|  824|
|     GBA|  822|
+--------+-----+
only showing top 10 rows



In [24]:
# Filter the frame loaded above. The original cell read from `data`, a name
# that no visible cell defines, so it failed on a fresh kernel.
# cond1: keep rows where at least one of the two sales figures is present.
cond1 = (df.JP_Sales.isNotNull()) | (df.Global_Sales.isNotNull())
# cond2: exclude the Sports genre.
cond2 = df.Genre != 'Sports'
# Rebinding `df` keeps the later cell that registers `df` as a table working;
# re-running this cell is harmless because applying the same filters twice
# yields the same rows.
df = df.filter(cond1).filter(cond2)
df.show(20)

+----+--------------------+--------+----+------------+--------------------+--------+--------+--------+-----------+------------+
|Rank|                Name|Platform|Year|       Genre|           Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+--------------------+--------+----+------------+--------------------+--------+--------+--------+-----------+------------+
|   2|   Super Mario Bros.|     NES|1985|    Platform|            Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|
|   3|      Mario Kart Wii|     Wii|2008|      Racing|            Nintendo|   15.85|   12.88|    3.79|       3.31|       35.82|
|   5|Pokemon Red/Pokem...|      GB|1996|Role-Playing|            Nintendo|   11.27|    8.89|   10.22|        1.0|       31.37|
|   6|              Tetris|      GB|1989|      Puzzle|            Nintendo|    23.2|    2.26|    4.22|       0.58|       30.26|
|   7|New Super Mario B...|      DS|2006|    Platform|            Nintendo|   11.38|    9.23|     6.5|  

#### SQL

In [29]:
# Expose the DataFrame to SQL queries under the name "vg_sales".
# The original cell used `sqc`, which no visible cell defines; the SQLContext
# created at the top of the notebook is named `sql`. The view is registered
# through the DataFrame's own createOrReplaceTempView, which is safe to
# re-run because it replaces an existing view of the same name.
df.createOrReplaceTempView("vg_sales")
# List the tables visible to the SQL context to confirm the registration.
sql.tables().show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        | vg_sales|       true|
+--------+---------+-----------+



In [34]:
# Query the registered "vg_sales" table for the Name, Genre and NA_Sales of
# every row whose Genre is 'Action'. Uses `sql`, the SQLContext created at the
# top of the notebook (the original `sqc` is not defined in any visible cell).
df2 = sql.sql("SELECT vg_sales.Name, vg_sales.Genre, vg_sales.NA_Sales FROM vg_sales WHERE vg_sales.Genre='Action'")
df2.show()

+--------------------+------+--------+
|                Name| Genre|NA_Sales|
+--------------------+------+--------+
|  Grand Theft Auto V|Action|    7.01|
|Grand Theft Auto:...|Action|    9.43|
|  Grand Theft Auto V|Action|    9.63|
|Grand Theft Auto:...|Action|    8.41|
|Grand Theft Auto III|Action|    6.99|
|  Grand Theft Auto V|Action|     3.8|
|Pokemon HeartGold...|Action|     4.4|
| Grand Theft Auto IV|Action|    6.76|
| Grand Theft Auto IV|Action|    4.76|
|      FIFA Soccer 13|Action|    1.06|
|Grand Theft Auto:...|Action|     2.9|
|The Legend of Zel...|Action|     4.1|
|The Legend of Zel...|Action|    3.83|
|Uncharted 3: Drak...|Action|    2.77|
|Uncharted 2: Amon...|Action|    3.27|
| Red Dead Redemption|Action|    2.79|
| The Legend of Zelda|Action|    3.74|
|Assassin's Creed III|Action|    2.64|
| Red Dead Redemption|Action|     3.7|
|              Driver|Action|    3.11|
+--------------------+------+--------+
only showing top 20 rows



In [4]:
# Shut down the SparkContext; this is the last code cell of the notebook.
# NOTE(review): the recorded execution count (In [4]) is lower than that of
# the cells above, so this cell was run out of order — re-run the notebook
# top to bottom before sharing.
sc.stop()

## Credits & Links

https://towardsdatascience.com/a-neanderthals-guide-to-apache-spark-in-python-9ef1f156d427