In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark
!pip install pandas

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 33 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 51.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=53f0264a0e5b952f232be3112382a0417def30ad67065d8685b3a4249fbf85a5
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [2]:
import findspark
findspark.init()

In [3]:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [4]:
# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [5]:
spark

Loading data into a Pandas Dataframe then into a Spark DataFrame

In [41]:
all_seasons = pd.read_csv('all_seasons.csv')
sdf = spark.createDataFrame(all_seasons) 

In [7]:
sdf.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- player_name: string (nullable = true)
 |-- team_abbreviation: string (nullable = true)
 |-- age: double (nullable = true)
 |-- player_height: double (nullable = true)
 |-- player_weight: double (nullable = true)
 |-- college: string (nullable = true)
 |-- country: string (nullable = true)
 |-- draft_year: string (nullable = true)
 |-- draft_round: string (nullable = true)
 |-- draft_number: string (nullable = true)
 |-- gp: long (nullable = true)
 |-- pts: double (nullable = true)
 |-- reb: double (nullable = true)
 |-- ast: double (nullable = true)
 |-- net_rating: double (nullable = true)
 |-- oreb_pct: double (nullable = true)
 |-- dreb_pct: double (nullable = true)
 |-- usg_pct: double (nullable = true)
 |-- ts_pct: double (nullable = true)
 |-- ast_pct: double (nullable = true)
 |-- season: string (nullable = true)



In [42]:
sdf.show(10)

+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|Unnamed: 0|      player_name|team_abbreviation| age|player_height|player_weight|             college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|oreb_pct|          dreb_pct|           usg_pct|            ts_pct|ast_pct| season|
+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|         0|    Dennis Rodman|              CHI|36.0|       198.12|     99.79024|Southeastern Okla...|    USA|      1986|          2|          27| 55| 5.7|16.1|3.1|      16.1|   0.186|0.3229999999999999|               0.1|    

#### Remove all undrafted players

In [43]:
sdf1 = sdf.filter((sdf.draft_round != 'Undrafted') | (sdf.draft_number != 'Undrafted'))
sdf1.show(10)

+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|Unnamed: 0|      player_name|team_abbreviation| age|player_height|player_weight|             college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|oreb_pct|          dreb_pct|           usg_pct|            ts_pct|ast_pct| season|
+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|         0|    Dennis Rodman|              CHI|36.0|       198.12|     99.79024|Southeastern Okla...|    USA|      1986|          2|          27| 55| 5.7|16.1|3.1|      16.1|   0.186|0.3229999999999999|               0.1|    

#### Show all players named David

In [44]:
sdf1.filter(sdf.player_name.like('David%')).show(10)

+----------+--------------+-----------------+----+-------------+-------------+----------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+------------------+-------+
|Unnamed: 0|   player_name|team_abbreviation| age|player_height|player_weight|   college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|oreb_pct|          dreb_pct|           usg_pct|            ts_pct|           ast_pct| season|
+----------+--------------+-----------------+----+-------------+-------------+----------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+------------------+-------+
|       186|David Robinson|              SAS|31.0|        215.9|    106.59412|      Navy|    USA|      1987|          1|           1|  6|17.7| 8.5|1.3|       9.6|   0.161|             0.237|0.3339999999999999|0.5589999999999999|    

#### Show the tallest 3 players, displaying their height in meters

In [51]:
sdf2 = sdf1.withColumn('player_height_meters', sdf1['player_height']/100).sort("player_height_meters", ascending=False)
sdf2.select("player_name","player_height_meters").distinct().show(3)

+--------------+--------------------+
|   player_name|player_height_meters|
+--------------+--------------------+
|Walter McCarty|              2.0828|
| Devean George|               2.032|
|  Tim Hardaway|              1.8288|
+--------------+--------------------+
only showing top 3 rows



####  Display the information of players that come from Mexico

In [33]:
sdf.filter(sdf.country == 'Mexico').show(10) 

+----------+--------------+-----------------+----+-------------+-------------+--------+-------+----------+-----------+------------+---+---+---+---+----------+--------+--------+-------+------------------+------------------+-------+
|Unnamed: 0|   player_name|team_abbreviation| age|player_height|player_weight| college|country|draft_year|draft_round|draft_number| gp|pts|reb|ast|net_rating|oreb_pct|dreb_pct|usg_pct|            ts_pct|           ast_pct| season|
+----------+--------------+-----------------+----+-------------+-------------+--------+-------+----------+-----------+------------+---+---+---+---+----------+--------+--------+-------+------------------+------------------+-------+
|      1765|Eduardo Najera|              DAL|24.0|        203.2|   106.140528|Oklahoma| Mexico|      2000|          2|          38| 40|3.3|2.4|0.7|      -4.9|   0.107|   0.142|  0.146|             0.522|             0.093|2000-01|
|      2624|Eduardo Najera|              DAL|25.0|        203.2|   106.14052

#### Calculate the BMI of each player, insert into the collection as a new column, then display the players according to their BMI descendingly

In [57]:
sdf3 = sdf2.withColumn('bmi', sdf2['player_weight']/sdf2['player_height_meters']**2)
sdf3.select("player_name","bmi").distinct().sort("bmi", ascending=False).show(5)

+----------------+-----------------+
|     player_name|              bmi|
+----------------+-----------------+
|   Oliver Miller|34.82661591142417|
|   Oliver Miller| 33.7550277295342|
|   Oliver Miller|33.21923363858921|
|Shaquille O'Neal|33.08560028884764|
| Zion Williamson|32.81913177896047|
+----------------+-----------------+
only showing top 5 rows



#### Display the number of players from each country, showing the 5 least present countries

In [58]:
country_counts = sdf1.groupby(['country'])\
.agg({"country": "count"})\
.sort("count(country)", ascending=True)\
.show(5)

+-----------+--------------+
|    country|count(country)|
+-----------+--------------+
|Puerto Rico|             1|
|     Angola|             1|
|      Ghana|             1|
|      Gabon|             1|
|     Guinea|             1|
+-----------+--------------+
only showing top 5 rows

