In [1]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.context import SQLContext

In [2]:
spark = SparkSession.builder.appName('spark-sql').master('local').getOrCreate()
sqlContext = SQLContext(spark)
filepath = 'data/Seasons_Stats.csv'
df = sqlContext.read.load(filepath, format='com.databricks.spark.csv', header='true',inferSchema='true')

In [3]:
df.registerTempTable('seasons_stats')

Data description

In [4]:
describe = sqlContext.sql("describe seasons_stats")
describe.show(n = 100)

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|     _c0|      int|   null|
|    Year|      int|   null|
|  Player|   string|   null|
|     Pos|   string|   null|
|     Age|      int|   null|
|      Tm|   string|   null|
|       G|      int|   null|
|      GS|      int|   null|
|      MP|      int|   null|
|     PER|   double|   null|
|     TS%|   double|   null|
|    3PAr|   double|   null|
|     FTr|   double|   null|
|    ORB%|   double|   null|
|    DRB%|   double|   null|
|    TRB%|   double|   null|
|    AST%|   double|   null|
|    STL%|   double|   null|
|    BLK%|   double|   null|
|    TOV%|   double|   null|
|    USG%|   double|   null|
|   blanl|   string|   null|
|     OWS|   double|   null|
|     DWS|   double|   null|
|      WS|   double|   null|
|   WS/48|   double|   null|
|  blank2|   string|   null|
|    OBPM|   double|   null|
|    DBPM|   double|   null|
|     BPM|   double|   null|
|    VORP|   double|   null|
|      FG|    

Example of the first row

In [5]:
df.show(1, vertical = True)

-RECORD 0-----------------
 _c0    | 0               
 Year   | 1950            
 Player | Curly Armstrong 
 Pos    | G-F             
 Age    | 31              
 Tm     | FTW             
 G      | 63              
 GS     | null            
 MP     | null            
 PER    | null            
 TS%    | 0.368           
 3PAr   | null            
 FTr    | 0.467           
 ORB%   | null            
 DRB%   | null            
 TRB%   | null            
 AST%   | null            
 STL%   | null            
 BLK%   | null            
 TOV%   | null            
 USG%   | null            
 blanl  | null            
 OWS    | -0.1            
 DWS    | 3.6             
 WS     | 3.5             
 WS/48  | null            
 blank2 | null            
 OBPM   | null            
 DBPM   | null            
 BPM    | null            
 VORP   | null            
 FG     | 144             
 FGA    | 516             
 FG%    | 0.279           
 3P     | null            
 3PA    | null            
 

### Best scorers

In [6]:
selectall = sqlContext.sql("SELECT Player, sum(2P + 3P) as points from seasons_stats group by Player order by points desc")
selectall.show()

+------------------+------+
|            Player|points|
+------------------+------+
|      Karl Malone*| 13528|
|   Michael Jordan*| 12192|
|     Eddie Johnson| 11896|
|       Kobe Bryant| 11719|
| Shaquille O'Neal*| 11661|
|  Hakeem Olajuwon*| 10749|
|     Dirk Nowitzki| 10688|
|Dominique Wilkins*| 10661|
|     Kevin Garnett| 10648|
|      LeBron James| 10423|
|        Tim Duncan| 10285|
|     Alex English*| 10174|
|      Vince Carter|  9961|
|    Patrick Ewing*|  9702|
|    Allen Iverson*|  9532|
|      Gary Payton*|  9373|
|   Carmelo Anthony|  9300|
|         Ray Allen|  9165|
|    Clyde Drexler*|  8906|
|       Paul Pierce|  8668|
+------------------+------+
only showing top 20 rows



### 3 Points attempts per season

In [11]:
selectall = sqlContext.sql("SELECT Year, AVG(3PA) as average from seasons_stats where 3PA is not null group by Year  order by Year")
selectall.show(n = 100)

+----+------------------+
|Year|           average|
+----+------------------+
|1980|15.467787114845938|
|1981|11.462809917355372|
|1982|12.479892761394101|
|1983|11.744245524296675|
|1984|13.408163265306122|
|1985|16.596685082872927|
|1986| 17.61741424802111|
|1987| 24.27777777777778|
|1988|25.199530516431924|
|1989|  33.0958904109589|
|1990| 33.46623093681917|
|1991| 37.61224489795919|
|1992| 38.06331877729258|
|1993| 45.39866369710467|
|1994| 47.88149688149688|
|1995| 79.36725663716814|
|1996| 76.21100917431193|
|1997| 76.67421602787456|
|1998| 62.20475319926874|
|1999| 39.96252465483235|
|2000| 68.12903225806451|
|2001| 63.85474860335196|
|2002|            75.034|
|2003|  76.8136645962733|
|2004| 67.87692307692308|
|2005| 75.05299145299145|
|2006| 76.19538188277087|
|2007| 85.79263565891473|
|2008| 81.48571428571428|
|2009|  86.8298969072165|
|2010| 85.09342560553634|
|2011|            84.664|
|2012| 69.46823956442832|
|2013| 92.42233856893543|
|2014| 95.64320785597381|
|2015| 97.41

### Prediction of 3 Point attempts

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [12]:
vectorAssembler = VectorAssembler(inputCols = ['Year'], outputCol = 'features')
v_df = vectorAssembler.transform(selectall)
v_df = v_df.select(['features', 'average'])
v_df.show(3)

+--------+------------------+
|features|           average|
+--------+------------------+
|[1980.0]|15.467787114845938|
|[1981.0]|11.462809917355372|
|[1982.0]|12.479892761394101|
+--------+------------------+
only showing top 3 rows



#### Model training

In [13]:
lr = LinearRegression(featuresCol = 'features', labelCol='average')
lr_model = lr.fit(v_df)

#### Future dataset

In [33]:
data = [
    (2018.0,),
    (2019.0,),
    (2020.0,),
    (2021.0,),
    (2022.0,)
]

df_future = spark.createDataFrame(data, ['Year'])

In [38]:
v_df_predict = vectorAssembler.transform(df_future)
v_df_predict = v_df_predict.select(['features'])

In [41]:
v_df_predict.show()

+--------+
|features|
+--------+
|[2018.0]|
|[2019.0]|
|[2020.0]|
|[2021.0]|
|[2022.0]|
+--------+



#### Predictions

In [42]:
predictions = lr_model.transform(v_df_predict)

In [44]:
predictions.show()

+--------+------------------+
|features|        prediction|
+--------+------------------+
|[2018.0]| 110.3489393223499|
|[2019.0]|112.99379759286967|
|[2020.0]|115.63865586338943|
|[2021.0]| 118.2835141339092|
|[2022.0]|120.92837240442896|
+--------+------------------+

