In [1]:
import findspark
findspark.init()
import pyspark
import random
from pyspark.sql import SparkSession
import pyspark.sql
sc = pyspark.SparkContext(appName="NBA")

### Best scorers

In [2]:
from pyspark.sql.functions import *
spark = SparkSession(sc)
players = spark.read.csv(path='Player Totals.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)
players = players.withColumn("pts",players.pts.cast('int'))
players = players.withColumn("x3pa",players.x3pa.cast('int'))
players = players.withColumn("mp",players.mp.cast('int'))
players.filter(players.season>=1980.0).groupBy('player').sum('pts').orderBy(desc("sum(pts)"), "player").show()


+-----------------+--------+
|           player|sum(pts)|
+-----------------+--------+
|     LeBron James|   37062|
|      Karl Malone|   36928|
|      Kobe Bryant|   33643|
|   Michael Jordan|   32292|
|    Dirk Nowitzki|   31560|
|  Carmelo Anthony|   30259|
|    Eddie Johnson|   29779|
| Shaquille O'Neal|   29428|
|     Vince Carter|   28636|
|Dominique Wilkins|   28591|
|    Allen Iverson|   27457|
|  Hakeem Olajuwon|   26946|
|       Tim Duncan|   26496|
|      Paul Pierce|   26397|
|    Kevin Garnett|   26394|
|        Ray Allen|   26218|
|     James Harden|   25992|
|     Kevin Durant|   25526|
|    Reggie Miller|   25279|
|    Patrick Ewing|   24818|
+-----------------+--------+
only showing top 20 rows



### Average number of 3 point attempts for each season

In [3]:
players.filter(players.season>=1980.0).groupBy('season').agg((sum("x3pa")/sum("mp")*10*48).alias('attempts')).orderBy(asc("season")).show(43)

+------+------------------+
|season|          attempts|
+------+------------------+
|  1980|5.4671914061533124|
|  1981|4.1234593389351115|
|  1982|4.5528362515307546|
|  1983| 4.419448170493302|
|  1984| 4.742948486567318|
|  1985|  6.18398593301025|
|  1986| 6.731299146448014|
|  1987|  9.23923006416132|
|  1988|10.154621093580644|
|  1989|12.989974796975638|
|  1990|12.892851584835519|
|  1991|14.080097336337367|
|  1992|15.035523180807061|
|  1993| 17.66776514775323|
|  1994| 19.43920237739366|
|  1995|30.694933412241596|
|  1996|31.264142865879762|
|  1997| 33.39959936822235|
|  1998|25.645966673157005|
|  1999|26.214823280788387|
|  2000|27.239988311439674|
|  2001|26.398075380914193|
|  2002| 29.48435581315389|
|  2003|29.282990573064914|
|  2004|  29.1770800325145|
|  2005|31.852439616045565|
|  2006|31.872750716288024|
|  2007|33.727102317187565|
|  2008| 35.55370870558903|
|  2009| 36.63350187571359|
|  2010| 35.95124536301007|
|  2011| 36.98485320628005|
|  2012| 36.22693295

In [4]:
three_point = players.filter(players.season>=1980.0).groupBy('season').agg((sum("x3pa")/sum("mp")*10*48).alias('attempts')).orderBy(asc("season"))
three_point = three_point.withColumn("attempts",three_point.attempts.cast('float'))

In [5]:
from pyspark.ml.feature import VectorAssembler
last17 = spark.createDataFrame(three_point.tail(17))
vectorAssembler = VectorAssembler(inputCols = ['season'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(last17)
vhouse_df = vhouse_df.select(['features', 'attempts'])

In [6]:
import pyspark.ml.regression as pmr
lr = pmr.LinearRegression(featuresCol = 'features', labelCol='attempts')
lr_model = lr.fit(vhouse_df)
years_forward = list(range(2023,2028))
preds = []
for i in years_forward:
    preds.append(i*lr_model.coefficients[0]+lr_model.intercept)

In [7]:
columns = ['season','attempts']
for l in range(5):
    a = years_forward[l]
    b = float(preds[l])
    newRow = spark.createDataFrame([(a,b)],columns)
    three_point = three_point.union(newRow)

### Prediction for 5 next seasons

In [8]:
three_point.show(48)

+------+------------------+
|season|          attempts|
+------+------------------+
|  1980| 5.467191219329834|
|  1981| 4.123459339141846|
|  1982|4.5528364181518555|
|  1983| 4.419448375701904|
|  1984| 4.742948532104492|
|  1985| 6.183985710144043|
|  1986| 6.731298923492432|
|  1987| 9.239230155944824|
|  1988|10.154621124267578|
|  1989|12.989974975585938|
|  1990|12.892851829528809|
|  1991|14.080097198486328|
|  1992|15.035523414611816|
|  1993| 17.66776466369629|
|  1994|  19.4392032623291|
|  1995| 30.69493293762207|
|  1996|31.264142990112305|
|  1997|33.399600982666016|
|  1998|25.645967483520508|
|  1999| 26.21482276916504|
|  2000|27.239988327026367|
|  2001|26.398075103759766|
|  2002|29.484355926513672|
|  2003|29.282991409301758|
|  2004|29.177080154418945|
|  2005|31.852439880371094|
|  2006|31.872751235961914|
|  2007| 33.72710418701172|
|  2008|35.553707122802734|
|  2009| 36.63350296020508|
|  2010| 35.95124435424805|
|  2011|  36.9848518371582|
|  2012|36.226932525