<a href="https://colab.research.google.com/github/odus05/Pyspark/blob/master/03_Pandas_UDFs(nhl_game).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
## Load data from a CSV
# Reader settings shared by both files: infer column types and treat the first row as a header.
csv_options = {'inferSchema': True, 'header': True}

# skater_stats
file_location = '/FileStore/tables/game_skater_stats.csv'
df = spark.read.format('csv').options(**csv_options).load(file_location)
display(df.take(5))

# player names
file_location = "/FileStore/tables/player_info.csv"
names = spark.read.format('CSV').options(**csv_options).load(file_location)
display(names)


## Load data from S3
# df = spark.read.load("s3a://my_bucket/game_skater_stats/*.parquet")

player_id,firstName,lastName,nationality,birthCity,primaryPosition,birthDate,link
8467412,Alexei,Ponikarovsky,UKR,Kiev,LW,1980-04-09T00:00:00.000+0000,/api/v1/people/8467412
8468501,Anton,Volchenkov,RUS,Moscow,D,1982-02-25T00:00:00.000+0000,/api/v1/people/8468501
8459670,Kimmo,Timonen,FIN,Kuopio,D,1975-03-18T00:00:00.000+0000,/api/v1/people/8459670
8471233,Travis,Zajac,CAN,Winnipeg,C,1985-05-13T00:00:00.000+0000,/api/v1/people/8471233
8455710,Martin,Brodeur,CAN,Montreal,G,1972-05-06T00:00:00.000+0000,/api/v1/people/8455710
8475640,Erik,Gustafsson,SWE,Kvissleby,D,1988-12-15T00:00:00.000+0000,/api/v1/people/8475640
8476177,Matt,Read,CAN,Ilderton,RW,1986-06-14T00:00:00.000+0000,/api/v1/people/8476177
8464977,Dainius,Zubrus,LTU,Elektrenai,C,1978-06-16T00:00:00.000+0000,/api/v1/people/8464977
8474190,Wayne,Simmonds,CAN,Scarborough,RW,1988-08-26T00:00:00.000+0000,/api/v1/people/8474190
8464975,Daniel,Briere,CAN,Gatineau,C,1977-10-06T00:00:00.000+0000,/api/v1/people/8464975


In [0]:
## Write data to DBFS
# df.write.save('/FileStore/parquet/game_stats',format='parquet')

## Write data to S3
# df.write.parquet("s3a://my_bucket/game_stats", mode="overwrite")

## Write data to CSV
# df.write.save('/FileStore/parquet/game_stats.csv', format='csv') # to DBFS
# df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("s3a://my_bucket/game_stats.csv") # to S3

In [0]:
### Transforming Data
# Expose the skater stats frame to Spark SQL under the view name `stats`.
df.createOrReplaceTempView('stats')

# Per player: number of rows (as games) and total goals, keeping the five highest goal totals.
top_scorers = spark.sql("""SELECT player_id, sum(1) as games, sum(goals) as goals
             FROM stats
             Group BY 1
             Order BY 3 desc
             limit 5
          """)
top_scorers.show(5)

In [0]:
# Per player: number of rows (as games) and total goals, keeping the five highest goal totals.
top_players = spark.sql("""SELECT player_id, sum(1) as games, sum(goals) as goals 
             FROM stats 
             Group By 1 
             Order By 3 desc 
             limit 5""")

# Register both frames as temp views so the join can refer to them by name.
names.createOrReplaceTempView('names')
top_players.createOrReplaceTempView('top_players')

# Attach first and last names to the top scorers, ordered by goals descending.
top_players_named = spark.sql("""SELECT p.player_id, goals, firstName, lastName 
             FROM top_players as p 
             join names as n 
             on p.player_id = n.player_id 
             ORDER BY 2 DESC""")
top_players_named.show(5)


In [0]:
# Average goals per game, grouped by a date built from the leading characters of game_id.
# Characters 1-4 of game_id are used as the year and characters 5-6 as the month, with the
# day fixed to 01.
# NOTE(review): confirm that characters 5-6 of game_id actually encode a calendar month.
# The value is total goals divided by the number of distinct games, so the column is
# labelled goals_per_game (the previous alias, goals_per_goal, mislabelled it).
spark.sql("""SELECT CAST(SUBSTRING(game_id, 1, 4) || '-' || SUBSTRING(game_id, 5, 2) || '-01' as Date) as month,
              sum(goals)/count(distinct game_id) as goals_per_game
              FROM stats
              GROUP BY 1
              ORDER BY 1""").show(5)

In [0]:
# Total shots and goals for every player with at least five goals.
scorers = spark.sql('''SELECT player_id, sum(shots) as shots, sum(goals) as goals 
             FROM stats 
             GROUP BY 1 
             HAVING goals>=5''')
scorers.show()

# Distribution of goals-per-shot over those same players: the ratio is truncated
# down to a multiple of 1/50 and the number of players in each bucket is counted.
goals_per_shot_hist = spark.sql("""SELECT CAST(goals/shots * 50 as int)/50.0 as Goals_per_shot, sum(1) as Players 
             FROM(SELECT player_id, sum(shots) as shots, sum(goals) as goals 
                   FROM stats 
                   GROUP BY 1 
                   HAVING goals >= 5) 
             GROUP BY 1 ORDER BY 1""")
goals_per_shot_hist.show()

In [0]:
### MLlib : Linear Regression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Columns of the skater stats frame that are packed into a single `features` vector column.
feature_cols = ['shots', 'hits', 'assists', 'penaltyMinutes', 'timeOnIce', 'takeaways']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_df = assembler.transform(df)

# Linear regression with `goals` as the label and the assembled vector as the input.
lr = LinearRegression(featuresCol='features', labelCol='goals')
lr_model = lr.fit(train_df)

# Metrics below come from the model's summary on the data it was fitted on.
train_summary = lr_model.summary
print("Coefficients: " + str(lr_model.coefficients))
print("RMSE : %.3f" % train_summary.rootMeanSquaredError)
print("R2: %.3f" % train_summary.r2)

In [0]:
# creating a linear fit for a single player
from scipy.optimize import leastsq
import numpy as np

def fit(params, x, y):
  """Return the residuals of a straight line against observed values.

  Args:
    params: indexable pair where ``params[0]`` is the intercept and
      ``params[1]`` is the slope.
    x: input value(s) the line is evaluated at.
    y: observed value(s) the line is compared with.

  Returns:
    ``y`` minus the line's value at ``x``.
  """
  intercept = params[0]
  slope = params[1]
  predicted = intercept + x * slope
  return y - predicted

# (Re)register the skater stats frame as the `stats` view used by the query below.
df.createOrReplaceTempView('stats')

# Pull every row for one player into a pandas DataFrame.
single_player_sp = spark.sql("""SELECT * FROM stats 
                         WHERE player_id = 8471214""")
sample_pd = single_player_sp.toPandas()

# Least-squares fit of hits as a linear function of shots, starting from
# intercept 1 and slope 0.
initial_params = [1,0]
result = leastsq(fit, initial_params, args=(sample_pd.shots, sample_pd.hits))
print(result)

### Pandas UDFs
# Groupby -> Pandas DF -> UDFs -> Spark DF
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *
import pandas as pd

# One of the extra setup steps for using a Pandas UDF is defining a schema for the
# resulting data frame. Here the schema describes the format of the Spark data frame
# produced by the apply step.
player_fit_columns = (('ID', LongType()), ('p0', DoubleType()), ('p1', DoubleType()))
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in player_fit_columns
])

  
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def analyze_player(sample_pd): #UDFs
  """Fit hits as a linear function of shots for one player's rows.

  Args:
    sample_pd: pandas DataFrame holding the rows of one ``player_id`` group;
      the ``player_id``, ``shots`` and ``hits`` columns are read.

  Returns:
    A one-row pandas DataFrame with columns ``ID`` (the group's player id),
    ``p0`` (fitted intercept) and ``p1`` (fitted slope). Both parameters are
    0.0 when the group has at most one row.
  """
  # Positional access, so the lookup does not depend on the group's index
  # containing the label 0.
  player_id = sample_pd.player_id.iloc[0]

  # A line cannot be fitted through a single observation; return float zeros
  # so the values match the DoubleType columns declared in the schema.
  if len(sample_pd.shots) <= 1:
    return pd.DataFrame({'ID': [player_id], 'p0': [0.0], 'p1': [0.0]})

  # leastsq returns (parameters, status flag); only the parameters are used.
  params, _ = leastsq(fit, [1, 0], args=(sample_pd.shots, sample_pd.hits))
  return pd.DataFrame({'ID': [player_id], 'p0': [float(params[0])], 'p1': [float(params[1])]})

# Run the per-player fit once for each player_id group, then preview five result rows.
players_grouped = df.groupby('player_id')
player_df = players_grouped.apply(analyze_player)
display(player_df.take(5))

ID,p0,p1
8470085,2.344963791971333,-0.15734035549738
8471859,0.552176162823148,0.0221736714004199
8475765,0.7783287624631094,-0.00016742139167102827
8476426,-2.181366198783508e-12,1.6666666666703025
8476439,2.2017087251697496,0.0864601168405109


In [24]:
## Boston
# !pip3 install pyspark
### Create the SparkSession object, then load the data and inspect it
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *

# Get or create the session under the application name below.
spark = (
    SparkSession.builder
    .appName("Spark ML on boston data")
    .getOrCreate()
)

# load the boston data set
# NOTE(review): load_boston is reportedly deprecated and removed in newer scikit-learn
# releases; confirm the installed version still provides it before relying on this cell.
from sklearn.datasets import load_boston
boston = load_boston()

# convert to a Pandas Data Frame
import numpy as np
import pandas as pd
# Import pearsonr from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module path.
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor as RFR

# Append the target as the last column next to the features, name the columns,
# and shuffle the rows (sample(frac=1) returns every row in random order).
boston_values = np.c_[boston['data'], boston['target']]
boston_columns = np.append(boston['feature_names'], 'target')
boston_pd = pd.DataFrame(data=boston_values, columns=boston_columns).sample(frac=1)
print(boston_pd.shape)

# Turn the pandas frame into a Spark data frame and preview its first five rows.
boston_sp = spark.createDataFrame(boston_pd)
boston_preview = boston_sp.take(5)
display(boston_preview)

# Make the Spark data frame queryable from SQL as the `boston` view.
boston_sp.createOrReplaceTempView("boston")

# Label each row as training (1) when rand() < 0.8, otherwise test (0), then repeat
# the whole data set once per tree-count value (11, 20, 50) through the cross join.
expand_query = """
  select *
  from (
    select *, case when rand() < 0.8 then 1 else 0 end as training 
    from boston
  ) b
  cross join (
      select 11 as trees union all select 20 as trees union all select 50 as trees)
"""
full_df = spark.sql(expand_query)
display(full_df)

(506, 14)


[Row(CRIM=0.13554, ZN=12.5, INDUS=6.07, CHAS=0.0, NOX=0.409, RM=5.594, AGE=36.8, DIS=6.498, RAD=4.0, TAX=345.0, PTRATIO=18.9, B=396.9, LSTAT=13.09, target=17.4),
 Row(CRIM=6.39312, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.584, RM=6.162, AGE=97.4, DIS=2.206, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=302.76, LSTAT=24.1, target=13.3),
 Row(CRIM=4.42228, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.584, RM=6.003, AGE=94.5, DIS=2.5403, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=331.29, LSTAT=21.32, target=19.1),
 Row(CRIM=0.03768, ZN=80.0, INDUS=1.52, CHAS=0.0, NOX=0.404, RM=7.274, AGE=38.3, DIS=7.309, RAD=2.0, TAX=329.0, PTRATIO=12.6, B=392.2, LSTAT=6.62, target=34.6),
 Row(CRIM=0.02055, ZN=85.0, INDUS=0.74, CHAS=0.0, NOX=0.41, RM=6.383, AGE=35.7, DIS=9.1876, RAD=2.0, TAX=313.0, PTRATIO=17.3, B=396.9, LSTAT=5.77, target=24.7)]

DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: double, NOX: double, RM: double, AGE: double, DIS: double, RAD: double, TAX: double, PTRATIO: double, B: double, LSTAT: double, target: double, training: int, trees: int]

In [18]:
# Make the Spark data frame queryable from SQL as the `boston` view.
boston_sp.createOrReplaceTempView("boston")

# Label each row as training (1) when rand()<0.8, otherwise test (0), then repeat
# the whole data set once per tree-count value (11, 20, 50) through the cross join.
expand_query = """select * from (select *, case when rand()<0.8 then 1 else 0 end as training from boston) b
                       cross join(select 11 as trees union all select 20 as trees union all select 50 as trees)"""
full_df = spark.sql(expand_query)

# Result layout of the grouped-map UDF: the tree count and the r-squared it achieved.
schema = StructType([
    StructField('trees', LongType(), True),
    StructField('r_squared', DoubleType(), True),
])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def train_RF(group_pd):
  """Train a random forest on one ``trees`` group and score it on its held-out rows.

  The argument was previously named ``pd``, which shadowed the pandas module and
  made the final ``pd.DataFrame(...)`` call fail; the body also read the global
  ``boston_pd`` (which has no ``trees`` or ``training`` columns) instead of the
  group it was given. Spark passes the group positionally, so the rename does
  not affect how the UDF is applied.

  Args:
    group_pd: pandas DataFrame with the rows of a single ``trees`` value. The
      ``trees``, ``training`` and ``target`` columns are read; all remaining
      columns are used as model features.

  Returns:
    A one-row pandas DataFrame with columns ``trees`` (the group's tree count)
    and ``r_squared`` (squared Pearson correlation between the predictions and
    the targets of the rows where ``training`` is 0).
  """
  # Every row of the group carries the same tree count; use a plain int for scikit-learn.
  trees = int(group_pd['trees'].unique()[0])

  # get the train and test groups
  train = group_pd[group_pd['training'] == 1]
  test = group_pd[group_pd['training'] == 0]

  # create data and label groups; `training` and `trees` are bookkeeping columns
  # added when full_df was built, not predictors, so they are dropped with the label
  non_feature_cols = ['target', 'training', 'trees']
  X_train = train.drop(non_feature_cols, axis=1)
  y_train = train['target']
  X_test = test.drop(non_feature_cols, axis=1)
  y_test = test['target']

  # train a random forest regressor with the group's number of trees
  rf = RFR(n_estimators=trees)
  model = rf.fit(X_train, y_train)

  # make predictions
  y_pred = model.predict(X_test)
  r = pearsonr(y_pred, y_test)

  # return the number of trees, and the squared R value
  return pd.DataFrame({'trees': trees, 'r_squared': (float(r[0])**2)}, index=[0])

# Apply the Pandas UDF once per `trees` group of the expanded data set.
trees_grouped = full_df.groupby('trees')
results = trees_grouped.apply(train_RF)

# Show the resulting data frame.
display(results)

DataFrame[trees: bigint, r_squared: double]