## Initialization

Import findspark and initialize, then import pyspark and sparkSession and create a Spark Session

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("spark://10.0.2.8:7077").appName("Datamake").getOrCreate()

Create schema and Dataframe

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType,\
                                FloatType, LongType, DecimalType
schema = StructType([ \
                     StructField('index', StringType(), True), \
                     StructField('app_id', LongType(), True), \
                     StructField('app_name', StringType(), True), \
                     StructField('review_id', LongType(), True), \
                     StructField('language', StringType(), True), \
                     StructField('review', StringType(), True), \
                     StructField('timestamp_created', LongType(), True), \
                     StructField('timestamp_updated', LongType(), True), \
                     StructField('recommended', BooleanType(), True), \
                     StructField('votes_helpful', IntegerType(), True), \
                     StructField('votes_funny', IntegerType(), True), \
                     StructField('weighted_vote_score', FloatType(), True), \
                     StructField('comment_count', IntegerType(), True), \
                     StructField('steam_purchase', BooleanType(), True), \
                     StructField('received_for_free', BooleanType(), True), \
                     StructField('written_during_early_access', BooleanType(), True), \
                     StructField('author_steamid', LongType(), True), \
                     StructField('author_num_games_owned', IntegerType(), True), \
                     StructField('author_num_reviews', IntegerType(), True), \
                     StructField('author_playtime_forever', DecimalType(), True), \
                     StructField('author_playtime_last_two_weeks', DecimalType(), True), \
                     StructField('author_playtime_at_review', DecimalType(), True), \
                     StructField('author_last_played', FloatType(), True), \
                    ])

In [4]:
from pyspark.sql.functions import col

df = spark.read.csv("/home/tolias/Documents/sr_small", header = True, schema = schema, multiLine = True,\
                    lineSep = "\r")
df = df.drop("review", "index", "app_name", "language")
df = df.withColumn("recommended",col("recommended").cast(IntegerType()))
df = df.na.drop()

Prepare data for Classification

In [5]:
from pyspark.ml.feature import VectorAssembler

cols = df.columns
cols.remove("recommended")
assembler = VectorAssembler(inputCols = cols, outputCol = "features")

data = assembler.transform(df)
data = data.select("features", "recommended")
data.show(truncate = False)

+-------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|features                                                                                                                                         |recommended|
+-------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|[292030.0,8.5185598E7,1.611381629E9,1.611381629E9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.6561199095369536E16,6.0,2.0,1909.0,1448.0,1909.0,1.61134336E9]   |1          |
|[292030.0,8.518525E7,1.61138103E9,1.61138103E9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.6561198949504112E16,30.0,10.0,2764.0,2743.0,2674.0,1.611386368E9]   |1          |
|[292030.0,8.5185111E7,1.6113808E9,1.6113808E9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.6561199090098992E16,5.0,1.0,1061.0,1061.0,1060.0,1.611383808E9]      |1          |
|[292030.0,8.5184605E7,1.61137997E9,1.61

In [6]:
train, test = data.randomSplit([0.9, 0.1])

In [7]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

t = time.time()

gbt = GBTClassifier(labelCol = "recommended", featuresCol = "features")
train.cache()
model = gbt.fit(train)
prediction_test = model.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='recommended',\
                                          metricName='accuracy')
evaluator.evaluate(prediction_test)
duration = (time.time() - t)
train.unpersist()
duration

KeyboardInterrupt: 