## sparkmlhw02

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Create the SparkSession for this notebook (or reuse one that already exists):
# local mode, using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("sparkmlhw02")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [4]:
# Load the pipe-delimited file; its first line holds the column names.
# No schema is supplied, so every column is read as a string.
df1 = (
    spark.read
    .options(sep='|', header=True)
    .csv("file:///home/ian/code/data/sparkml/doc_class.dat")
)
# Preview the first five rows (long values are truncated in the display).
df1.show(n=5)

+--------+----------+--------+--------------------+--------------------+
|myapp_id|typenameid|typename|          myapp_word|      myapp_word_all|
+--------+----------+--------+--------------------+--------------------+
| 1376533|         2|  action|game, android, world|game, android, wo...|
| 1376542|         2|  action|                game|game, app, enjoy,...|
| 1376603|         2|  action|run, tap, collect...|run, tap, collect...|
| 1376792|         2|  action|                 run|run, ath, game, m...|
| 1376941|         2|  action|fight, game, play...|fight, game, play...|
+--------+----------+--------+--------------------+--------------------+
only showing top 5 rows



In [5]:
# Fetch the first row as a Row object to inspect the full, untruncated values.
df1.first()

Row(myapp_id='1376533', typenameid='2', typename='action', myapp_word='game, android, world', myapp_word_all='game, android, world, control, devic, experi, free, gameplay, play, screen, time, touch, war, action, addict, app, ath, attack, battl, challeng, collect, complet, descript, easi, enemi, enjoy, featur, fight, find, friend, fun, gamec, graphic, great, gun, high, kill, level, make, mission, mode, move, player, power, read, real, run, score, shoot, shooter, simpl, skill, sound, special, surviv, tap, uniqu, upgrad, weapon, zombi')

In [8]:
# Total number of rows loaded from the file.
df1.count()

334500

In [10]:
# (column name, type) pairs; at this point every column is still a string.
df1.dtypes

[('myapp_id', 'string'),
 ('typenameid', 'string'),
 ('typename', 'string'),
 ('myapp_word', 'string'),
 ('myapp_word_all', 'string')]

In [11]:
# Column names, in order.
df1.columns

['myapp_id', 'typenameid', 'typename', 'myapp_word', 'myapp_word_all']

In [12]:
# Print the physical plan Spark will use to produce df1 (here a plain CSV file scan).
df1.explain()

== Physical Plan ==
*(1) FileScan csv [myapp_id#52,typenameid#53,typename#54,myapp_word#55,myapp_word_all#56] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/ian/code/data/sparkml/doc_class.dat], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<myapp_id:string,typenameid:string,typename:string,myapp_word:string,myapp_word_all:string>


In [16]:
# Directory where checkpointed data is written; it must be set before DataFrame.checkpoint() is called.
spark.sparkContext.setCheckpointDir('/tt123')  # alternative HDFS location: hdfs://h1:9200/ttl23

In [17]:
# DataFrame.checkpoint() does not modify df1 in place: it returns a NEW
# DataFrame backed by the checkpointed data. Rebind the name so that later
# cells actually use the checkpointed version instead of the original lineage.
df1 = df1.checkpoint()
df1

DataFrame[myapp_id: string, typenameid: string, typename: string, myapp_word: string, myapp_word_all: string]

In [31]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Expose the class id as an integer column named `label`: rename
# `typenameid` in place (keeping its column position), then cast the
# string values to integers.
df1 = (
    df1
    .withColumnRenamed('typenameid', 'label')
    .withColumn('label', col('label').cast(IntegerType()))
)

In [32]:
# Split into training (~90%) and test (~10%) sets. randomSplit assigns each
# row to a split at random (the weights are approximate, not exact), so a
# fixed seed is passed to make the split repeatable when the notebook is
# re-run on the same input.
train_set, test_set = df1.randomSplit([0.9, 0.1], seed=42)

In [33]:
# Number of rows that landed in the training split.
train_set.count()

301051

In [34]:
# Preview five training rows; `label` now appears in place of `typenameid`.
train_set.show(5)

+--------+-----+--------+--------------------+--------------------+
|myapp_id|label|typename|          myapp_word|      myapp_word_all|
+--------+-----+--------+--------------------+--------------------+
| 1376501|    4|  arcade|level, game, app,...|level, game, app,...|
| 1376513|    4|  arcade|           app, make|app, make, game, ...|
| 1376533|    2|  action|game, android, world|game, android, wo...|
| 1376542|    2|  action|                game|game, app, enjoy,...|
| 1376565|    4|  arcade|          game, athi|game, athi, app, ...|
+--------+-----+--------+--------------------+--------------------+
only showing top 5 rows



In [1]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, RegexTokenizer, Tokenizer
from pyspark.ml.linalg import Vector
from pyspark.sql import Row

In [35]:
%%time
tokenizer = Tokenizer(inputCol='myapp_word_all', outputCol='words')
hashingTF = HashingTF(numFeatures=1000, inputCol='words', outputCol='features')

lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
model = pipeline.fit(train_set)

CPU times: user 6 µs, sys: 2 µs, total: 8 µs
Wall time: 15.3 µs


In [36]:
# Run the fitted pipeline on the held-out split, then display the
# identifying columns next to the predicted class and class probabilities.
p = model.transform(test_set)
p.select(
    "myapp_id",
    "label",
    "typename",
    "probability",
    "prediction",
).show()

+--------+-----+--------+--------------------+----------+
|myapp_id|label|typename|         probability|prediction|
+--------+-----+--------+--------------------+----------+
| 1376490|    4|  arcade|[6.67411865258183...|       4.0|
| 1376579|    4|  arcade|[5.90032797772139...|       4.0|
| 1376603|    2|  action|[2.05749718404754...|       2.0|
| 1376684|    4|  arcade|[6.67411865258183...|       4.0|
| 1376912|    4|  arcade|[6.67411865258183...|       4.0|
| 1376983|    4|  arcade|[6.67411865258183...|       4.0|
| 1377059|    4|  arcade|[6.67411865258183...|       4.0|
| 1377084|    4|  arcade|[6.67411865258183...|       4.0|
| 1377104|    4|  arcade|[6.67411865258183...|       4.0|
| 1377220|    2|  action|[2.05749718404754...|       2.0|
| 1377321|    2|  action|[2.05749718404754...|       2.0|
| 1377943|    4|  arcade|[5.90032797772139...|       4.0|
| 1378422|    4|  arcade|[6.67411865258183...|       4.0|
| 1378849|    4|  arcade|[6.67411865258183...|       4.0|
| 1379087|    

In [37]:
# Save and reload both the fitted model and the unfitted pipeline.
# A plain .save(path) raises if the path already exists, which makes this
# cell fail on every re-run; write().overwrite() replaces the previous copy.
model.write().overwrite().save('file:///tmp/testModel')
savedModel = PipelineModel.load('file:///tmp/testModel')
pipeline.write().overwrite().save('file:///tmp/pp')
pp = Pipeline.load('file:///tmp/pp')