In [2]:
import pyspark
from pyspark.sql import SparkSession

# sparkml API

## 生成SparkSession实例

In [3]:
spark = SparkSession.builder \
     .master("local[*]") \
     .appName("Word Count") \
     .config("spark.some.config.option", "some-value") \
     .getOrCreate()

# examples & api

## sparkmlhw01
采用Datasets操作，实现WordCount实例，并且按照count值降序显示前50行数据，其中word转换成小写，去除标点符号，去除停用词，考查点：  
1）  spark读取文件  
2）  dataset转换操作、聚合操作  
重点在数据清洗，转成小写，去除标点、停用词等，这里需要自己自定义停用词集合 和 标点符号集合  
3）  dataset排序及显示  

In [6]:
df1 = spark.read.csv("file:///Users/luoyonggui/PycharmProjects/mayiexamples/sparkml/data/wordcount", sep='|', header=False)
type(df1)

pyspark.sql.dataframe.DataFrame

### show(n=20, truncate=True, vertical=False)

In [7]:
df1.show(5)

+---+--------------------+
|_c0|                 _c1|
+---+--------------------+
|  1|The Apache Hadoop...|
|  2|Apache Spark is a...|
|  3|Apache Storm is a...|
|  4|Below is a high-l...|
+---+--------------------+



In [8]:
df1.show(1, truncate=False)

+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0|_c1                                                                                                                                                                                                                                                                                                                                                                                                                                                

### withColumnRenamed(existing, new)

In [9]:
df1 = df1.withColumnRenamed('_c0','index').withColumnRenamed('_c1','content')

In [10]:
df1.dtypes

[('index', 'string'), ('content', 'string')]

### select(*cols)

In [11]:
df2 = df1.select('content')

In [12]:
df2.show(5)

+--------------------+
|             content|
+--------------------+
|The Apache Hadoop...|
|Apache Spark is a...|
|Apache Storm is a...|
|Below is a high-l...|
+--------------------+



### DataFrame转换rdd

In [13]:
rdd1 = df2.rdd.flatMap(lambda x:x[0].split())

### first()

In [14]:
rdd1.first()

'The'

### rdd1.take(num)

In [15]:
rdd1.take(5)

['The', 'Apache', 'Hadoop', 'software', 'library']

In [16]:
rdd2 = rdd1.map(lambda x:x.lower())\
    .filter(lambda x: x not in list(",.:!';"))\
    .filter(lambda x: x not in ["the","of","a","to","that","it","for","is","and"])\
    .map(lambda x:(x,1))
rdd2.take(5)

[('apache', 1),
 ('hadoop', 1),
 ('software', 1),
 ('library', 1),
 ('framework', 1)]

### rdd2.reduceByKey

In [17]:
# rdd3 = rdd2.reduceByKey(lambda a,b:a+b)
# rdd3.take(5)

In [18]:
from operator import add
rdd3 = rdd2.reduceByKey(add)
rdd3.take(5)

[('apache', 4),
 ('hadoop', 2),
 ('software', 1),
 ('library', 2),
 ('framework', 1)]

### rdd转DataFrame

In [19]:
# df3 = spark.createDataFrame(rdd3)
# df3.show(5)

In [20]:
df3 = rdd3.toDF()
df3.show(5)

+---------+---+
|       _1| _2|
+---------+---+
|   apache|  4|
|   hadoop|  2|
| software|  1|
|  library|  2|
|framework|  1|
+---------+---+
only showing top 5 rows



### sort(*cols, **kwargs)

In [24]:
df3.sort(df3['_2'].desc()).show()

+-----------+---+
|         _1| _2|
+-----------+---+
|     apache|  4|
|      storm|  3|
|      spark|  3|
|     hadoop|  2|
|    library|  2|
|distributed|  2|
|       data|  2|
|   designed|  2|
|computation|  2|
| processing|  2|
|    system.|  2|
|programming|  2|
|        sql|  2|
|       each|  2|
|         on|  2|
|    cluster|  2|
|         be|  2|
| high-level|  2|
|         in|  2|
|   supports|  2|
+-----------+---+
only showing top 20 rows



## sparkmlhw02
采用ML Pipelines构建一个文档分类器，需要将模型进行保存，并且加载模型后对测试样本进行预测，考查点：

1）  spark读取文件

2）  数据清洗，考查Datasets的基本操作

3）  构建分类器的管道，考查构建各种转换操作

4）  读取模型，读取测试数据，并且进行模型测试

 

数据格式：

myapp_id|typenameid|typename|myapp_word|myapp_word_all

 

其中文档ID字段为：myapp_id

其中文档类别字段为：typenameid

其中文档内容为：myapp_word_all

In [21]:
df1 = spark.read.csv("file:///Users/luoyonggui/PycharmProjects/mayiexamples/sparkml/data/doc_class.dat", sep='|', header=True)
df1.show(5)

+--------+----------+--------+--------------------+--------------------+
|myapp_id|typenameid|typename|          myapp_word|      myapp_word_all|
+--------+----------+--------+--------------------+--------------------+
| 1376533|         2|  action|game, android, world|game, android, wo...|
| 1376542|         2|  action|                game|game, app, enjoy,...|
| 1376603|         2|  action|run, tap, collect...|run, tap, collect...|
| 1376792|         2|  action|                 run|run, ath, game, m...|
| 1376941|         2|  action|fight, game, play...|fight, game, play...|
+--------+----------+--------+--------------------+--------------------+
only showing top 5 rows



### count() 
Returns the number of rows in this :class:`DataFrame`.

In [22]:
df1.count()

334500

### dtypes

In [23]:
df1.dtypes

[('myapp_id', 'string'),
 ('typenameid', 'string'),
 ('typename', 'string'),
 ('myapp_word', 'string'),
 ('myapp_word_all', 'string')]

### columns

In [24]:
df1.columns

['myapp_id', 'typenameid', 'typename', 'myapp_word', 'myapp_word_all']

### checkpoint()

In [25]:
spark.sparkContext.setCheckpointDir('CheckpointDir/tt123')

In [26]:
df1.checkpoint()

DataFrame[myapp_id: string, typenameid: string, typename: string, myapp_word: string, myapp_word_all: string]

### 强制类型转换

In [27]:
df1 = df1.withColumnRenamed('typenameid','label')
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
df1 = df1.withColumn('label', col('label').cast(IntegerType()))

In [28]:
df1.dtypes

[('myapp_id', 'string'),
 ('label', 'int'),
 ('typename', 'string'),
 ('myapp_word', 'string'),
 ('myapp_word_all', 'string')]

### 切分训练集和测试集,会先打乱数据集

In [29]:
#切分训练集和测试集,会先打乱数据集
train_set, test_set = df1.randomSplit([0.9,0.1])

In [30]:
train_set.count()

301005

In [31]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vector

### 模型训练Pipeline  LogisticRegression HashingTF, Tokenizer

In [36]:
from time import time
start = time()
tokenizer = Tokenizer(inputCol='myapp_word_all', outputCol='words')
hashingTF = HashingTF(numFeatures=1000, inputCol='words', outputCol='features')

lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
model = pipeline.fit(train_set)
print(f'{time()-start}s')

17.490482807159424s


### 模型预测

In [37]:
p = model.transform(test_set)
p.select("myapp_id","label","typename","probability","prediction").show()

+--------+-----+-----------------+--------------------+----------+
|myapp_id|label|         typename|         probability|prediction|
+--------+-----+-----------------+--------------------+----------+
| 1376501|    4|           arcade|[4.11158459184166...|       4.0|
| 1376523|   14|           casual|[2.94084062076931...|      14.0|
| 1376606|    4|           arcade|[6.08513932347101...|       4.0|
| 1376635|    4|           arcade|[6.08513932347101...|       4.0|
| 1376656|   16|    communication|[1.78901353628456...|      16.0|
| 1376782|   14|           casual|[1.46631980338166...|      14.0|
| 1377005|   16|    communication|[1.78901353628456...|      16.0|
| 1377059|    4|           arcade|[6.08513932347101...|       4.0|
| 1377153|   16|    communication|[1.78901353628456...|      16.0|
| 1377192|    7|books & reference|[1.21296495436360...|       7.0|
| 1377195|    4|           arcade|[5.37814702677680...|       4.0|
| 1377499|   16|    communication|[1.78901353628456...|      1

### 模型保存和加载

In [38]:
#模型保存和加载
model.save('file:///tmp/testModel')

In [39]:
!ls /tmp/testModel

[1m[36mmetadata[m[m [1m[36mstages[m[m


In [40]:
! open /tmp/testModel

In [41]:
savedModel = PipelineModel.load('file:///tmp/testModel')

In [42]:
p = savedModel.transform(test_set)
p.select("myapp_id","label","typename","probability","prediction").show(5)

+--------+-----+-------------+--------------------+----------+
|myapp_id|label|     typename|         probability|prediction|
+--------+-----+-------------+--------------------+----------+
| 1376501|    4|       arcade|[4.11158459184166...|       4.0|
| 1376523|   14|       casual|[2.94084062076931...|      14.0|
| 1376606|    4|       arcade|[6.08513932347101...|       4.0|
| 1376635|    4|       arcade|[6.08513932347101...|       4.0|
| 1376656|   16|communication|[1.78901353628456...|      16.0|
+--------+-----+-------------+--------------------+----------+
only showing top 5 rows



### pipeline保存和加载

In [43]:
pipeline.save('file:///tmp/pp')

In [46]:
!ls /tmp/pp

[1m[36mmetadata[m[m [1m[36mstages[m[m


In [48]:
pp = Pipeline.load('file:///tmp/pp')

## sparkmlhw03
第3次课的作业和第2次的课一样，只是需要采用交叉验证的方法来做，并且还要计算各种分类指标，这里要分2次实验，第1次设计1个2元分类器，第2次设计1个多元分类器。

 

采用ML Pipelines构建一个文档分类器，需要将模型进行保存，并且加载模型后对测试样本进行预测，考查点：

1）  spark读取文件

2）  数据清洗，考查Datasets的基本操作

3）  构建分类器的管道，考查构建各种转换操作

4）  读取模型，读取测试数据，并且进行模型测试

5）  重点：自己设置交叉验证的网格参数，采用交叉验证的模型来做

6）  重点：计算分类结果的指标

 

数据格式：

myapp_id|typenameid|typename|myapp_word|myapp_word_all

 

其中文档ID字段为：myapp_id

其中文档类别字段为：typenameid

其中文档内容为：myapp_word_all

In [51]:
df1 = spark.read.csv("file:///Users/luoyonggui/PycharmProjects/mayiexamples/sparkml/data/doc_class.dat", sep='|', header=True)
df1.show(5)

+--------+----------+--------+--------------------+--------------------+
|myapp_id|typenameid|typename|          myapp_word|      myapp_word_all|
+--------+----------+--------+--------------------+--------------------+
| 1376533|         2|  action|game, android, world|game, android, wo...|
| 1376542|         2|  action|                game|game, app, enjoy,...|
| 1376603|         2|  action|run, tap, collect...|run, tap, collect...|
| 1376792|         2|  action|                 run|run, ath, game, m...|
| 1376941|         2|  action|fight, game, play...|fight, game, play...|
+--------+----------+--------+--------------------+--------------------+
only showing top 5 rows



### collect()
Returns all the records as a list of :class:`Row`.

In [57]:
df1.collect()[:5]

[Row(myapp_id='1376533', typenameid='2', typename='action', myapp_word='game, android, world', myapp_word_all='game, android, world, control, devic, experi, free, gameplay, play, screen, time, touch, war, action, addict, app, ath, attack, battl, challeng, collect, complet, descript, easi, enemi, enjoy, featur, fight, find, friend, fun, gamec, graphic, great, gun, high, kill, level, make, mission, mode, move, player, power, read, real, run, score, shoot, shooter, simpl, skill, sound, special, surviv, tap, uniqu, upgrad, weapon, zombi'),
 Row(myapp_id='1376542', typenameid='2', typename='action', myapp_word='game', myapp_word_all='game, app, enjoy, free, high, play, run, action, addict, android, ath, attack, battl, challeng, collect, complet, control, descript, devic, easi, enemi, experi, featur, fight, find, friend, fun, gamec, gameplay, graphic, great, gun, kill, level, make, mission, mode, move, player, power, read, real, score, screen, shoot, shooter, simpl, skill, sound, special, su

### distinct()

In [52]:
df1.select('typenameid').distinct().count()

46

### groupby()

In [58]:
df1.groupBy('typenameid').agg({'myapp_id': 'count'}).collect()

[Row(typenameid='7', count(myapp_id)=13122),
 Row(typenameid='15', count(myapp_id)=1785),
 Row(typenameid='11', count(myapp_id)=3828),
 Row(typenameid='29', count(myapp_id)=11588),
 Row(typenameid='42', count(myapp_id)=1439),
 Row(typenameid='3', count(myapp_id)=2181),
 Row(typenameid='30', count(myapp_id)=10974),
 Row(typenameid='34', count(myapp_id)=15295),
 Row(typenameid='8', count(myapp_id)=245),
 Row(typenameid='22', count(myapp_id)=10084),
 Row(typenameid='28', count(myapp_id)=587),
 Row(typenameid='16', count(myapp_id)=9981),
 Row(typenameid='35', count(myapp_id)=3148),
 Row(typenameid='47', count(myapp_id)=2582),
 Row(typenameid='43', count(myapp_id)=26346),
 Row(typenameid='5', count(myapp_id)=213),
 Row(typenameid='31', count(myapp_id)=17319),
 Row(typenameid='18', count(myapp_id)=2782),
 Row(typenameid='27', count(myapp_id)=5766),
 Row(typenameid='17', count(myapp_id)=17553),
 Row(typenameid='26', count(myapp_id)=6107),
 Row(typenameid='46', count(myapp_id)=1444),
 Row(type

In [53]:
tokenizer = Tokenizer(inputCol='myapp_word_all', outputCol='words')
hashingTF = HashingTF(inputCol='words', outputCol='features')
df2 = df1.withColumnRenamed('typenameid','label').withColumn('label', col('label').cast(IntegerType()))
#切分训练集和测试集,会先打乱数据集
train_set, test_set = df2.randomSplit([0.9,0.1])
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

### 网格调参grid

In [59]:
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures,[10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()

### 二元分类

In [60]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()

### 交叉验证

In [61]:
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=2)

In [62]:
start = time()
cvModel = cv.fit(train_set)
print(time()-start)

175.16829204559326


In [64]:
p = cvModel.transform(test_set)
p.select(['label', 'rawPrediction', 'probability', 'prediction']).show()

+-----+--------------------+--------------------+----------+
|label|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+----------+
|   16|[-7.4650506355219...|[3.28917268712459...|      43.0|
|    2|[-7.4655121275154...|[3.76707131453514...|      31.0|
|   16|[-7.4650506355219...|[3.28917268712459...|      43.0|
|    4|[-7.4650726649511...|[3.09555661679601...|      43.0|
|   14|[-7.4646128253141...|[3.53625473567110...|      43.0|
|    4|[-7.4650726649511...|[3.09555661679601...|      43.0|
|   11|[-7.4654049429803...|[3.73730792079566...|      29.0|
|    4|[-7.4650726649511...|[3.09555661679601...|      43.0|
|   11|[-7.4654049429803...|[3.73730792079566...|      29.0|
|   16|[-7.4650400569118...|[3.49055120567496...|      43.0|
|    2|[-7.4655227447676...|[3.78054958781725...|      31.0|
|    4|[-7.4650889276234...|[2.81868035613385...|       4.0|
|   14|[-7.4645965626417...|[3.47603574364158...|      14.0|
|   17|[-7.4650890332807

In [70]:
evaluator.evaluate(p)

1.0

### 多元分类

In [66]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator1 = MulticlassClassificationEvaluator()

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator2 = MulticlassClassificationEvaluator(metricName='accuracy')

In [67]:
cv2 = CrossValidator(estimator=pipeline, evaluator=evaluator1, estimatorParamMaps=paramGrid, numFolds=2)

cv3 = CrossValidator(estimator=pipeline, evaluator=evaluator2, estimatorParamMaps=paramGrid, numFolds=2)

In [68]:
start = time()
cvModel2 = cv2.fit(train_set)
p2 = cvModel2.transform(test_set)
p2.show()
print(time()-start)

+--------+-----+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|myapp_id|label|         typename|          myapp_word|      myapp_word_all|               words|            features|       rawPrediction|         probability|prediction|
+--------+-----+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| 1376505|   16|    communication|sms, messag, free...|sms, messag, free...|[sms,, messag,, f...|(1000,[39,52,57,5...|[-7.4647759316632...|[1.65150546698479...|      16.0|
| 1376533|    2|           action|game, android, world|game, android, wo...|[game,, android,,...|(1000,[10,39,40,4...|[-7.4647958040277...|[1.07198843263364...|       2.0|
| 1376604|   16|    communication|app, phone, devic...|app, phone, devic...|[app,, phone,, de...|(1000,[39,52,57,5...|[-7.4647759316632...|[

In [69]:
start = time()
cvModel3 = cv3.fit(train_set)
p3 = cvModel3.transform(test_set)
p3.show()
print(time()-start)

+--------+-----+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|myapp_id|label|         typename|          myapp_word|      myapp_word_all|               words|            features|       rawPrediction|         probability|prediction|
+--------+-----+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| 1376505|   16|    communication|sms, messag, free...|sms, messag, free...|[sms,, messag,, f...|(1000,[39,52,57,5...|[-7.4647759316632...|[1.65150546698479...|      16.0|
| 1376533|    2|           action|game, android, world|game, android, wo...|[game,, android,,...|(1000,[10,39,40,4...|[-7.4647958040277...|[1.07198843263364...|       2.0|
| 1376604|   16|    communication|app, phone, devic...|app, phone, devic...|[app,, phone,, de...|(1000,[39,52,57,5...|[-7.4647759316632...|[