# 录屏代码

以下代码与“数值特征”录屏中使用的代码相同。请依次运行每个代码单元以查看其输出。

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import re

In [None]:
# Build (or reuse) the notebook's SparkSession; the screencast skipped this step.
# The chain is wrapped in parentheses instead of using backslash continuations,
# so a stray trailing space cannot break it.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

# 如何读取数据集

In [None]:
# Path of the JSON dataset that the next cell loads.
stack_overflow_data = "Train_onetag_small.json"

In [None]:
# Load the JSON file named by stack_overflow_data into a Spark DataFrame.
df = spark.read.json(stack_overflow_data)

In [None]:
# Display the first record as a quick check that the file was loaded.
df.head()

# 分词

把字符串拆分为单独的单词。Spark 有一个 [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer) 类以及 RegexTokenizer，后者在分词时有更大的自由度。

In [None]:
# Tokenize the post body into a new "words" column. The pattern matches
# non-word characters, which serve as the separators between tokens.
regexTokenizer = RegexTokenizer(
    inputCol="Body",
    outputCol="words",
    pattern=r"\W",
)
df = regexTokenizer.transform(df)
df.head()

In [None]:
# Add a per-post word count: BodyLength is the number of tokens in "words".
body_length = udf(lambda tokens: len(tokens), returnType=IntegerType())
df = df.withColumn("BodyLength", body_length(df["words"]))

In [None]:
# Build UDFs that count closing HTML tags in a post body: each "</p>" is
# counted as one paragraph and each "</a>" as one link.
def _tag_counter(closing_tag):
    """Return an integer UDF counting matches of *closing_tag* in a string."""
    return udf(lambda html: len(re.findall(closing_tag, html)), IntegerType())


number_of_paragraphs = _tag_counter("</p>")
number_of_links = _tag_counter("</a>")

In [None]:
# Attach both tag counts, each computed from the Body column, as new columns.
df = (
    df
    .withColumn("NumParagraphs", number_of_paragraphs(df["Body"]))
    .withColumn("NumLinks", number_of_links(df["Body"]))
)

In [None]:
# Display the first two records, which now include the new count columns.
df.head(2)

# VectorAssembler

将内容长度、段落数和内容中的链接数合并为一个向量。

In [None]:
# Pack the three numeric columns into a single vector column, NumFeatures,
# in the order listed.
assembler = VectorAssembler(
    inputCols=["BodyLength", "NumParagraphs", "NumLinks"],
    outputCol="NumFeatures",
)
df = assembler.transform(df)

In [None]:
# Display the first record, which now carries the assembled NumFeatures vector.
df.head()

# 归一化向量

In [None]:
# Normalizer works row by row, rescaling each NumFeatures vector to unit norm
# (Spark's default is p=2). It needs no fit step, so transform() is called
# directly on the DataFrame.
scaler = Normalizer(
    inputCol="NumFeatures",
    outputCol="ScaledNumFeatures",
)
df = scaler.transform(df)

In [1]:
# Display the first two records with the normalized vectors.
# NOTE: this cell relies on `df` from the earlier cells; running it in a fresh
# kernel raises NameError because `df` has not been defined yet.
df.head(2)

NameError: name 'df' is not defined

# 缩放向量

In [None]:
# StandardScaler is fitted before use: fit() computes its statistics from df,
# and the fitted model then writes the rescaled vectors to ScaledNumFeatures2.
scaler2 = StandardScaler(
    inputCol="NumFeatures",
    outputCol="ScaledNumFeatures2",
    withStd=True,
)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)

In [None]:
# Display the first two records to compare the two scaled vector columns.
df.head(2)