In [1]:
from pyspark import SparkConf,SparkContext
from pyspark import rdd
conf=SparkConf().setMaster("local[*]").setAppName("First_APP")
sc=SparkContext(conf=conf)

In [6]:
from pyspark.sql import SQLContext
sqlContext=SQLContext(sc)

In [8]:
file_content = sc.textFile('E:/machine_data/dataframe/ab.csv')
df = file_content.map(lambda x:x.split(','))
df.take(2)

[['40920', '8.326976', '0.953952', 'largeDoses'],
 ['14488', '7.153469', '1.673904', 'smallDoses']]

In [9]:
#将数据集转换成dataframe格式
dataset = sqlContext.createDataFrame(df, ['Mileage ', 'Gametime', 'Icecream', 'label'])
dataset.show(5, False)
dataset.printSchema


+--------+---------+--------+----------+
|Mileage |Gametime |Icecream|label     |
+--------+---------+--------+----------+
|40920   |8.326976 |0.953952|largeDoses|
|14488   |7.153469 |1.673904|smallDoses|
|26052   |1.441871 |0.805124|didntLike |
|75136   |13.147394|0.428964|didntLike |
|38344   |1.669788 |0.134296|didntLike |
+--------+---------+--------+----------+



<bound method DataFrame.printSchema of DataFrame[Mileage : string, Gametime: string, Icecream: string, label: string]>

In [11]:
#建立标签label的索引字典，将字符串型的label转换成数值型的label。
label_set = dataset.rdd.map(lambda x: x[3]).distinct().collect()
label_dict = dict()
i = 0
for key in label_set:
    if key not in label_dict.keys():
        label_dict[key ]= i
        i = i+1
label_dict

{'didntLike': 2, 'largeDoses': 0, 'smallDoses': 1}

In [14]:
data = dataset.rdd.map(lambda x: ([x[i] for i in range(3)], label_dict[x[3]])).\
               map(lambda kv: [int(kv[0][0]), float(kv[0][1]), float(kv[0][2]), kv[1]])
data = sqlContext.createDataFrame(data,  ['Mileage ', 'Gametime', 'Icecream', 'label'] )
data.show(5, False)
data.printSchema
#data.selectExpr('Mileage', 'Gametime', 'Icecream', 'label').show()


+--------+---------+--------+-----+
|Mileage |Gametime |Icecream|label|
+--------+---------+--------+-----+
|40920   |8.326976 |0.953952|0    |
|14488   |7.153469 |1.673904|1    |
|26052   |1.441871 |0.805124|2    |
|75136   |13.147394|0.428964|2    |
|38344   |1.669788 |0.134296|2    |
+--------+---------+--------+-----+



<bound method DataFrame.printSchema of DataFrame[Mileage : bigint, Gametime: double, Icecream: double, label: bigint]>

现在数据集已经符合我们的要求了，接下来就是建立模型了。在建立模型之前，我先对其进行标准化，然后用主成份分析（PCA）进行了降维，最后通过逻辑回归（logistic）模型进行分类和概率预测。具体实现代码如下：

In [16]:
from __future__ import print_function

# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler


# 将类别2和类别1合并，即Helen对男生的印象是要么有魅力要么没有魅力。
# 之所以合并，是因为pyspark.ml.classification.LogisticRegression目前仅支持二分类
feature_data = data.rdd.map(lambda x:(Vectors.dense([x[i] for i in range(0,3)]),float(1 if x[3]==2 else x[3])))
feature_data = sqlContext.createDataFrame(feature_data, ['features', 'labels'])
#feature_data.show()

train_data, test_data = feature_data.randomSplit([0.7, 0.3], 6)
#train.show()

scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures',
                            withStd=True, withMean=False)
pca = PCA(k=2, inputCol="scaledFeatures", outputCol="pcaFeatures")
lr = LogisticRegression(maxIter=10, featuresCol='pcaFeatures', labelCol='labels')


pipeline = Pipeline(stages=[scaler, pca, lr])

Model = pipeline.fit(train_data)
results = Model.transform(test_data)

results.select('probability', 'prediction', 'prediction').show(truncate=False)


+----------------------------+----------+----------+
|probability                 |prediction|prediction|
+----------------------------+----------+----------+
|[1.0,1.7947449461974263E-20]|0.0       |0.0       |
|[1.0,3.800215846574964E-25] |0.0       |0.0       |
+----------------------------+----------+----------+



In [17]:
type(results)

pyspark.sql.dataframe.DataFrame

In [20]:
print (results.show(2))

+--------------------+------+--------------------+--------------------+--------------------+--------------------+----------+
|            features|labels|      scaledFeatures|         pcaFeatures|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+----------+
|[26052.0,1.441871...|   1.0|[0.85677243949982...|[-0.0203392525195...|[45.4668389392915...|[1.0,1.7947449461...|       0.0|
|[38344.0,1.669788...|   1.0|[1.26101959236071...|[-0.9155042464058...|[56.2295694580017...|[1.0,3.8002158465...|       0.0|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+----------+

None


In [21]:
results.select('probability', 'labels', 'prediction').show(truncate=False)

+----------------------------+------+----------+
|probability                 |labels|prediction|
+----------------------------+------+----------+
|[1.0,1.7947449461974263E-20]|1.0   |0.0       |
|[1.0,3.800215846574964E-25] |1.0   |0.0       |
+----------------------------+------+----------+



In [27]:
#最后对模型进行简单的评估
from pyspark.mllib.evaluation import MulticlassMetrics
predictionAndLabels = results.select('probability', 'labels', 'prediction').rdd.map(lambda x: (x[1], x[2]))
metrics = MulticlassMetrics(predictionAndLabels)
metrics.confusionMatrix().toArray()

array([[ 0.]])