In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('homework')
    .getOrCreate()
)

In [5]:
# Load the cruise-ship CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
datas = spark.read.csv(
    'cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Check the inferred column names and types before choosing features.
datas.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [7]:
# Look at the first row to see what actual values look like.
datas.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)]

根据上面的数据类型，可以简单地划分出特征列（自变量）和预测列（标签）。

特征：Age、Tonnage、passengers、length、cabins、passenger_density，以及 Cruise_line（字符串类型，需要先转换为数值）

预测目标：crew

In [8]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler, StringIndexer

从客户的反馈中得知，`Cruise_line` 对船员数量的影响比较大，因此需要把这个字段也作为特征。它是字符串类型，这里用 `StringIndexer` 按类似枚举的方式把每个取值映射为一个数值索引，生成新的一列并加入到原来的数据集中。

In [9]:
# Map each distinct Cruise_line string to a numeric index stored in a new column.
# NOTE: the output column is spelled 'curise_index' (sic); the VectorAssembler
# cell further down reads that exact name, so it is kept as-is here.
cruise_indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol="curise_index",
)

In [10]:
# Fit the indexer on the full dataset so it learns the set of Cruise_line labels.
cruise_indexer_model = cruise_indexer.fit(datas)

In [11]:
# Apply the fitted indexer; the result is a new DataFrame with the extra index column.
new_datas = cruise_indexer_model.transform(datas)

In [12]:
# Confirm the index column was appended and is numeric (double).
new_datas.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- curise_index: double (nullable = true)



In [13]:
# Inspect the first row, now including its cruise-line index value.
new_datas.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, curise_index=16.0)]

In [14]:
# Print the leading rows as a text table to compare Cruise_line against its index.
new_datas.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|curise_index|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|        16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|        16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|         1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|         1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|         1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|         1.0|
|    Elati

下面进行数据规整：把各个特征列合并成一个向量列 `features`。

In [15]:
# Combine the indexed cruise line and the numeric ship attributes into a single
# vector column named "features". The label column 'crew' is deliberately not included.
assember = VectorAssembler(
    inputCols=[
        'curise_index',
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
    ],
    outputCol="features",
)

In [16]:
# Run the assembler over the indexed data to add the "features" vector column.
output = assember.transform(new_datas)

In [17]:
# Check the first row to verify the assembled feature vector.
output.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, curise_index=16.0, features=DenseVector([16.0, 6.0, 30.277, 6.94, 5.94, 3.55, 42.64]))]

In [18]:
# Keep only the two columns the regression needs: the feature vector and the label.
final_datas = output.select('features', 'crew')

In [19]:
# Preview the modelling dataset (feature vector alongside the crew label).
final_datas.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



In [20]:
# Split the rows roughly 70/30 into training and test sets.
# The seed is fixed so that Restart & Run All reproduces the same split, and
# therefore the same row counts and evaluation metrics in the cells that follow.
train_data, test_data = final_datas.randomSplit([0.7, 0.3], seed=42)

In [22]:
# Number of rows that landed in the training split.
train_data.count()

114

In [23]:
# Summary statistics of the training split (only the numeric 'crew' label is summarised).
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               114|
|   mean|7.6291228070175485|
| stddev|3.6090386927366014|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [24]:
# Configure a linear regression that predicts 'crew' from the assembled feature vector.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [25]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [26]:
# Evaluate the fitted model on the held-out test split.
# NOTE: the variable name is misspelled ("resluts"); later cells use this exact name.
test_resluts = lr_model.evaluate(test_data)

In [27]:
# R² on the test split.
test_resluts.r2

0.9152372763534818

In [29]:
# Show the per-row residuals on the test split to spot large individual errors.
test_resluts.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-0.13331326195602067|
| -1.0191295271036989|
|  0.9719881132749766|
| 0.49474877473085677|
|  0.9339734508391526|
| -1.0173541365609502|
| 0.38150880549111577|
|  0.6111098676607192|
| -0.6988318018289572|
|  0.7330568332822125|
|  0.7385464524663146|
|  0.5562112044334642|
| -0.6573922663653793|
|   1.109198206749074|
|-0.06916058301660044|
|-0.06328635133681715|
|-0.04586535624803112|
| 0.27464976160629107|
|  0.3169459161037089|
| 0.18229117371450165|
+--------------------+
only showing top 20 rows



从测试集上的 R² 和残差来看，模型还是比较准的。