In [15]:
import os

import findspark

# Locate the Spark installation before pyspark is imported.
# Prefer the SPARK_HOME environment variable so the notebook is not tied to
# one machine's directory layout; fall back to the original install path
# when the variable is not set.
findspark.init(os.environ.get("SPARK_HOME", "/home/ubuntu/spark-2.1.1-bin-hadoop2.7"))
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [16]:
# Get (or reuse) the Spark session that every later cell reads through.
spark = (
    SparkSession.builder
    .appName("lRProject")
    .getOrCreate()
)

In [17]:
# Load the cruise-ship CSV; the first row supplies column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [18]:
# Show the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [19]:
# Encode the string-valued Cruise_line column as a numeric index column so it
# can be placed in the feature vector later on.
indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="Cruise_line_index",
)

In [20]:
# Learn the label -> index mapping from the data, then apply it to the same
# frame to append the Cruise_line_index column.
indexer_model = indexer.fit(df)
df_mod = indexer_model.transform(df)

In [21]:
# Confirm that the indexer appended the Cruise_line_index column.
df_mod.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- Cruise_line_index: double (nullable = true)



In [22]:
# List the columns of the indexed frame (candidates for model features).
df_mod.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_line_index']

# Input columns for the regression. 'crew' is left out because it is the
# label, and the raw string columns are left out in favour of the numeric
# Cruise_line_index.
feature_list = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'Cruise_line_index',
]

In [28]:
# Pack the selected feature columns into a single vector column, then keep
# only that vector and the 'crew' label for modelling.
assembler = VectorAssembler(
    inputCols=feature_list,
    outputCol="feature_index",
)
df_assembled = assembler.transform(df_mod)
df_features = df_assembled.select("feature_index", "crew")

In [31]:
# Check that only the assembled feature vector and the crew label remain.
df_features.printSchema()

root
 |-- feature_index: vector (nullable = true)
 |-- crew: double (nullable = true)



In [33]:
# 70/30 train/test split. The seed is fixed so that re-running the notebook
# yields the same split instead of a different random one each time; without
# it the fitted model and the reported metrics change from run to run.
train_df, test_df = df_features.randomSplit([0.7, 0.3], seed=42)

In [35]:
# Configure the regression: predict 'crew' from the assembled feature vector.
lr = LinearRegression(
    featuresCol="feature_index",
    labelCol="crew",
)

# Fit on the training split only.
model = lr.fit(train_df)

# Score the fitted model against the held-out test split.
test_result = model.evaluate(test_df)

In [37]:
# Root-mean-squared error of the model on the held-out test split.
test_result.rootMeanSquaredError

0.7017776063687354

In [38]:
# R-squared of the model on the held-out test split.
test_result.r2

0.9499336846297265

In [39]:
# Fitted weights, one per entry of feature_list (the assembler's input order).
# NOTE(review): the last weight applies to Cruise_line_index, an index derived
# from a categorical column — confirm a linear effect on it is meaningful.
model.coefficients

DenseVector([-0.0087, 0.0089, -0.1375, 0.4384, 0.8266, -0.0029, 0.0496])

In [40]:
# Fitted intercept term of the linear model.
model.intercept

-1.1080921204901595