In [1]:
from pyspark.sql import SparkSession

In [11]:
from pyspark.sql.functions import countDistinct

In [2]:
spark = SparkSession.builder.appName('dupa').getOrCreate()

In [16]:
data = spark.read.csv(r"C:\Users\Asus\Desktop\studia big data\Python-and-Spark-for-Big-Data-master\Spark_for_Machine_Learning\Logistic_Regression\titanic.csv", inferSchema=True, header= True)

In [28]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [40]:
data = data.na.drop()

In [41]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [42]:
data.printSchema

<bound method DataFrame.printSchema of DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]>

In [43]:
data.head(5)

[Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=7, Survived=0, Pclass=1, Name='McCarthy, Mr. Timothy J', Sex='male', Age=54.0, SibSp=0, Parch=0, Ticket='17463', Fare=51.8625, Cabin='E46', Embarked='S'),
 Row(PassengerId=11, Survived=1, Pclass=3, Name='Sandstrom, Miss. Marguerite Rut', Sex='female', Age=4.0, SibSp=1, Parch=1, Ticket='PP 9549', Fare=16.7, Cabin='G6', Embarked='S'),
 Row(PassengerId=12, Survived=1, Pclass=1, Name='Bonnell, Miss. Elizabeth', Sex='female', Age=58.0, SibSp=0, Parch=0, Ticket='113783', Fare=26.55, Cabin='C103', Embarked='S')]

In [44]:
sex_ind = StringIndexer(inputCol='Sex', outputCol='sex_ind')
embarked_ind = StringIndexer(inputCol='Embarked', outputCol='embarked_ind')

In [45]:
out_sex = OneHotEncoder(inputCol='sex_ind', outputCol='sex_enc')
out_embarked = OneHotEncoder(inputCol='embarked_ind', outputCol='embarked_enc')

In [46]:
assembler = VectorAssembler(inputCols=['Pclass', 'sex_enc', 'embarked_enc', 'Age', 'SibSp', 'Parch', 'Fare'], outputCol='features')

In [47]:
from pyspark.ml import Pipeline

In [48]:
from pyspark.ml.classification import LogisticRegression

In [49]:
lr = LogisticRegression(labelCol='Survived', featuresCol='features')

In [50]:
pipeline = Pipeline(stages=[sex_ind, embarked_ind, out_sex, out_embarked, assembler, lr])

In [53]:
train_data, test_data = data.randomSplit([0.7, 0.3])

In [54]:
model = pipeline.fit(train_data)

In [55]:
model_test = model.transform(test_data)

In [58]:
model_test.select('prediction', 'Survived').show()

+----------+--------+
|prediction|Survived|
+----------+--------+
|       1.0|       1|
|       0.0|       0|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       0.0|       0|
|       0.0|       0|
|       1.0|       0|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       0|
|       0.0|       0|
|       0.0|       0|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       1|
|       1.0|       0|
+----------+--------+
only showing top 20 rows



In [60]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [61]:
pred = BinaryClassificationEvaluator(labelCol='Survived', rawPredictionCol='prediction')

In [62]:
auc = pred.evaluate(model_test)

In [64]:
data.head()

Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C')