## PySpakr TitanicJob

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, avg

In [2]:
spark = SparkSession.builder.appName('PySparkTitanicJob').getOrCreate()

In [3]:
spark

In [4]:
df = spark.read.option('header', 'true').csv('_data/train.csv')

In [5]:
df.dtypes

[('PassengerId', 'string'),
 ('Survived', 'string'),
 ('Pclass', 'string'),
 ('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'string'),
 ('SibSp', 'string'),
 ('Parch', 'string'),
 ('Ticket', 'string'),
 ('Fare', 'string'),
 ('Cabin', 'string'),
 ('Embarked', 'string')]

In [6]:
df.count()

891

In [7]:
df.describe(['Sex', 'Age']).show()

+-------+------+------------------+
|summary|   Sex|               Age|
+-------+------+------------------+
|  count|   891|               714|
|   mean|  null| 29.69911764705882|
| stddev|  null|14.526497332334035|
|    min|female|              0.42|
|    max|  male|                 9|
+-------+------+------------------+



In [8]:
df = df.drop('Ticket', 'Name', 'Fare', 'Cabin')

In [9]:
df.show()

+-----------+--------+------+------+----+-----+-----+--------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|Embarked|
+-----------+--------+------+------+----+-----+-----+--------+
|          1|       0|     3|  male|  22|    1|    0|       S|
|          2|       1|     1|female|  38|    1|    0|       C|
|          3|       1|     3|female|  26|    0|    0|       S|
|          4|       1|     1|female|  35|    1|    0|       S|
|          5|       0|     3|  male|  35|    0|    0|       S|
|          6|       0|     3|  male|null|    0|    0|       Q|
|          7|       0|     1|  male|  54|    0|    0|       S|
|          8|       0|     3|  male|   2|    3|    1|       S|
|          9|       1|     3|female|  27|    0|    2|       S|
|         10|       1|     2|female|  14|    1|    0|       C|
|         11|       1|     3|female|   4|    1|    1|       S|
|         12|       1|     1|female|  58|    0|    0|       S|
|         13|       0|     3|  male|  20|    0|    0|  

In [10]:
df = df.withColumn('FamilySize', col('SibSp') + col('Parch') + 1)

In [11]:
df.show()

+-----------+--------+------+------+----+-----+-----+--------+----------+
|PassengerId|Survived|Pclass|   Sex| Age|SibSp|Parch|Embarked|FamilySize|
+-----------+--------+------+------+----+-----+-----+--------+----------+
|          1|       0|     3|  male|  22|    1|    0|       S|       2.0|
|          2|       1|     1|female|  38|    1|    0|       C|       2.0|
|          3|       1|     3|female|  26|    0|    0|       S|       1.0|
|          4|       1|     1|female|  35|    1|    0|       S|       2.0|
|          5|       0|     3|  male|  35|    0|    0|       S|       1.0|
|          6|       0|     3|  male|null|    0|    0|       Q|       1.0|
|          7|       0|     1|  male|  54|    0|    0|       S|       1.0|
|          8|       0|     3|  male|   2|    3|    1|       S|       5.0|
|          9|       1|     3|female|  27|    0|    2|       S|       3.0|
|         10|       1|     2|female|  14|    1|    0|       C|       2.0|
|         11|       1|     3|female|  

In [12]:
df.where(col('Age').isNull()).count()

177

In [13]:
df.select(avg(col('Age'))).collect()

[Row(avg(Age)=29.69911764705882)]

In [14]:
avg_age = df.select(avg(col('Age'))).collect()[0][0]
avg_age

29.69911764705882

In [15]:
ndf = df.fillna({'Age': avg_age})

In [16]:
ndf.show(5)

+-----------+--------+------+------+---+-----+-----+--------+----------+
|PassengerId|Survived|Pclass|   Sex|Age|SibSp|Parch|Embarked|FamilySize|
+-----------+--------+------+------+---+-----+-----+--------+----------+
|          1|       0|     3|  male| 22|    1|    0|       S|       2.0|
|          2|       1|     1|female| 38|    1|    0|       C|       2.0|
|          3|       1|     3|female| 26|    0|    0|       S|       1.0|
|          4|       1|     1|female| 35|    1|    0|       S|       2.0|
|          5|       0|     3|  male| 35|    0|    0|       S|       1.0|
+-----------+--------+------+------+---+-----+-----+--------+----------+
only showing top 5 rows



In [17]:
ndf[['Sex']].distinct().show()

+------+
|   Sex|
+------+
|female|
|  male|
+------+



In [18]:
ndf = ndf.withColumn('Male', col('Sex') == 'male')
ndf = ndf.withColumn('Female', col('Sex') == 'female')

In [19]:
ndf.show(5)

+-----------+--------+------+------+---+-----+-----+--------+----------+-----+------+
|PassengerId|Survived|Pclass|   Sex|Age|SibSp|Parch|Embarked|FamilySize| Male|Female|
+-----------+--------+------+------+---+-----+-----+--------+----------+-----+------+
|          1|       0|     3|  male| 22|    1|    0|       S|       2.0| true| false|
|          2|       1|     1|female| 38|    1|    0|       C|       2.0|false|  true|
|          3|       1|     3|female| 26|    0|    0|       S|       1.0|false|  true|
|          4|       1|     1|female| 35|    1|    0|       S|       2.0|false|  true|
|          5|       0|     3|  male| 35|    0|    0|       S|       1.0| true| false|
+-----------+--------+------+------+---+-----+-----+--------+----------+-----+------+
only showing top 5 rows



In [20]:
ndf.drop('Sex')

DataFrame[PassengerId: string, Survived: string, Pclass: string, Age: string, SibSp: string, Parch: string, Embarked: string, FamilySize: double, Male: boolean, Female: boolean]

In [21]:
ndf.show(5)

+-----------+--------+------+------+---+-----+-----+--------+----------+-----+------+
|PassengerId|Survived|Pclass|   Sex|Age|SibSp|Parch|Embarked|FamilySize| Male|Female|
+-----------+--------+------+------+---+-----+-----+--------+----------+-----+------+
|          1|       0|     3|  male| 22|    1|    0|       S|       2.0| true| false|
|          2|       1|     1|female| 38|    1|    0|       C|       2.0|false|  true|
|          3|       1|     3|female| 26|    0|    0|       S|       1.0|false|  true|
|          4|       1|     1|female| 35|    1|    0|       S|       2.0|false|  true|
|          5|       0|     3|  male| 35|    0|    0|       S|       1.0| true| false|
+-----------+--------+------+------+---+-----+-----+--------+----------+-----+------+
only showing top 5 rows



In [22]:
ndf.coalesce(1).write.option('header', 'true').csv('_export_data_2/clean_data')