In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session object that the later cells reach through `spark`.
spark = (
    SparkSession.builder
    .appName('Inferred and explict schemas')
    .getOrCreate()
)

In [3]:
from pyspark.sql import Row

In [4]:
# Reach the SparkContext through the session created above: no visible cell
# defines a bare `sc`, so `sc.textFile` only works on kernels that pre-inject
# one and raises NameError otherwise.
# NOTE(review): the path is absolute and machine-specific — adjust it before
# running this notebook elsewhere.
lines = spark.sparkContext.textFile('/Users/nli/dev/spark/datasets/students.txt')

In [5]:
# Display the raw text lines: one comma-separated student record per line.
lines.collect()

['Emily, 44, 55, 78',
 'Andy, 47, 34, 89',
 'Rick, 55, 78, 55',
 'Aaron, 66, 34, 98']

In [6]:
# Split every record on ',' only; the numeric fields therefore keep the
# leading blank that followed each comma in the file.
parts = lines.map(lambda line: line.split(','))
parts.collect()

[['Emily', ' 44', ' 55', ' 78'],
 ['Andy', ' 47', ' 34', ' 89'],
 ['Rick', ' 55', ' 78', ' 55'],
 ['Aaron', ' 66', ' 34', ' 98']]

In [9]:
# Convert each split record into a Row with a string name and integer scores.
# int() accepts the leading blank left over from splitting on ',' alone.
students = parts.map(
    lambda rec: Row(
        name=rec[0],
        math=int(rec[1]),
        english=int(rec[2]),
        science=int(rec[3])
    )
)

In [10]:
# Display the Row objects to check that the scores were parsed as integers.
students.collect()

[Row(english=55, math=44, name='Emily', science=78),
 Row(english=34, math=47, name='Andy', science=89),
 Row(english=78, math=55, name='Rick', science=55),
 Row(english=34, math=66, name='Aaron', science=98)]

In [11]:
# No schema argument is passed, so the column names and types are inferred
# from the Row objects themselves.
schemaStudents = spark.createDataFrame(students)

In [12]:
# Register the inferred-schema DataFrame under the name 'students' so that it
# can be queried with spark.sql.
schemaStudents.createOrReplaceTempView('students')

In [13]:
# Inspect the inferred column names; in the recorded output they appear in
# alphabetical order rather than in the keyword order used to build each Row.
schemaStudents.columns

['english', 'math', 'name', 'science']

In [12]:
# Inspect the inferred types; the recorded output shows the integer scores as
# LongType and the name as StringType.
schemaStudents.schema

StructType(List(StructField(english,LongType,true),StructField(math,LongType,true),StructField(name,StringType,true),StructField(science,LongType,true)))

In [13]:
# Query the 'students' temp view through SQL and print the rows as a table.
spark.sql('SELECT * FROM students').show()

+-------+----+-----+-------+
|english|math| name|science|
+-------+----+-----+-------+
|     55|  44|Emily|     78|
|     34|  47| Andy|     89|
|     78|  55| Rick|     55|
|     34|  66|Aaron|     98|
+-------+----+-----+-------+



In [14]:
# Re-display the split (still string-typed) records that the explicit-schema
# section starts from.
parts.collect()

[['Emily', ' 44', ' 55', ' 78'],
 ['Andy', ' 47', ' 34', ' 89'],
 ['Rick', ' 55', ' 78', ' 55'],
 ['Aaron', ' 66', ' 34', ' 98']]

In [16]:
# Typed records for the explicit-schema DataFrame: name stays a string and the
# three scores become Python ints.
parts_typed = parts.map(
    lambda cols: Row(
        name=cols[0],
        math=int(cols[1]),
        english=int(cols[2]),
        science=int(cols[3])
    )
)

In [17]:
# Human-readable sketch of the intended columns and their types.
# NOTE(review): no cell visible here reads this variable — the actual schema
# is built by hand from StructField objects.
schemeString = 'name(String) math(int) english(int) science(int)'

In [22]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# One nullable StructField per column, in the order the explicit schema
# should expose them.
fields = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('name', StringType()),
        ('math', IntegerType()),
        ('english', IntegerType()),
        ('science', IntegerType()),
    )
]

In [23]:
# Wrap the field list into the StructType that describes the explicit schema.
schema = StructType(fields)

In [24]:
# NOTE: `schemaStudents` is still the DataFrame created with an inferred
# schema; the explicit `schema` has not been applied to it.
schemaStudents.columns

['english', 'math', 'name', 'science']

In [28]:
# Still the inferred schema (LongType scores in the recorded output), not the
# IntegerType fields declared in `schema`.
schemaStudents.schema

StructType(List(StructField(english,LongType,true),StructField(math,LongType,true),StructField(name,StringType,true),StructField(science,LongType,true)))

In [29]:
schemaStudents.createOrReplaceTempView('students_explicit')

In [30]:
spark.sql('SELECT * FROM students_explicit')

DataFrame[english: bigint, math: bigint, name: string, science: bigint]

In [31]:
spark.sql('SELECT * FROM students_explicit').show()

+-------+----+-----+-------+
|english|math| name|science|
+-------+----+-----+-------+
|     55|  44|Emily|     78|
|     34|  47| Andy|     89|
|     78|  55| Rick|     55|
|     34|  66|Aaron|     98|
+-------+----+-----+-------+

