In [1]:
import sys, os
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, Catalog
from pyspark.sql import DataFrame, DataFrameStatFunctions, DataFrameNaFunctions
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import Row
from subprocess import check_output

In [2]:
SPARK_DRIVER_HOST = check_output(["hostname", "-i"]).decode(encoding="utf-8").strip()
spark_conf = SparkConf()
spark_conf.setAll([
    ('spark.master', 'spark://spark:7077'),
    ('spark.app.name', 'myApp'),
    ('spark.submit.deployMode', 'client'),
    ('spark.ui.showConsoleProgress', 'true'),
    ('spark.eventLog.enabled', 'false'),
    ('spark.logConf', 'false'),
    ('spark.driver.bindAddress', '0.0.0.0'),
    ('spark.driver.host', SPARK_DRIVER_HOST),
    ('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.2.0,com.amazonaws:aws-java-sdk-bundle:1.11.704,org.apache.spark:spark-hadoop-cloud_2.12:3.3.0'),
    ("spark.hadoop.fs.s3a.endpoint", 'http://minio:9000'),
    ('spark.hadoop.fs.s3a.access.key', 'minio-root-user'),
    ('spark.hadoop.fs.s3a.secret.key', 'minio-root-password'),
    ('spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled', True),
    ("spark.hadoop.fs.s3a.fast.upload", True),
    ("spark.hadoop.fs.s3a.path.style.access", True),
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
])
 
spark_sess          = SparkSession.builder.config(conf=spark_conf).getOrCreate()
spark_ctxt          = spark_sess.sparkContext
spark_reader        = spark_sess.read
spark_streamReader  = spark_sess.readStream
spark_ctxt.setLogLevel("WARN")

citiesDF = spark_sess.read.option("header",True).csv('s3a://cities/cities.csv')

citiesDF.show(truncate=False)

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
org.apache.spark#spark-hadoop-cloud_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-82b967da-0825-4a92-8edf-ce3ffd701f1c;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.704 in central
	found org.apache.spark#spark-hadoop-cloud_2.12;3.3.0 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bund

22/10/03 20:49:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/03 20:49:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

+-----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+
|LatD | "LatM"| "LatS"| "NS"| "LonD"| "LonM"| "LonS"| "EW"| "City"           | "State"|
+-----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+
|   41|    5  |   59  | "N" |     80|   39  |    0  | "W" | "Youngstown"     | OH     |
|   42|   52  |   48  | "N" |     97|   23  |   23  | "W" | "Yankton"        | SD     |
|   46|   35  |   59  | "N" |    120|   30  |   36  | "W" | "Yakima"         | WA     |
|   42|   16  |   12  | "N" |     71|   48  |    0  | "W" | "Worcester"      | MA     |
|   43|   37  |   48  | "N" |     89|   46  |   11  | "W" | "Wisconsin Dells"| WI     |
|   36|    5  |   59  | "N" |     80|   15  |    0  | "W" | "Winston-Salem"  | NC     |
|   49|   52  |   48  | "N" |     97|    9  |    0  | "W" | "Winnipeg"       | MB     |
|   39|   11  |   23  | "N" |     78|    9  |   36  | "W" | "Winchester"     | VA     |
|   34|   14  |   24  | "N" |   

In [3]:
citiesDF.columns

['LatD',
 ' "LatM"',
 ' "LatS"',
 ' "NS"',
 ' "LonD"',
 ' "LonM"',
 ' "LonS"',
 ' "EW"',
 ' "City"',
 ' "State"']

In [4]:
cleanCitiesDF = citiesDF \
   .withColumnRenamed(' "LatM"', 'LatM') \
   .withColumnRenamed(' "LonM"', 'LonM') \
   .select('LatM', 'LonM')
cleanCitiesDF

DataFrame[LatM: string, LonM: string]

In [8]:
cleanCitiesDF.show()

+-----+-----+
| LatM| LonM|
+-----+-----+
|    5|   39|
|   52|   23|
|   35|   30|
|   16|   48|
|   37|   46|
|    5|   15|
|   52|    9|
|   11|    9|
|   14|   55|
|   45|   33|
|    9|   37|
|   15|    0|
|   40|   16|
|   54|   29|
|   41|   20|
|    4|   43|
|   43|    3|
|   25|   19|
|   25|   23|
|   13|   20|
+-----+-----+
only showing top 20 rows



In [9]:
cleanCitiesDF.write.format("csv").option("header",  True).save("s3a://cities/cleanCities4.csv")

AnalysisException: path s3a://cities/cleanCities4.csv already exists.

In [10]:
citiesDFRecovered = spark_sess.read.option("header",True).csv('s3a://cities/cleanCities4.csv')
citiesDFRecovered.show()

+----+----+
|LatM|LonM|
+----+----+
|   5|  39|
|  52|  23|
|  35|  30|
|  16|  48|
|  37|  46|
|   5|  15|
|  52|   9|
|  11|   9|
|  14|  55|
|  45|  33|
|   9|  37|
|  15|   0|
|  40|  16|
|  54|  29|
|  41|  20|
|   4|  43|
|  43|   3|
|  25|  19|
|  25|  23|
|  13|  20|
+----+----+
only showing top 20 rows



In [11]:
Team = Row("name", "city", "stadium")
afcNorth = [Team("Bengals", "Cincinnati", "Paul Brown Stadium"),
            Team("Steelers", "Pittsburgh", "Heinz Field"),
            Team("Browns", "Cleveland", "FirstEnergy Field"),
            Team("Ravens", "Baltimore", "M&T Bank Stadium")]
afcNorthDataFrame = spark_sess.createDataFrame(afcNorth)
afcNorthDataFrame.show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------+----------+------------------+
|    name|      city|           stadium|
+--------+----------+------------------+
| Bengals|Cincinnati|Paul Brown Stadium|
|Steelers|Pittsburgh|       Heinz Field|
|  Browns| Cleveland| FirstEnergy Field|
|  Ravens| Baltimore|  M&T Bank Stadium|
+--------+----------+------------------+



                                                                                

In [13]:
    from pyspark.sql import Row
    from pyspark import SparkContext as sc
    li=[1,2,3,4]
    rdd1 = sc.parallelize(li)
    row_rdd = rdd1.map(lambda x: Row(x))
    df=sqlContext.createDataFrame(row_rdd,['numbers']).show()

TypeError: parallelize() missing 1 required positional argument: 'c'

In [15]:
rdd = spark_ctxt.parallelize([(1, "one"),(2,"two")])
dffFromParalleize=rdd.toDF()
dffFromParalleize.show()

+---+---+
| _1| _2|
+---+---+
|  1|one|
|  2|two|
+---+---+



In [None]:
from pyspark.sql.types import StructType, StructField. StringType, IntergerType

schema = StructType([ \
         StructField

])

In [17]:
Song = row("name", "artist", "well_known_lyrics")
songs = [Song("Mary Had a Little Lamb","Mary Tyler","Mary had a little lamb Little lamb, little lamb Mary had a little lamb It's fleece was white as snow"), 
         Song("Somebody's Watching Me","Rockwell","I always feel like somebody's watching me And I have no privacy (oh, oh)"),
         Song("Stayin Alive", "Bee Gees", "Well, you can tell by the way I use my walk I'm a woman's man, no time to talk")]



NameError: name 'row' is not defined

In [None]:
songsDF.withColumn('stadiumWordCount', F.size(F.split(F.col('stadium'), ' '))).show()