In [1]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Basics").getOrCreate()

23/02/10 21:23:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [37]:
df = spark.read.csv("data/nces330_20.csv")

In [38]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [39]:
df.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5']

In [40]:
df.show()

+----+-------+-------------------+------+------------+-----+
| _c0|    _c1|                _c2|   _c3|         _c4|  _c5|
+----+-------+-------------------+------+------------+-----+
|Year|  State|               Type|Length|     Expense|Value|
|2013|Alabama|            Private|4-year|Fees/Tuition|13983|
|2013|Alabama|            Private|4-year|  Room/Board| 8503|
|2013|Alabama|    Public In-State|2-year|Fees/Tuition| 4048|
|2013|Alabama|    Public In-State|4-year|Fees/Tuition| 8073|
|2013|Alabama|    Public In-State|4-year|  Room/Board| 8473|
|2013|Alabama|Public Out-of-State|2-year|Fees/Tuition| 7736|
|2013|Alabama|Public Out-of-State|4-year|Fees/Tuition|20380|
|2013|Alabama|Public Out-of-State|4-year|  Room/Board| 8473|
|2013| Alaska|            Private|4-year|Fees/Tuition|21496|
|2013| Alaska|            Private|4-year|  Room/Board| 8923|
|2013| Alaska|    Public In-State|2-year|Fees/Tuition| 3972|
|2013| Alaska|    Public In-State|4-year|Fees/Tuition| 6317|
|2013| Alaska|    Public

In [41]:
df.describe()

DataFrame[summary: string, _c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string]

In [42]:
from pyspark.sql.types import (StructField, StringType, IntegerType, StructType, FloatType, LongType)

In [43]:
data_schema = [StructField('Year', IntegerType(), True), StructField('State', StringType(), True), 
               StructField('Type', StringType(), True), StructField('Lenght', StringType(), True),
               StructField('Expense', StringType(), True), StructField('Value', IntegerType(), True)
              ]

In [44]:
final_struct = StructType(fields=data_schema)

In [45]:
df = spark.read.csv("data/nces330_20.csv", schema=final_struct)
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Lenght: string (nullable = true)
 |-- Expense: string (nullable = true)
 |-- Value: integer (nullable = true)



In [46]:
type(df["Year"])

pyspark.sql.column.Column

In [47]:
type(df.select("Year"))


pyspark.sql.dataframe.DataFrame

In [48]:
print(df.head(2)[0])

Row(Year=None, State='State', Type='Type', Lenght='Length', Expense='Expense', Value=None)


In [49]:
df.select(['Year', 'Value'])

DataFrame[Year: int, Value: int]

In [50]:
df.withColumn('Price UAH', df['Value']*40.2).show()

+----+-------+-------------------+------+------------+-----+------------------+
|Year|  State|               Type|Lenght|     Expense|Value|         Price UAH|
+----+-------+-------------------+------+------------+-----+------------------+
|null|  State|               Type|Length|     Expense| null|              null|
|2013|Alabama|            Private|4-year|Fees/Tuition|13983| 562116.6000000001|
|2013|Alabama|            Private|4-year|  Room/Board| 8503|341820.60000000003|
|2013|Alabama|    Public In-State|2-year|Fees/Tuition| 4048|          162729.6|
|2013|Alabama|    Public In-State|4-year|Fees/Tuition| 8073|324534.60000000003|
|2013|Alabama|    Public In-State|4-year|  Room/Board| 8473|340614.60000000003|
|2013|Alabama|Public Out-of-State|2-year|Fees/Tuition| 7736|          310987.2|
|2013|Alabama|Public Out-of-State|4-year|Fees/Tuition|20380|          819276.0|
|2013|Alabama|Public Out-of-State|4-year|  Room/Board| 8473|340614.60000000003|
|2013| Alaska|            Private|4-year

In [54]:
# register DF as a temporary SQL VIEW
df.createOrReplaceTempView("uni")
results = spark.sql("select * from uni")
results.show()

+----+-------+-------------------+------+------------+-----+
|Year|  State|               Type|Lenght|     Expense|Value|
+----+-------+-------------------+------+------------+-----+
|null|  State|               Type|Length|     Expense| null|
|2013|Alabama|            Private|4-year|Fees/Tuition|13983|
|2013|Alabama|            Private|4-year|  Room/Board| 8503|
|2013|Alabama|    Public In-State|2-year|Fees/Tuition| 4048|
|2013|Alabama|    Public In-State|4-year|Fees/Tuition| 8073|
|2013|Alabama|    Public In-State|4-year|  Room/Board| 8473|
|2013|Alabama|Public Out-of-State|2-year|Fees/Tuition| 7736|
|2013|Alabama|Public Out-of-State|4-year|Fees/Tuition|20380|
|2013|Alabama|Public Out-of-State|4-year|  Room/Board| 8473|
|2013| Alaska|            Private|4-year|Fees/Tuition|21496|
|2013| Alaska|            Private|4-year|  Room/Board| 8923|
|2013| Alaska|    Public In-State|2-year|Fees/Tuition| 3972|
|2013| Alaska|    Public In-State|4-year|Fees/Tuition| 6317|
|2013| Alaska|    Public

In [55]:
results = spark.sql("select * from uni where Value>20000 and State='Alaska'")
results.show()


+----+------+-------------------+------+------------+-----+
|Year| State|               Type|Lenght|     Expense|Value|
+----+------+-------------------+------+------------+-----+
|2013|Alaska|            Private|4-year|Fees/Tuition|21496|
|2014|Alaska|            Private|4-year|Fees/Tuition|20943|
|2016|Alaska|Public Out-of-State|4-year|Fees/Tuition|20463|
|2017|Alaska|Public Out-of-State|4-year|Fees/Tuition|21431|
|2018|Alaska|Public Out-of-State|4-year|Fees/Tuition|21284|
|2019|Alaska|Public Out-of-State|4-year|Fees/Tuition|24454|
|2020|Alaska|Public Out-of-State|4-year|Fees/Tuition|26767|
|2021|Alaska|Public Out-of-State|4-year|Fees/Tuition|25535|
+----+------+-------------------+------+------------+-----+



In [64]:
df.filter("Value > 20000").select(['Year', 'State', 'Value']).show()

+----+--------------------+-----+
|Year|               State|Value|
+----+--------------------+-----+
|2013|             Alabama|20380|
|2013|              Alaska|21496|
|2013|             Arizona|21201|
|2013|          California|28345|
|2013|          California|30765|
|2013|            Colorado|25470|
|2013|         Connecticut|35336|
|2013|         Connecticut|26688|
|2013|            Delaware|26228|
|2013|District of Columbia|35524|
|2013|             Florida|20155|
|2013|             Georgia|22456|
|2013|             Georgia|22393|
|2013|              Hawaii|23614|
|2013|            Illinois|26299|
|2013|            Illinois|26873|
|2013|             Indiana|26794|
|2013|             Indiana|26538|
|2013|                Iowa|23019|
|2013|              Kansas|20852|
+----+--------------------+-----+
only showing top 20 rows



In [69]:
df.filter((df["Value"] > 20000) & ~(df["State"] == "Alaska")).select(['Year', 'State', 'Value']).show()

+----+--------------------+-----+
|Year|               State|Value|
+----+--------------------+-----+
|2013|             Alabama|20380|
|2013|             Arizona|21201|
|2013|          California|28345|
|2013|          California|30765|
|2013|            Colorado|25470|
|2013|         Connecticut|35336|
|2013|         Connecticut|26688|
|2013|            Delaware|26228|
|2013|District of Columbia|35524|
|2013|             Florida|20155|
|2013|             Georgia|22456|
|2013|             Georgia|22393|
|2013|              Hawaii|23614|
|2013|            Illinois|26299|
|2013|            Illinois|26873|
|2013|             Indiana|26794|
|2013|             Indiana|26538|
|2013|                Iowa|23019|
|2013|              Kansas|20852|
|2013|            Kentucky|20639|
+----+--------------------+-----+
only showing top 20 rows



In [71]:
df.groupBy("State")

<pyspark.sql.group.GroupedData at 0x1090e7810>

In [73]:
df.groupBy("State").mean("Value").show()

+--------------------+------------------+
|               State|        avg(Value)|
+--------------------+------------------+
|                Utah| 8918.549295774648|
|              Hawaii| 12524.30985915493|
|           Minnesota|12642.295774647888|
|                Ohio|13620.802816901409|
|            Arkansas| 10233.81690140845|
|              Oregon| 15533.43661971831|
|               Texas| 12596.38028169014|
|        North Dakota|  9227.74647887324|
|        Pennsylvania|16368.957746478873|
|         Connecticut|17435.211267605635|
|            Nebraska| 10701.81690140845|
|             Vermont| 18596.49295774648|
|              Nevada|           12710.6|
|               State|              null|
|          Washington|14730.028169014084|
|            Illinois|15167.633802816901|
|            Oklahoma|11215.225352112677|
|District of Columbia|           18880.2|
|            Delaware|13834.622950819672|
|              Alaska|11629.174603174602|
+--------------------+------------

In [80]:
df.agg({"Value": "max"}).show()

+----------+
|max(Value)|
+----------+
|     49152|
+----------+



In [104]:
from pyspark.sql.functions import countDistinct, avg, stddev, mean
df.select(countDistinct("State").alias('State Name')).show()

+----------+
|State Name|
+----------+
|        52|
+----------+



In [87]:
df.na.drop(thresh=10)
df.na.drop(how="any") # default: any missing values are dropped 
df.na.drop(subset=['State']) # concider any of this subset to be missing and drop it. 
df.na.fill("FILL VALUE")

DataFrame[Year: int, State: string, Type: string, Lenght: string, Expense: string, Value: int]

In [102]:
m_value = df.select(avg(df["Value"])).collect()
m_value

[Row(avg(Value)=13027.72012401353)]

In [105]:
m_value = df.select(mean("Value")).collect()
m_value

[Row(avg(Value)=13027.72012401353)]

In [108]:
data_schema = [StructField('Year', IntegerType(), True), StructField('State', StringType(), True), 
               StructField('Type', StringType(), True), StructField('Lenght', StringType(), True),
               StructField('Expense', StringType(), True), StructField('Value', IntegerType(), True)
              ]
time_df = spark.read.csv("data/gemini_BTCUSD_2020_1min.csv")
time_df.printSchema()


root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

