In [1]:
# Uncomment to install the pinned PySpark version into the kernel's environment:
# %pip install pyspark==2.4.4

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the SparkSession for this notebook under the app name 'PySpark'.
spark = (
    SparkSession.builder
    .appName('PySpark')
    .getOrCreate()
)

In [4]:
# Bare expression: renders the session object as the cell output.
spark

In [5]:
# For a non-local Spark, point the builder at the cluster's master URL instead:
# spark = SparkSession.builder.master('spark://207.234.43.54:7077').appName('PySpark').getOrCreate()

In [6]:
# SparkContext behind the session - the entry point for the RDD calls used below.
sc = spark.sparkContext

In [7]:
# Bare expression: renders the SparkContext as the cell output.
sc

In [8]:
# Local sample input: the integers 0 through 999.
data = range(1000)

In [9]:
# Turn the local range into an RDD via the SparkContext.
rdd = sc.parallelize(data)

In [10]:
# Inspect the Python class of the object returned by parallelize.
type(rdd)

pyspark.rdd.PipelinedRDD

In [11]:
list = rdd.collect()

In [12]:
type(list)

list

In [13]:
# Transformations only: nothing is computed yet, so the cell output is an RDD
# object rather than numbers.
rdd.filter(lambda x: x ** 2 > 1000).map(lambda x: x + 100)

PythonRDD[2] at RDD at PythonRDD.scala:53

In [14]:
# Keep values whose square exceeds 1000, shift each by 100, then add them up.
# Ending the chain with sum() is what makes the pipeline produce a number.
(
    rdd
    .filter(lambda n: n ** 2 > 1000)
    .map(lambda n: n + 100)
    .sum()
)

595804

In [15]:
# union keeps duplicates: the RDD unioned with itself holds twice as many elements.
rdd.union(rdd).count()

2000

In [16]:
# Write the RDD out as a directory of text part-files.
# The path uses a forward slash: in a normal string literal '\d' is an invalid
# escape sequence, and a backslash is a path separator only on Windows.
rdd.saveAsTextFile('_export_data/data')

In [17]:
# coalesce(1) merges the data into a single partition first, so the output
# directory contains one part-file.
# The path uses a forward slash: in a normal string literal '\d' is an invalid
# escape sequence, and a backslash is a path separator only on Windows.
rdd.coalesce(1).saveAsTextFile('_export_data/data1')

In [18]:
# Default number of partitions used when none is requested
# (8 on the machine that produced this output).
sc.defaultParallelism

8

### RDD to DataFrame

In [19]:
# Sample records as (id, name) tuples; 'Alex' appears twice on purpose for the
# group-by example further down.
persons = [
    (1, 'Alex'),
    (2, 'Anna'),
    (3, 'Denis'),
    (4, 'Alex'),
]

In [20]:
# Rebinds `rdd`: from here on it holds the (id, name) tuples, not the 0..999 range.
rdd = sc.parallelize(persons)

In [21]:
# Only column names are supplied here; the column types are not specified
# (df.schema shows what they ended up as).
df = rdd.toDF(['id', 'name'])

In [22]:
# Confirm that toDF produced a Spark SQL DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [23]:
# Print the rows as a text table.
df.show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



In [24]:
# The rows of a DataFrame are still reachable as an RDD through .rdd.
type(df.rdd)

pyspark.rdd.RDD

In [25]:
# Full schema: field names, types and nullability.
df.schema

StructType(List(StructField(id,LongType,true),StructField(name,StringType,true)))

In [26]:
# Column names only.
df.columns

['id', 'name']

In [27]:
# (column name, type name) pairs.
df.dtypes

[('id', 'bigint'), ('name', 'string')]

In [28]:
# Summary statistics for every column; for the string column, mean and stddev
# come back as null.
df.describe().show()

+-------+------------------+-----+
|summary|                id| name|
+-------+------------------+-----+
|  count|                 4|    4|
|   mean|               2.5| null|
| stddev|1.2909944487358056| null|
|    min|                 1| Alex|
|    max|                 4|Denis|
+-------+------------------+-----+



In [29]:
# Restrict the summary to the listed columns.
df.describe(['id']).show()

+-------+------------------+
|summary|                id|
+-------+------------------+
|  count|                 4|
|   mean|               2.5|
| stddev|1.2909944487358056|
|    min|                 1|
|    max|                 4|
+-------+------------------+



In [30]:
# Project the frame down to the name column and print it.
df.select('name').show()

+-----+
| name|
+-----+
| Alex|
| Anna|
|Denis|
| Alex|
+-----+



In [31]:
# Number of rows per distinct name.
df.groupBy('name').count().show()

+-----+-----+
| name|count|
+-----+-----+
| Alex|    2|
| Anna|    1|
|Denis|    1|
+-----+-----+



### Добавление колонки

In [32]:
from pyspark.sql.functions import split, col

In [33]:
# Add name_alias = first letter of name.
# substr is 1-based (start=1, length=1). A direct substring avoids regex-splitting
# the whole name on an empty pattern just to read element 0.
ndf = df.withColumn('name_alias', col('name').substr(1, 1))

In [34]:
# Show the frame together with the new name_alias column.
ndf.show()

+---+-----+----------+
| id| name|name_alias|
+---+-----+----------+
|  1| Alex|         A|
|  2| Anna|         A|
|  3|Denis|         D|
|  4| Alex|         A|
+---+-----+----------+



In [35]:
# drop returns a new DataFrame; the result is only shown here, not assigned,
# so ndf itself is left untouched.
ndf.drop('name_alias').show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



In [36]:
# ndf still has name_alias: the result of the earlier drop was never assigned back.
ndf.show()

+---+-----+----------+
| id| name|name_alias|
+---+-----+----------+
|  1| Alex|         A|
|  2| Anna|         A|
|  3|Denis|         D|
|  4| Alex|         A|
+---+-----+----------+



Т.е. DataFrame иммутабелен: `drop` вернул новый DataFrame, а `ndf` остался прежним — чтобы сохранить результат операции, его нужно присвоить переменной.

In [37]:
# Rebinds ndf to a frame derived from df (so without name_alias) plus age = id * 10.
ndf = df.withColumn('age', col('id')*10)

In [38]:
# Show the frame with the derived age column.
ndf.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1| Alex| 10|
|  2| Anna| 20|
|  3|Denis| 30|
|  4| Alex| 40|
+---+-----+---+



In [39]:
# Correlation between the two numeric columns; 1.0 here because age is id * 10.
ndf.corr('age', 'id')

1.0

In [40]:
# Bring the (age, id) projection back to the driver as a list of Row objects.
ndf.select('age', 'id').collect()

[Row(age=10, id=1), Row(age=20, id=2), Row(age=30, id=3), Row(age=40, id=4)]

In [41]:
# Read the id field of the first row by attribute access on the Row.
# first() fetches just that one row instead of collecting the whole frame to the
# driver and indexing into the resulting list.
ndf.select('age', 'id').first().id

1

### Сохранение и загрузка

In [42]:
# Write df as CSV; coalesce(1) merges the data into one partition so a single
# part-file is produced.
# Forward slash: '\p' is an invalid escape sequence in a normal string literal and
# a backslash is a path separator only on Windows.
# mode='overwrite' lets the cell be re-run; the default mode raises when the
# target directory already exists.
df.coalesce(1).write.csv('_export_data/persons_csv', mode='overwrite')

In [43]:
# Write df as JSON; coalesce(1) merges the data into one partition so a single
# part-file is produced.
# Forward slash: '\p' is an invalid escape sequence in a normal string literal and
# a backslash is a path separator only on Windows.
# mode='overwrite' lets the cell be re-run; the default mode raises when the
# target directory already exists.
df.coalesce(1).write.json('_export_data/persons_json', mode='overwrite')

In [44]:
df.coalesce(1).write.parquet('_export_data\persons_parquet')

In [45]:
spark.read.parquet('_export_data\persons_parquet').show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



### Spark SQL

In [46]:
# The four-row persons frame, shown again ahead of the SQL queries.
df.show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+



In [47]:
# Expose df to SQL queries under the view name 'people'.
df.createOrReplaceTempView('people')

In [48]:
# Run a SQL query against the 'people' view and print the result table.
spark.sql('select name from people').show()

+-----+
| name|
+-----+
| Alex|
| Anna|
|Denis|
| Alex|
+-----+



In [49]:
# Same query with a WHERE filter on id.
spark.sql('select name from people where id <=2').show()

+----+
|name|
+----+
|Alex|
|Anna|
+----+



### PySpark & Pandas

In [50]:
# Convert the Spark DataFrame into a pandas DataFrame.
pdf = df.toPandas()

In [51]:
# Confirm the result is a pandas object.
type(pdf)

pandas.core.frame.DataFrame

In [52]:
# Bare expression: renders the pandas frame as the cell output.
pdf

Unnamed: 0,id,name
0,1,Alex
1,2,Anna
2,3,Denis
3,4,Alex


In [53]:
# And back again: build a Spark DataFrame from the pandas one.
df_from_pandas = spark.createDataFrame(pdf)

In [54]:
# Confirm the round trip ended in a Spark SQL DataFrame.
type(df_from_pandas)

pyspark.sql.dataframe.DataFrame

In [55]:
# Print the round-tripped frame; it shows the same four rows as df.
df_from_pandas.show()

+---+-----+
| id| name|
+---+-----+
|  1| Alex|
|  2| Anna|
|  3|Denis|
|  4| Alex|
+---+-----+

