In [4]:
# Record the interpreter version this notebook was run with.
!python3 --version

Python 3.8.8


In [5]:
!pip install koalas

Collecting koalas
  Downloading koalas-1.6.0-py3-none-any.whl (668 kB)
[K     |████████████████████████████████| 668 kB 9.8 MB/s eta 0:00:01
Collecting pandas<1.2.0,>=0.23.2
  Downloading pandas-1.1.5-cp38-cp38-manylinux1_x86_64.whl (9.3 MB)
[K     |████████████████████████████████| 9.3 MB 8.9 MB/s eta 0:00:01
Collecting matplotlib<3.3.0,>=3.0.0
  Downloading matplotlib-3.2.2-cp38-cp38-manylinux1_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 29.4 MB/s eta 0:00:01
Installing collected packages: pandas, matplotlib, koalas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.2.2
    Uninstalling pandas-1.2.2:
      Successfully uninstalled pandas-1.2.2
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.3.4
    Uninstalling matplotlib-3.3.4:
      Successfully uninstalled matplotlib-3.3.4
Successfully installed koalas-1.6.0 matplotlib-3.2.2 pandas-1.1.5


In [6]:
# Record the Spark / Scala / JVM versions available on this machine.
!spark-submit --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.0.2
      /_/
                        
Using Scala version 2.12.10, OpenJDK 64-Bit Server VM, 11.0.10
Branch HEAD
Compiled by user centos on 2021-02-16T06:09:22Z
Revision 648457905c4ea7d00e3d88048c63f360045f0714
Url https://gitbox.apache.org/repos/asf/spark.git
Type --help for more information.


In [8]:
import pandas as pd
import numpy as np
import databricks.koalas as ks
from pyspark.sql import SparkSession



In [9]:
# A Koalas Series built from a plain Python list; the missing value (np.nan)
# is kept and shows up as NaN in the displayed result.
s = ks.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [11]:
# Build a small Koalas DataFrame from per-column lists, labelled with an
# explicit integer index (10, 20, ..., 60) instead of the default one.
kdf = ks.DataFrame(
    dict(
        a=[1, 2, 3, 4, 5, 6],
        b=[100, 200, 300, 400, 500, 600],
        c=["one", "two", "three", "four", "five", "six"],
    ),
    index=list(range(10, 70, 10)),
)

kdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


In [13]:
# Six consecutive calendar days starting 2013-01-01; used as the row index of
# the pandas DataFrame built next.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# pandas DataFrame of standard-normal demo values: one row per entry in
# `dates`, columns A-D. The generator is seeded so the values (and every
# result derived from them further down) are the same on each
# Restart & Run All.
pdf = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
pdf

Unnamed: 0,A,B,C,D
2013-01-01,-1.104309,-1.711289,0.598256,-0.279287
2013-01-02,0.048017,1.321582,-1.33509,1.167127
2013-01-03,1.428965,0.072645,0.481861,0.165028
2013-01-04,1.637699,0.417062,1.247827,0.777462
2013-01-05,1.159207,1.405034,-0.8484,-0.280479
2013-01-06,-0.639167,-0.138744,-0.406295,0.154749


In [15]:
# Convert the pandas DataFrame into a Koalas DataFrame (this rebinds `kdf`,
# replacing the frame created earlier) and confirm the resulting type.
kdf = ks.from_pandas(pdf)
type(kdf)

databricks.koalas.frame.DataFrame

In [16]:
# Display the converted frame; the date index and values match `pdf`.
kdf

Unnamed: 0,A,B,C,D
2013-01-01,-1.104309,-1.711289,0.598256,-0.279287
2013-01-02,0.048017,1.321582,-1.33509,1.167127
2013-01-03,1.428965,0.072645,0.481861,0.165028
2013-01-04,1.637699,0.417062,1.247827,0.777462
2013-01-05,1.159207,1.405034,-0.8484,-0.280479
2013-01-06,-0.639167,-0.138744,-0.406295,0.154749


In [17]:
# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [18]:
# Build a Spark DataFrame from the pandas one and print it.
# Note: only columns A-D appear in the printed table; the date index of
# `pdf` is not carried over.
sdf = spark.createDataFrame(pdf)
sdf.show()

+-------------------+--------------------+-------------------+--------------------+
|                  A|                   B|                  C|                   D|
+-------------------+--------------------+-------------------+--------------------+
| -1.104309494723149| -1.7112888403044677| 0.5982560229201509|-0.27928743928904676|
|0.04801742864860694|  1.3215823948138687|-1.3350898460507208|  1.1671268687059597|
| 1.4289647857795278| 0.07264480719508562|0.48186082847274103| 0.16502790399438244|
| 1.6376985642547388|  0.4170619631452462|  1.247827040119578|   0.777462020329855|
| 1.1592068639778323|  1.4050344697039234|-0.8483998292707408|-0.28047868190814956|
|-0.6391668202524056|-0.13874359633192337|-0.4062947970903579|  0.1547488968239605|
+-------------------+--------------------+-------------------+--------------------+



In [19]:
# Convert the Spark DataFrame to Koalas (rebinds `kdf` again). Since `sdf`
# has no index column, the result gets an integer index 0..5, as the output
# shows.
kdf = sdf.to_koalas()
kdf

Unnamed: 0,A,B,C,D
0,-1.104309,-1.711289,0.598256,-0.279287
1,0.048017,1.321582,-1.33509,1.167127
2,1.428965,0.072645,0.481861,0.165028
3,1.637699,0.417062,1.247827,0.777462
4,1.159207,1.405034,-0.8484,-0.280479
5,-0.639167,-0.138744,-0.406295,0.154749


In [20]:
# Per-column data types of the Koalas frame.
kdf.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [21]:
# Peek at the top of the frame (the first five rows are shown).
kdf.head()

Unnamed: 0,A,B,C,D
0,-1.104309,-1.711289,0.598256,-0.279287
1,0.048017,1.321582,-1.33509,1.167127
2,1.428965,0.072645,0.481861,0.165028
3,1.637699,0.417062,1.247827,0.777462
4,1.159207,1.405034,-0.8484,-0.280479


In [22]:
# Row labels of the frame.
kdf.index

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [23]:
# Column labels of the frame.
kdf.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [24]:
# Summary statistics per column: count, mean, std, min, quartiles, max.
kdf.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.421735,0.227715,-0.04364,0.2841
std,1.151602,1.143377,0.980109,0.581413
min,-1.104309,-1.711289,-1.33509,-0.280479
25%,-0.639167,-0.138744,-0.8484,-0.279287
50%,0.048017,0.072645,-0.406295,0.154749
75%,1.428965,1.321582,0.598256,0.777462
max,1.637699,1.405034,1.247827,1.167127


In [25]:
# Transpose: columns A-D become rows and the row labels become columns.
kdf.T

Unnamed: 0,0,1,2,3,4,5
A,-1.104309,0.048017,1.428965,1.637699,1.159207,-0.639167
B,-1.711289,1.321582,0.072645,0.417062,1.405034,-0.138744
C,0.598256,-1.33509,0.481861,1.247827,-0.8484,-0.406295
D,-0.279287,1.167127,0.165028,0.777462,-0.280479,0.154749


In [26]:
# Rows ordered by column B, smallest value first; returns a new frame and
# leaves `kdf` itself unchanged.
kdf.sort_values(by='B')

Unnamed: 0,A,B,C,D
0,-1.104309,-1.711289,0.598256,-0.279287
5,-0.639167,-0.138744,-0.406295,0.154749
2,1.428965,0.072645,0.481861,0.165028
3,1.637699,0.417062,1.247827,0.777462
1,0.048017,1.321582,-1.33509,1.167127
4,1.159207,1.405034,-0.8484,-0.280479


In [27]:
# Number of partitions backing the Spark DataFrame's underlying RDD.
sdf.rdd.getNumPartitions()

8

In [29]:
# NOTE(review): left disabled -- presumably because a Koalas DataFrame does
# not expose `.rdd` directly (confirm). Going through the underlying Spark
# DataFrame should work instead:
#   kdf.to_spark().rdd.getNumPartitions()
#kdf.rdd.getNumPartitions()