In [13]:
from pyspark.sql import SparkSession

In [14]:
# Start a Spark session named 'DataFrame', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [15]:
# Bare expression: the notebook displays the SparkSession as this cell's output.
spark

In [16]:
import pandas as pd

# Small sample table, written row by row: (Name, Age, Experience).
df = pd.DataFrame(
    [
        ('Adam', 31, 5),
        ('Bosch', 30, 4),
        ('Caramen', 29, 3),
    ],
    columns=['Name', 'Age', 'Experience'],
)

In [17]:
# Persist the pandas frame as test.csv; index=False keeps the row index out of the file.
df.to_csv('test.csv', index=False)

# basics

In [18]:
# pandas analogue: pd.read_csv('test.csv')
# Treat the first line as column names and let Spark infer each column's type.
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('test.csv')
)
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|   Adam| 31|         5|
|  Bosch| 30|         4|
|Caramen| 29|         3|
+-------+---+----------+



In [19]:
# Rough Spark counterpart of pandas' df.info(): prints each column's name, type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [20]:
# Show the class of the pandas object, for contrast with the Spark DataFrame.
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [21]:
# The Spark object is a different class (pyspark.sql.dataframe.DataFrame) despite the shared "DataFrame" name.
print(type(df_pyspark))

<class 'pyspark.sql.dataframe.DataFrame'>


A DataFrame is a data structure that organizes data into a two-dimensional table of rows and columns.


In [22]:
# Same attribute name as pandas' df.columns; here it evaluates to a plain list of column-name strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [23]:
# First two rows, returned as a list of Row objects rather than printed as a table.
df_pyspark.head(2)

[Row(Name='Adam', Age=31, Experience=5),
 Row(Name='Bosch', Age=30, Experience=4)]

In [24]:
# Rough analogue of pandas' df['Name']: select() keeps only the named column and show() prints the result.
df_pyspark.select('Name').show()

+-------+
|   Name|
+-------+
|   Adam|
|  Bosch|
|Caramen|
+-------+



In [25]:
# Passing a list of names to select() keeps several columns at once.
df_pyspark.select(['Name', 'Age']).show()

+-------+---+
|   Name|Age|
+-------+---+
|   Adam| 31|
|  Bosch| 30|
|Caramen| 29|
+-------+---+



In [26]:
# Same attribute name as pandas' df.dtypes; here it evaluates to a list of (column name, type string) tuples.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [27]:
# pandas summary statistics, for comparison with Spark's describe(); the output covers Age and Experience only.
df.describe()

Unnamed: 0,Age,Experience
count,3.0,3.0
mean,30.0,4.0
std,1.0,1.0
min,29.0,3.0
25%,29.5,3.5
50%,30.0,4.0
75%,30.5,4.5
max,31.0,5.0


In [28]:
# Spark's describe() produces summary rows (count, mean, stddev, min, max) and show() prints them.
# The string column Name is included, with NULL for mean and stddev.
df_pyspark.describe().show()

+-------+-------+----+----------+
|summary|   Name| Age|Experience|
+-------+-------+----+----------+
|  count|      3|   3|         3|
|   mean|   NULL|30.0|       4.0|
| stddev|   NULL| 1.0|       1.0|
|    min|   Adam|  29|         3|
|    max|Caramen|  31|         5|
+-------+-------+----+----------+



In [30]:
# Derive a new column holding every Experience value plus 2, then rebind
# df_pyspark to the widened frame.
df_pyspark = df_pyspark.withColumn(
    'Exp after 2 years',
    df_pyspark.Experience + 2,
)
df_pyspark.show()

+-------+---+----------+-----------------+
|   Name|Age|Experience|Exp after 2 years|
+-------+---+----------+-----------------+
|   Adam| 31|         5|                7|
|  Bosch| 30|         4|                6|
|Caramen| 29|         3|                5|
+-------+---+----------+-----------------+



In [32]:
# Remove the 'Exp after 2 years' column; the result of drop() is reassigned to df_pyspark.
df_pyspark = df_pyspark.drop('Exp after 2 years')
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|   Adam| 31|         5|
|  Bosch| 30|         4|
|Caramen| 29|         3|
+-------+---+----------+



In [33]:
# Display the frame with 'Name' renamed to 'New Name'.
# The result is not assigned, so df_pyspark itself is not rebound by this cell.
df_pyspark.withColumnRenamed('Name', 'New Name').show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|    Adam| 31|         5|
|   Bosch| 30|         4|
| Caramen| 29|         3|
+--------+---+----------+



# handling missing values