In [1]:
import pyspark

# Dataframe

## Part 1
    - Create Session
    - Read from CSV
    - Show Schema from DF

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the SparkSession that later cells refer to as `spark`.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [27]:
# Load example.csv: fields are separated by ';' and the first line holds the column names.
df = (
    spark.read
    .option("delimiter", ";")
    .option("header", "true")
    .csv('example.csv')
)

In [12]:
# Print the DataFrame loaded from example.csv as a text table.
df.show()

+-----+----+
| Nama|Umur|
+-----+----+
|Hanif|  22|
|Ahmad|  18|
| Dudu|  23|
| Bubu|  23|
+-----+----+



In [13]:
# Check the Python type of the object returned by the CSV reader.
type(df)

pyspark.sql.dataframe.DataFrame

In [14]:
# head() with no argument returns the first row as a Row object.
df.head()

Row(Nama='Hanif', Umur='22')

In [15]:
# Print column names and types; this frame was read without inferSchema,
# so both columns come back as string.
df.printSchema()

root
 |-- Nama: string (nullable = true)
 |-- Umur: string (nullable = true)



## Part 2
    - Describe
    - Add and Drop Columns
    - Rename Column

In [23]:
# Load example1.csv and let Spark infer each column's type from the data.
df1 = (
    spark.read
    .option('delimiter', ';')
    .option('header', 'true')
    .csv('example1.csv', inferSchema=True)
)

In [24]:
# Display the rows loaded from example1.csv.
df1.show()

+-----+----+----------+
| Nama|Umur|Pengalaman|
+-----+----+----------+
|Hanif|  23|         2|
| Dika|  30|         8|
| Tino|  32|         5|
| Muiz|  28|         7|
+-----+----+----------+



In [25]:
# With inferSchema=True, Umur and Pengalaman are read as integer instead of string.
df1.printSchema()

root
 |-- Nama: string (nullable = true)
 |-- Umur: integer (nullable = true)
 |-- Pengalaman: integer (nullable = true)



In [29]:
# Re-read example1.csv, this time passing header and inferSchema as keyword
# arguments to csv() rather than as reader options.
df1 = (
    spark.read
    .option('delimiter', ';')
    .csv('example1.csv', header=True, inferSchema=True)
)

In [32]:
# Confirm the schema after re-reading the file with keyword arguments.
df1.printSchema()

root
 |-- Nama: string (nullable = true)
 |-- Umur: integer (nullable = true)
 |-- Pengalaman: integer (nullable = true)



In [33]:
# List the column names of df1.
df1.columns

['Nama', 'Umur', 'Pengalaman']

In [38]:
# Project only the Nama and Umur columns and display them.
df1.select('Nama','Umur').show()

+-----+----+
| Nama|Umur|
+-----+----+
|Hanif|  23|
| Dika|  30|
| Tino|  32|
| Muiz|  28|
+-----+----+



In [40]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df1.describe().show()

+-------+----+-----------------+------------------+
|summary|Nama|             Umur|        Pengalaman|
+-------+----+-----------------+------------------+
|  count|   4|                4|                 4|
|   mean|null|            28.25|               5.5|
| stddev|null|3.862210075418823|2.6457513110645907|
|    min|Dika|               23|                 2|
|    max|Tino|               32|                 8|
+-------+----+-----------------+------------------+



In [53]:
# Add a derived column holding Pengalaman + 2.
df1 = df1.withColumn(
    'Experience After 2 year',
    df1['Pengalaman'] + 2,
)
df1.show()

+-----+----+----------+-----------------------+
| Nama|Umur|Pengalaman|Experience After 2 year|
+-----+----+----------+-----------------------+
|Hanif|  23|         2|                      4|
| Dika|  30|         8|                     10|
| Tino|  32|         5|                      7|
| Muiz|  28|         7|                      9|
+-----+----+----------+-----------------------+



In [54]:
# Drop the 'Experience After 2 year' column; drop() returns a new DataFrame,
# so the result is assigned back to df1.
df1 = df1.drop('Experience After 2 year')
df1.show()

+-----+----+----------+
| Nama|Umur|Pengalaman|
+-----+----+----------+
|Hanif|  23|         2|
| Dika|  30|         8|
| Tino|  32|         5|
| Muiz|  28|         7|
+-----+----+----------+



In [56]:
# Rename the 'Nama' column to 'Nama Baru'; the result is assigned back to df1.
df1 = df1.withColumnRenamed('Nama','Nama Baru')
df1.show()

+---------+----+----------+
|Nama Baru|Umur|Pengalaman|
+---------+----+----------+
|    Hanif|  23|         2|
|     Dika|  30|         8|
|     Tino|  32|         5|
|     Muiz|  28|         7|
+---------+----+----------+



## Part 3

    - Drop Rows
    - Parameter to Drop
    - Handling Missing value

In [59]:
# Load test2.csv, which contains missing values.
# Note: this rebinds `df`, which previously held the example.csv frame.
df = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [61]:
# Drop every row that contains at least one null value ('any' is the default mode).
df.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [63]:
# Drop rows using a threshold: keep only rows that have at least 2 non-null values.
df.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
+---------+----+----------+------+



In [64]:
# Drop rows only when the Name column is null; nulls in other columns are kept.
df.na.drop(subset=['Name']).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
+---------+----+----------+------+



In [66]:
# Replace null values in the Name column with a placeholder string.
df.na.fill('John Doe', subset=['Name']).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
| John Doe|  34|        10| 38000|
| John Doe|  36|      null|  null|
+---------+----+----------+------+



In [71]:
# Median-impute the numeric columns, writing the results to new "<column>_imputed" columns.
from pyspark.ml.feature import Imputer

imputer = Imputer(
    strategy="median",
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=["{}_imputed".format(name) for name in ['age', 'Experience', 'Salary']],
)

In [72]:
# Fit the imputer on df, then transform df to append the *_imputed columns.
imputer.fit(df).transform(df).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|null|      null| 40000|         29|                 4|         40000|
|     null|  34|        10| 38000|         34|                10|         38000|
|     null|  36|      null|  null|         36|                 4|         20000|
+---------+----+----------+------+-----------+------------------+--------------+

## Part 4
    
    - Filter Operation
    - &|==
    - ~

In [81]:
# Names of people whose Salary is less than or equal to 20,000 (SQL-string condition).
df.filter('Salary<=20000').select('Name').show()

+-------+
|   Name|
+-------+
|  Sunny|
|   Paul|
| Harsha|
|Shubham|
+-------+



In [82]:
# Alternative: the same condition written as a Column expression; all columns are shown.
df.filter(df['Salary']<=20000).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [83]:
# People whose Salary is less than 20,000 and greater than 15,000.
# Each comparison is parenthesised before the two are combined with &.
df.filter((df['Salary']<20000)&(df['Salary']>15000)).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



## Part 5

    - Groupby and aggregate functions

In [85]:
# Load test3.csv (one row per person and department).
# Note: this rebinds `df`, which previously held the test2.csv frame.
df = spark.read.csv(
    'test3.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [86]:
# Group by Name and sum the salary column: total salary per person.
df.groupBy('Name').sum('Salary').show()

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [89]:
# Group by department and take the maximum salary within each one.
df.groupBy('Departments').max('Salary').show()

+------------+-----------+
| Departments|max(Salary)|
+------------+-----------+
|         IOT|      10000|
|    Big Data|       5000|
|Data Science|      20000|
+------------+-----------+



In [97]:
from pyspark.sql import functions as F

In [101]:
# Several aggregates per department in a single pass: total, mean and row count.
df.groupBy('Departments').agg(
    F.sum('Salary'),
    F.mean('Salary'),
    F.count('salary'),
).show()

+------------+-----------+-----------+-------------+
| Departments|sum(Salary)|avg(Salary)|count(salary)|
+------------+-----------+-----------+-------------+
|         IOT|      15000|     7500.0|            2|
|    Big Data|      15000|     3750.0|            4|
|Data Science|      43000|    10750.0|            4|
+------------+-----------+-----------+-------------+



## Part 6

    - MLlib

In [150]:
# NOTE(review): SparkSession is already imported and a session is already created near
# the top of this notebook, so getOrCreate() presumably returns that existing session
# and the 'Missing' app name may not take effect — confirm.
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Missing').getOrCreate()

In [151]:
import os
import sys

# Set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON to the interpreter running this kernel.
# NOTE(review): these variables are set after the session has already been created —
# confirm they still take effect, or move this cell above the first getOrCreate().
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# Enable the spark.sql.execution.arrow.pyspark.enabled option on the active session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [168]:
# Load test2.csv and keep only the rows with no null values as the training data.
training = (
    spark.read
    .csv('test2.csv', header=True, inferSchema=True)
    .na.drop()
)
training.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [169]:
# Inspect the schema of the training frame.
# `df` was rebound to the test3.csv frame in Part 5, so on a top-to-bottom run it no
# longer has the age/Experience/Salary columns used below; `training` is the frame
# this section actually works with.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [170]:
from pyspark.ml.feature import VectorAssembler

In [171]:
from pyspark.ml.feature import VectorAssembler

# Combine the age and Experience columns into one vector column for the regressor.
featureassembler = VectorAssembler(
    inputCols=["age", "Experience"],
    outputCol="Independent Features",
)

In [172]:
# Apply the assembler to the training frame; the result gains the feature-vector column.
output=featureassembler.transform(training)

In [173]:
# Confirm that the 'Independent Features' column was appended.
output.columns

['Name', 'age', 'Experience', 'Salary', 'Independent Features']

In [174]:
# Show the rows together with their assembled feature vectors.
output.show()

+---------+---+----------+------+--------------------+
|     Name|age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



In [177]:
# Keep only what the model needs: the feature vector and the Salary label.
final = output.select(
    'Independent Features',
    'Salary',
)
final.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [180]:
from pyspark.ml.regression import LinearRegression

# Fix the seed so the train/test split, and therefore every prediction and metric
# that follows, is the same on each Restart & Run All. Without a seed the split
# changes on every run.
train, test = final.randomSplit([0.75, 0.25], seed=42)

# Linear regression of Salary on the assembled feature vector.
regressor = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
# fit() returns the fitted model; the name is rebound because the following cell
# calls regressor.evaluate().
regressor = regressor.fit(train)

In [183]:
# Evaluate the fitted model on the held-out split and show the per-row predictions.
pred = regressor.evaluate(test)
pred.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [21.0,1.0]| 15000|17429.234338747105|
|         [31.0,10.0]| 30000| 27547.56380510443|
+--------------------+------+------------------+



In [184]:
# Mean absolute error and mean squared error on the held-out split.
pred.meanAbsoluteError,pred.meanSquaredError

(2440.8352668213374, 5957811.381290973)