In [2]:
import findspark
# Point findspark at the local Spark installation.
# Raw string: "\s" is not a valid escape sequence, so a plain literal triggers
# an invalid-escape warning on modern Python; the path value itself is unchanged.
findspark.init(r"C:\spark")

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Build (or reuse) the session for this notebook: master "local[4]",
# application name "Df-clean", and explicit driver/executor memory settings.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Df-clean")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [3]:
# Load the raw semicolon-delimited CSV, using the first row as the header
# and letting Spark infer the column types.
df = (
    spark.read
    .option("sep", ";")
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("Data\\OnlineRetail.csv")
)

In [4]:
# Preview the first three rows of the raw data.
df.show(n=3)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 3 rows



In [5]:
# Inspect the inferred column types. UnitPrice comes out as string rather than
# a numeric type, apparently because its values use a decimal comma (e.g. 2,55).
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [6]:
from pyspark.sql.types import StringType,StringType,StructType,StructField,IntegerType,FloatType

In [7]:
# Explicit schema for the retail CSV: eight columns, all nullable.
manual_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("InvoiceNo", StringType()),
        ("StockCode", StringType()),
        ("Description", StringType()),
        ("Quantity", IntegerType()),
        ("InvoiceDate", StringType()),
        ("UnitPrice", FloatType()),
        ("CustomerID", IntegerType()),
        ("Country", StringType()),
    )
])

In [8]:
# Re-read the same raw file, this time applying manual_schema instead of
# inferring types.
# NOTE: the preview that follows comes back entirely null, apparently because
# the comma-decimal UnitPrice values (e.g. 2,55) do not parse as FloatType.
df2 = (
    spark.read
    .option("sep", ";")
    .option("header", "True")
    .schema(manual_schema)
    .csv("Data\\OnlineRetail.csv")
)

In [9]:
# Preview the first five rows read with the explicit schema.
df2.show(n=5)

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|     null|     null|       null|    null|       null|     null|      null|   null|
|     null|     null|       null|    null|       null|     null|      null|   null|
|     null|     null|       null|    null|       null|     null|      null|   null|
|     null|     null|       null|    null|       null|     null|      null|   null|
|     null|     null|       null|    null|       null|     null|      null|   null|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
only showing top 5 rows



In [10]:
## The decimal separators are the problem: replace the commas with dots using a regex, write the data back out, and then read it in again.

In [11]:
from pyspark.sql.functions import *

# Re-read the raw CSV with inferred types, then replace the decimal comma in
# UnitPrice with a dot.
df = (
    spark.read
    .option("sep", ";")
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("Data\\OnlineRetail.csv")
    .withColumn("UnitPrice", regexp_replace(col("UnitPrice"), ",", "."))
)



In [12]:
# Preview the first three rows after the comma-to-dot replacement.
df.show(n=3)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2.75|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 3 rows



In [13]:
# Write the corrected data back out as semicolon-delimited CSV with a header
# row, overwriting any previous output at this path. coalesce(1) reduces the
# DataFrame to a single partition before writing.
(
    df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("sep", ";")
    .option("header", "True")
    .csv("Data\\OnlineRetail_with_Nokta")
)

In [15]:
# Read the rewritten CSV output back in, applying manual_schema.
df2 = (
    spark.read
    .option("sep", ";")
    .option("header", "True")
    .schema(manual_schema)
    .csv("Data\\OnlineRetail_with_Nokta")
)

In [17]:
# Confirm the explicit schema was applied: manual_schema declares UnitPrice as
# FloatType, so it should now be reported as float.
df2.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [18]:
## And that's all there is to it.