In [3]:
import findspark
findspark.init("C:\spark")

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

spark = SparkSession.builder. \
master("local[4]"). \
appName("Df-string"). \
config("spark.driver.memory","2g"). \
config("spark.executor.memory","4g"). \
getOrCreate()

sc = spark.sparkContext

In [4]:
simple_df =spark.read \
.option("sep",",") \
.option("header" , "True") \
.option("inferSchema" , "True") \
.csv("Data\\simple_dirty.csv")

In [5]:
from pyspark.sql.functions import *

# 1.Concat

In [10]:
df_concat = simple_df \
.withColumn("meslekvesehir", concat(col("meslek"), lit("-"),col("sehir")))

df_concat.show(n=10 , truncate=False)

+------+--------+---+--------+-----------+-----------+-----------+----------------------+----------------------+
|sirano|isim    |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk              |meslekvesehir         |
+------+--------+---+--------+-----------+-----------+-----------+----------------------+----------------------+
|1     |Cemal   |35 |E       |Isci       |Ankara     |3500.0     |araba                 |Isci-Ankara           |
|2     |ceyda   |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev              |Memur-Kayseri         |
|3     |Timur   |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık       |Müzüsyen-Istanbul     |
|4     |Burcu   |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba                 |Pazarlamacı-    Ankara|
|5     |Yasemin |23 |K       |Pazarlamaci|Bursa      |4800.0     |araba                 |Pazarlamaci-Bursa     |
|6     | Ali    |33 |E       |Memur      |Ankara     |4250.0     |ev                    |Memur-A

# 2.NumberFormat

In [16]:
df_number_format = simple_df \
.withColumn("aylik_gelir_format",format_number(col("aylik_gelir"), 3))

df_number_format.show(3)

+------+------+---+--------+--------+-----------+-----------+---------------+------------------+
|sirano|  isim|yas|cinsiyet|  meslek|      sehir|aylik_gelir|       mal_mulk|aylik_gelir_format|
+------+------+---+--------+--------+-----------+-----------+---------------+------------------+
|     1| Cemal| 35|       E|    Isci|     Ankara|     3500.0|          araba|         3,500.000|
|     2|ceyda | 42|       K|   Memur|    Kayseri|     4200.0|       araba|ev|         4,200.000|
|     3| Timur| 30|    null|Müzüsyen|Istanbul   |     9000.0|araba|ev|yazlık|         9,000.000|
+------+------+---+--------+--------+-----------+-----------+---------------+------------------+
only showing top 3 rows



# 3.lower,initcap,length

In [18]:
df_lower = simple_df \
.withColumn("meslek_lower" , lower(col("meslek"))) \
.withColumn("isim_initcap" ,initcap(col("isim"))) \
.withColumn("sehir_length" ,length(col("sehir")))

df_lower.show(n=5,truncate = False)

+------+-------+---+--------+-----------+-----------+-----------+---------------+------------+------------+------------+
|sirano|isim   |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk       |meslek_lower|isim_initcap|sehir_length|
+------+-------+---+--------+-----------+-----------+-----------+---------------+------------+------------+------------+
|1     |Cemal  |35 |E       |Isci       |Ankara     |3500.0     |araba          |isci        |Cemal       |6           |
|2     |ceyda  |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev       |memur       |Ceyda       |7           |
|3     |Timur  |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık|müzüsyen    |Timur       |11          |
|4     |Burcu  |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba          |pazarlamacı |Burcu       |10          |
|5     |Yasemin|23 |K       |Pazarlamaci|Bursa      |4800.0     |araba          |pazarlamaci |Yasemin     |5           |
+------+-------+---+--------+---

# 4.Trim

In [20]:
df_trim = simple_df \
.withColumn("sehir_trim" , trim(col("sehir")))

df_trim.show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+----------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|sehir_trim|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+----------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|    Ankara|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|   Kayseri|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|  Istanbul|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|    Ankara|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|     Bursa|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|    Ankara|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|  Istanbul|


# 5.Replace , split

In [24]:
df_replace = simple_df \
.withColumn("sehir_ist" , regexp_replace(col("sehir") , "Ist" , "IST")) \
.withColumn("mal_mulk_split" , split(col("mal_mulk") , "\\|")) \
.withColumn("mal_mulk_ilk_eleman_al" , col("mal_mulk_split")[1])

df_replace.show(n=4 , truncate = False)

+------+------+---+--------+-----------+-----------+-----------+---------------+-----------+-------------------+----------------------+
|sirano|isim  |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk       |sehir_ist  |mal_mulk_split     |mal_mulk_ilk_eleman_al|
+------+------+---+--------+-----------+-----------+-----------+---------------+-----------+-------------------+----------------------+
|1     |Cemal |35 |E       |Isci       |Ankara     |3500.0     |araba          |Ankara     |[araba]            |null                  |
|2     |ceyda |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev       |Kayseri    |[araba, ev]        |ev                    |
|3     |Timur |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık|ISTanbul   |[araba, ev, yazlık]|ev                    |
|4     |Burcu |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba          |    Ankara |[araba]            |null                  |
+------+------+---+--------+-----------+--------