In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,concat_ws

## *****PySpark DF Question*****
## 1) From the below file please display the data like below:
## FileName: usdata.csv
## first_name,last_name,
## company_name,address -->  co_address [ Data should look like: ('|')  xyz|mumbai ]
## city,county,state --> geo_loc ('~') [ Data should look like: Powai~Ind~MH ]
## zip,age,phone1,phone2,email,web
## Note: 
## Impose Schema using StructType & StructField.

In [2]:
spark = SparkSession.builder.appName('learning').master('local').getOrCreate()
sc = spark.sparkContext

In [5]:
usdata_df = spark.read.format('csv')\
                      .option('delimiteer',',')\
                      .option('header','True')\
                      .option('inferSchema','True')\
                      .load('file:///home/saif/LFS/datasets/usdata.csv')

usdata_df.printSchema()
#usdata_df.show(5)

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- phone1: string (nullable = true)
 |-- phone2: string (nullable = true)
 |-- email: string (nullable = true)
 |-- web: string (nullable = true)



In [4]:
from pyspark.sql.types import StructType, StringType, StructField,IntegerType

In [8]:
data_schema = StructType([
           StructField('first_name',StringType(),True),
           StructField('last_name',StringType(),True),
           StructField('company_name',StringType(),True),
           StructField('address',StringType(),True),
           StructField('city',StringType(),True),
           StructField('county',StringType(),True),
           StructField('state',StringType(),True),
           StructField('zip',IntegerType(),True),
           StructField('age',IntegerType(),True),
           StructField('phone1',StringType(),True),
           StructField('phone2',StringType(),True),
           StructField('email',StringType(),True),
           StructField('web',StringType(),True)
])


In [9]:
usdata_sc_df = spark.read.format('csv')\
                      .option('delimiteer',',')\
                      .option('header','True')\
                      .schema(data_schema)\
                      .load('file:///home/saif/LFS/datasets/usdata.csv')

usdata_sc_df.printSchema()
usdata_sc_df.show(5)

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- phone1: string (nullable = true)
 |-- phone2: string (nullable = true)
 |-- email: string (nullable = true)
 |-- web: string (nullable = true)

+----------+---------+--------------------+--------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+
|first_name|last_name|        company_name|             address|       city|    county|state|  zip|age|      phone1|      phone2|               email|                 web|
+----------+---------+--------------------+--------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+---

In [16]:
result_df = usdata_sc_df.select(
        col('first_name'),col("last_name"),
        concat_ws('|',col("company_name"),col("address")).alias('company_name,address'),
        concat_ws('~',col("city"),col("county"),col("state")).alias('city,county,state'),
        col('zip'),col('age'),col('phone1'),col('phone2'),col('email'),col('web')
)
result_df.show(5)
result_df.printSchema()

+----------+---------+--------------------+--------------------+-----+---+------------+------------+--------------------+--------------------+
|first_name|last_name|company_name,address|   city,county,state|  zip|age|      phone1|      phone2|               email|                 web|
+----------+---------+--------------------+--------------------+-----+---+------------+------------+--------------------+--------------------+
|     James|    durai|Benton, John B|66...|New Orleans~Orlea...|70116|  9|504-621-8927|504-845-1427|     jbutt@gmail.com|http://www.benton...|
| Josephine|  Darakjy|Chanay, Jeffrey A...|Brighton~Livingst...|48116|  8|810-292-9388|810-374-9840|josephine_darakjy...|http://www.chanay...|
|       Art|   Venere|Chemel, James L C...|Bridgeport~Glouce...| 8014|  7|856-636-8749|856-264-4130|      art@venere.org|http://www.chemel...|
|     Lenna| Paprocki|Feltz Printing Se...|Anchorage~Anchora...|99501| 10|907-385-4412|907-921-2010|lpaprocki@hotmail...|http://www.feltzp...|

In [18]:
result_schema = StructType([
    StructField('first_name',StringType(),True),
    StructField('last_name',StringType(),True),
    StructField('company_name,address',StringType(),True),
    StructField('city,county,state',StringType(),True),
    StructField('zip',IntegerType(),True),
    StructField('age',IntegerType(),True),
    StructField('phone1',StringType(),True),
    StructField('phone2',StringType(),True),
    StructField('email',StringType(),True),
    StructField('web',StringType(),True) 
])

In [19]:
newDF = spark.createDataFrame(result_df.rdd, schema=result_schema)

In [20]:
newDF.printSchema()
newDF.show(5)

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- company_name,address: string (nullable = true)
 |-- city,county,state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- phone1: string (nullable = true)
 |-- phone2: string (nullable = true)
 |-- email: string (nullable = true)
 |-- web: string (nullable = true)

+----------+---------+--------------------+--------------------+-----+---+------------+------------+--------------------+--------------------+
|first_name|last_name|company_name,address|   city,county,state|  zip|age|      phone1|      phone2|               email|                 web|
+----------+---------+--------------------+--------------------+-----+---+------------+------------+--------------------+--------------------+
|     James|    durai|Benton, John B|66...|New Orleans~Orlea...|70116|  9|504-621-8927|504-845-1427|     jbutt@gmail.com|http://www.benton...|
| Josephine|  Dara