### Import Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import *
import pyspark.sql.functions as fun
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType
import findspark
findspark.init()

In [2]:
import os
os.environ['SPARK_HOME'] = "C:/spark-3.5.0-bin-hadoop3/spark-3.5.0-bin-hadoop3"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

Create Spark Session

In [3]:
#spark driver: by default gives 1 partion 
#.config("spark.driver.host","localhost")
spark = SparkSession.builder.appName("firstApp").getOrCreate()

In [4]:
spark

In [11]:
data = spark.read.csv("linkdin_Job_data.csv", header='True')

In [12]:
data.show(1) #shows first 20 rows

+----------+--------------------+-------------------+----------+------------+---------+--------------------+--------------------+-----------------+--------------+-----------------+-------------+-------------------+------------------+--------------------+-------+
|    job_ID|                 job|           location|company_id|company_name|work_type|    full_time_remote|        no_of_employ|no_of_application|posted_day_ago|           alumni|Hiring_person| linkedin_followers|hiring_person_link|         job_details|Column1|
+----------+--------------------+-------------------+----------+------------+---------+--------------------+--------------------+-----------------+--------------+-----------------+-------------+-------------------+------------------+--------------------+-------+
|3471657636|Data Analyst, Tri...|Delhi, Delhi, India|      NULL|   Crossover|   Remote|Full-time · Assoc...|1,001-5,000 emplo...|              200|       8 hours|12 company alumni|         NULL|5,395,547 followe

In [13]:
#numbers of rows
print("dataSet contains:",data.count(),"rows") 

dataSet contains: 7927 rows


In [13]:
data.printSchema() #meta data of dataframe

root
 |-- job_ID: string (nullable = true)
 |-- job: string (nullable = true)
 |-- location: string (nullable = true)
 |-- company_id: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- full_time_remote: string (nullable = true)
 |-- no_of_employ: string (nullable = true)
 |-- no_of_application: string (nullable = true)
 |-- posted_day_ago: string (nullable = true)
 |-- alumni: string (nullable = true)
 |-- Hiring_person: string (nullable = true)
 |-- linkedin_followers: string (nullable = true)
 |-- hiring_person_link: string (nullable = true)
 |-- job_details: string (nullable = true)
 |-- Column1: string (nullable = true)



### Cleaning Data

In [14]:
#drop: column1,job_details
data = data.drop("Column1", "company_id", "posted_day_ago")

In [15]:
data.show(1)

+----------+--------------------+-------------------+------------+---------+--------------------+--------------------+-----------------+-----------------+-------------+-------------------+------------------+--------------------+
|    job_ID|                 job|           location|company_name|work_type|    full_time_remote|        no_of_employ|no_of_application|           alumni|Hiring_person| linkedin_followers|hiring_person_link|         job_details|
+----------+--------------------+-------------------+------------+---------+--------------------+--------------------+-----------------+-----------------+-------------+-------------------+------------------+--------------------+
|3471657636|Data Analyst, Tri...|Delhi, Delhi, India|   Crossover|   Remote|Full-time · Assoc...|1,001-5,000 emplo...|              200|12 company alumni|         NULL|5,395,547 followers|              NULL|About the job Cro...|
+----------+--------------------+-------------------+------------+---------+--------

In [15]:
data.printSchema()

root
 |-- job_ID: string (nullable = true)
 |-- job: string (nullable = true)
 |-- location: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- full_time_remote: string (nullable = true)
 |-- no_of_employ: string (nullable = true)
 |-- no_of_application: string (nullable = true)
 |-- alumni: string (nullable = true)
 |-- Hiring_person: string (nullable = true)
 |-- linkedin_followers: string (nullable = true)
 |-- hiring_person_link: string (nullable = true)
 |-- job_details: string (nullable = true)



Convert some columns into numeric

In [16]:
data = data.withColumn("alumni",regexp_extract("alumni",r'(\d+)',1).cast(IntegerType()))\
.withColumn("linkedin_followers", regexp_replace("linkedin_followers",r' followers', ''))\
.withColumn("no_of_application",col("no_of_application").cast('int'))

In [17]:
data.printSchema()

root
 |-- job_ID: string (nullable = true)
 |-- job: string (nullable = true)
 |-- location: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- full_time_remote: string (nullable = true)
 |-- no_of_employ: string (nullable = true)
 |-- no_of_application: integer (nullable = true)
 |-- alumni: integer (nullable = true)
 |-- Hiring_person: string (nullable = true)
 |-- linkedin_followers: string (nullable = true)
 |-- hiring_person_link: string (nullable = true)
 |-- job_details: string (nullable = true)



In [19]:
data.show()

+----------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-----------------+------+--------------------+------------------+--------------------+--------------------+
|    job_ID|                 job|            location|        company_name|work_type|    full_time_remote|        no_of_employ|no_of_application|alumni|       Hiring_person|linkedin_followers|  hiring_person_link|         job_details|
+----------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-----------------+------+--------------------+------------------+--------------------+--------------------+
|3471657636|Data Analyst, Tri...| Delhi, Delhi, India|           Crossover|   Remote|Full-time · Assoc...|1,001-5,000 emplo...|              200|    12|                NULL|         5,395,547|                NULL|About the job Cro...|
|3471669068|Data Analyst, Tri...|New Delhi, Delhi,...|      

Handling Missing Values

In [18]:
#Missing values
data = data.na.drop()
data.show(735)

+----------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+--------------------+
|    job_ID|                 job|            location|        company_name|work_type|    full_time_remote|        no_of_employ|no_of_application|alumni|       Hiring_person|  linkedin_followers|  hiring_person_link|         job_details|
+----------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+--------------------+
|3472808738|   Shopify Developer|        Delhi, India| Digital Impressions|  On-site|           Full-time|    51-200 employees|                2|     1|     Rashmi Aggarwal|               1,045|https://www.linke...|About the job The...|
|3467390929|       Data Engineer|Gurugram, Haryana..

In [19]:
data.count()

735

In [36]:
data.printSchema()

root
 |-- job_ID: string (nullable = true)
 |-- job: string (nullable = true)
 |-- location: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- full_time_remote: string (nullable = true)
 |-- no_of_employ: string (nullable = true)
 |-- no_of_application: integer (nullable = true)
 |-- alumni: integer (nullable = true)
 |-- Hiring_person: string (nullable = true)
 |-- linkedin_followers: string (nullable = true)
 |-- hiring_person_link: string (nullable = true)
 |-- job_details: string (nullable = true)



Description of statistics

In [20]:
# Show summary statistics for numeric columns
summary_stats = data.describe(["no_of_employ", "no_of_application"])
summary_stats.show()

+-------+--------------------+-----------------+
|summary|        no_of_employ|no_of_application|
+-------+--------------------+-----------------+
|  count|                 735|              735|
|   mean|                NULL|66.06666666666666|
| stddev|                NULL| 69.0618896050635|
|    min|1,001-5,000 emplo...|                1|
|    max|51-200 employees ...|              200|
+-------+--------------------+-----------------+



### Create RDD

In [25]:
sc = SparkContext.getOrCreate()
#data_rdd = sc.parallelize(data.collect()) #dataset is distributed into 4 partitions(clusters)
df_rdd = data.rdd

In [26]:
partitions = df_rdd.getNumPartitions()
print("initial partition count:",partitions,"partitions")


initial partition count: 5 partitions


In [442]:
#rows_tuples = [tuple(row.asDict().values()) for row in drdd]
#parall = sc.parallelize(rows_tuples)
#print(parall.collect())

### RDD Operation: Transformation

In [27]:
rs = df_rdd.map(lambda x: (x.job_ID,x.job)).collect()
rs

[('3472808738', 'Shopify Developer'),
 ('3467390929', 'Data Engineer'),
 ('3470730035', 'Data Engineer (Python)'),
 ('3467389203', 'Quickbase Developer'),
 ('3467360998', 'Developer'),
 ('3472813954', 'STIBO Lead developers (Functional)'),
 ('3474326981', 'Salesforce Developer Aura LWC'),
 ('3472810165', 'Salesforce Developer'),
 ('3472865558', 'GCP Data Engineer'),
 ('3470050334', 'AWS Data Engineer'),
 ('3474370726', 'Salesforce Developer'),
 ('3474270673', 'Salesforce Developer'),
 ('3467381520', 'AWS Data Engineer'),
 ('3467323896', 'PL/SQL Developer'),
 ('3467394021', 'Tibco BW'),
 ('3472591892', 'Thunderhead Developer'),
 ('3463535172', 'Sr Business Data Analyst (PowerBi)- Remote Work'),
 ('3467802155', 'Informatica Developer'),
 ('3471883106', 'HCL Hiring || ASP.Net MVC || Chennai,Bangalore,Noida'),
 ('3467350708', 'Data Engineering'),
 ('3474327672', 'Sr. Salesforce Developer'),
 ('3467381114', 'Snowflake Developer'),
 ('3467358339', 'Python Developer_AWS Lambda'),
 ('347433407

In [36]:
rdd2 = df_rdd.filter( lambda x: x['no_of_application'] > 10 ).count()
rdd2

545

### RDD Operation: Action

In [38]:
#Get first element
first_element = df_rdd.first()
print(first_element)

Row(job_ID='3472808738', job='Shopify Developer', location='Delhi, India', company_name='Digital Impressions', work_type='On-site', full_time_remote='Full-time', no_of_employ='51-200 employees', no_of_application=2, alumni=1, Hiring_person='Rashmi Aggarwal', linkedin_followers='1,045', hiring_person_link='https://www.linkedin.com/in/aggarwal-rashmi', job_details='About the job The ideal candidate will be responsible for conceptualizing and executing clear, quality code to develop the best software. You will test your code, identify errors, and iterate to ensure quality code. You will also support our customers and partners by troubleshooting any of their software issues. Roles and Responsibilities-Competent in Shopify app development and Shopify apps architecture.-Proficient knowledge of Shopify liquid code and its advanced concepts.-Candidate should have the expertise to design, develop, test, and deploy solutions based on the industry’s best practices as per organizational requiremen

In [40]:
#Reduce

In [41]:
#Retrieve n element using take()
take_data = df_rdd.take(5)
take_data

[Row(job_ID='3472808738', job='Shopify Developer', location='Delhi, India', company_name='Digital Impressions', work_type='On-site', full_time_remote='Full-time', no_of_employ='51-200 employees', no_of_application=2, alumni=1, Hiring_person='Rashmi Aggarwal', linkedin_followers='1,045', hiring_person_link='https://www.linkedin.com/in/aggarwal-rashmi', job_details='About the job The ideal candidate will be responsible for conceptualizing and executing clear, quality code to develop the best software. You will test your code, identify errors, and iterate to ensure quality code. You will also support our customers and partners by troubleshooting any of their software issues. Roles and Responsibilities-Competent in Shopify app development and Shopify apps architecture.-Proficient knowledge of Shopify liquid code and its advanced concepts.-Candidate should have the expertise to design, develop, test, and deploy solutions based on the industry’s best practices as per organizational requireme

In [42]:
#Count: return total number of RDD
count_rdd = df_rdd.count()
print("Total number of rdd elements: ",count_rdd)

Total number of rdd elements:  735


In [None]:
#sum of employes using sum()

In [21]:
#foreach

### SQL Operations

In [38]:
output = data.select(data.job, data.job_ID, data.work_type)\
    .where(data.work_type =='Remote').withColumn('iserting', fun.current_timestamp())\
        .orderBy(data.job_ID).cache()
#cache(): will store output dataframe in memory for faster operations
#withColumn() to create a temporary column

In [42]:
#Create temporary view
#output.createOrReplaceTempView("jobs_table")

Find Remote Jobs

In [46]:
spark.sql("select * from jobs_table where work_type ='Remote'").show()

+--------------------+----------+---------+--------------------+
|                 job|    job_ID|work_type|            iserting|
+--------------------+----------+---------+--------------------+
|Sales Development...|1607578529|   Remote|2024-02-09 20:33:...|
|Product Support A...|3105928271|   Remote|2024-02-09 20:33:...|
|Oracle Cloud Time...|3109700642|   Remote|2024-02-09 20:33:...|
|Senior Consultant...|3184377229|   Remote|2024-02-09 20:33:...|
|Senior Consultant...|3184377229|   Remote|2024-02-09 20:33:...|
|Consultant - Busi...|3290718339|   Remote|2024-02-09 20:33:...|
|Lead Data Enginee...|3344074918|   Remote|2024-02-09 20:33:...|
|  Database Developer|3355178624|   Remote|2024-02-09 20:33:...|
|Associate Princip...|3358839184|   Remote|2024-02-09 20:33:...|
|Salesforce Developer|3359595476|   Remote|2024-02-09 20:33:...|
|Senior Product Ma...|3369823728|   Remote|2024-02-09 20:33:...|
|Manager, Data Sci...|3384984493|   Remote|2024-02-09 20:33:...|
|Manager, Data Sci...|338

### Save RDDs outputs to text file and read from it

### 