In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [3]:
#To make pyspark importable as a regular library
findspark.init()

In [4]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, when, avg, round, rank
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, StringType

In [5]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer

In [7]:
import pyspark.sql.functions as f

In [8]:
%run /notebook/dataproduct/assesment_nyc_job_posting/utils/spark_session.ipynb

Spark Session Imported


In [9]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

def get_spark_conf():
    # Configure Spark settings
    spark_conf = SparkConf()
    spark_conf.set("spark.executor.instances", "4") # 4 instance per node

    # Set the number of executor cores
    spark_conf.set("spark.executor.cores", "1")  # Use 1 cores per executor

    # Set the executor memory
    spark_conf.set("spark.executor.memory", "1g")  # Use 1GB memory per executor

    # Set the driver memory
    spark_conf.set("spark.driver.memory", "2g")    # Use 2GB memory for the driver
    
    return spark_conf

In [10]:
#setting spark conf before creating spark session
spark_conf = get_spark_conf()

# Create a SparkSession with the configured settings
spark = SparkSession.builder.config(conf=spark_conf).appName("MySparkApp").getOrCreate()

# Listing all the spark conf
spark.sparkContext.getConf().getAll()

# setting spark conf for analysis
spark.conf.set('spark.sql.repl.eagerEval.enabled',True)

#reading dataset
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True, escape='"')

# reducing the shuffle partition to 4 
# reason 1 data size is very less
# reason 2 to use all the availble cores
spark.conf.set('spark.sql.shuffle.partitions',4)

In [11]:
df.select('Preferred Skills').distinct().limit(10).show(truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# remove punctuation

In [12]:
#remove punctuation
df = df.select('Preferred Skills', f.coalesce(f.regexp_replace("Preferred Skills", "[^\w\s]+", ""),f.lit('')).alias('preferred_skill_non_punch'))


In [13]:
df

Preferred Skills,preferred_skill_non_punch
â€¢	Excellent int...,Excellent interp...
,
1. A High School...,1 A High School ...
1. A High School...,1 A High School ...
,
,
Strong analytical...,Strong analytical...
In order to apply...,In order to apply...
1. Five years of...,1 Five years of ...
Experience with L...,Experience with L...


# Tokenization

In [14]:

tokenizer = Tokenizer(inputCol='preferred_skill_non_punch', outputCol='preferred_skill_splitted')

In [15]:
tk= tokenizer.transform(df)

In [16]:
df = tk.select('preferred_skill_non_punch','preferred_skill_splitted')

# Remove stop words

In [17]:
remover = StopWordsRemover(inputCol='preferred_skill_splitted',outputCol='preferred_skill_removed')

In [18]:
remover_df = remover.transform(df)

In [19]:
remover_df

preferred_skill_non_punch,preferred_skill_splitted,preferred_skill_removed
Excellent interp...,"[, excellent, int...","[, excellent, int..."
,[],[]
1 A High School ...,"[1, , a, high, sc...","[1, , high, schoo..."
1 A High School ...,"[1, , a, high, sc...","[1, , high, schoo..."
,[],[]
,[],[]
Strong analytical...,"[strong, analytic...","[strong, analytic..."
In order to apply...,"[in, order, to, a...","[order, apply, po..."
1 Five years of ...,"[1, , five, years...","[1, , five, years..."
Experience with L...,"[experience, with...","[experience, law,..."


# lemmatization

In [22]:
# Initialize NLTK lemmatizer
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
lemmatizer = WordNetLemmatizer()

In [26]:
# Define a UDF for lemmatization
def lemmatize_text(tokenize_words):
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenize_words]
    return " ".join(lemmatized_words)

In [29]:
remover_df.toPandas()['preferred_skill_removed'].apply(lambda x: lemmatize_text(x) if x is not None else x)

0        excellent interpersonal organizational skill ...
1                                                        
2       1  high school diploma ged  2  cdl driver lice...
3       1  high school diploma ged  2  cdl driver lice...
4                                                        
                              ...                        
2941    1 thorough knowledge city state federal housin...
2942    1 thorough knowledge city state federal housin...
2943    strong preference candidate posse   proven int...
2944    strong preference candidate posse   proven int...
2945                                           error name
Name: preferred_skill_removed, Length: 2946, dtype: object