In [4]:
import nltk
from nltk.stem import WordNetLemmatizer
import findspark

In [5]:
# Make pyspark importable as a regular library.
# This has to run before the first `import pyspark...` statement is executed.
findspark.init()

In [7]:
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.types import StringType

In [None]:
def normalize_salary(df):
    """Add annualized salary bounds to a Spark DataFrame.

    Feature engineering / normalization step: the lower and upper salary bounds
    are converted to a yearly amount according to the "Salary Frequency" column
    and stored in two new columns, "Annual Salary From" and "Annual Salary To".

    Conversion rules:
        * "Hourly" -> value * 2080 (assumes 2080 work hours per year)
        * "Weekly" -> value * 52
        * "Daily"  -> value * 260  (assumes 5 workdays per week)
        * any other frequency -> value is kept unchanged

    Args:
        df: DataFrame providing the columns "Salary Frequency",
            "Salary Range From" and "Salary Range To".

    Returns:
        The DataFrame produced by adding "Annual Salary From" and
        "Annual Salary To" to ``df``.
    """
    # `when` / `col` are reached through the `f` alias: the notebook imports
    # pyspark.sql.functions only as `f`, so the bare names would raise NameError.
    frequency = f.col("Salary Frequency")

    def annualize(source_col):
        # Build the frequency-dependent conversion expression for one salary column.
        amount = f.col(source_col)
        return (
            f.when(frequency == "Hourly", amount * 2080)
            .when(frequency == "Weekly", amount * 52)
            .when(frequency == "Daily", amount * 260)
            .otherwise(amount)
        )

    normalized_df = (
        df.withColumn("Annual Salary From", annualize("Salary Range From"))
        .withColumn("Annual Salary To", annualize("Salary Range To"))
    )

    return normalized_df


In [4]:
def text_preparation(df, col_name):
    """Clean a free-text column and expose the result as ``<col_name>_prepared``.

    Processing steps:
        1. Cleaning: remove punctuation, digits, trailing spaces/tabs and a
           single leading space/tab; null values become the empty string.
        2. Tokenization: split the cleaned text into words.
        3. Stop-word removal: drop words that carry no value.
        4. Lemmatization: convert every remaining non-empty word to its
           dictionary form (WordNet) and join the words with single spaces.

    The intermediate columns (``_non_punch``, ``_splitted``, ``_removed``) are
    dropped before returning.

    Args:
        df: Spark DataFrame containing the column ``col_name``.
        col_name: Name of the text column to prepare.

    Returns:
        DataFrame with the additional string column ``<col_name>_prepared``.
    """
    cleaned_col = col_name + '_non_punch'
    tokens_col = col_name + '_splitted'
    filtered_col = col_name + '_removed'
    prepared_col = col_name + '_prepared'

    # Raw string so that \w, \s and \d are not treated as (invalid) Python escape
    # sequences. The regex engine itself understands \t, so the pattern still
    # matches exactly the same text as before.
    noise_pattern = r"[^\w\s]+|[ \t]+$|[\d]|^[ \t]"

    # Cleaning - remove punctuation / digits; coalesce turns null text into ''.
    cleaned_df = df.withColumn(
        cleaned_col,
        f.coalesce(f.regexp_replace(f.col(col_name), noise_pattern, ""), f.lit('')),
    )

    # Tokenization - split into words.
    tokenizer = Tokenizer(inputCol=cleaned_col, outputCol=tokens_col)
    token_df = tokenizer.transform(cleaned_df)

    # Remove stop words - words which do not hold any value.
    remover = StopWordsRemover(inputCol=tokens_col, outputCol=filtered_col)
    remover_df = remover.transform(token_df)

    # Lemmatization - convert each word into its dictionary form.
    lemmatizer = WordNetLemmatizer()

    def lemmatize_text(words):
        # Empty tokens (e.g. produced by an empty input string) are skipped.
        return " ".join(lemmatizer.lemmatize(word) for word in words if word != '')

    lemmatize_udf = f.udf(lemmatize_text, StringType())

    # Write the final column directly and drop the intermediate columns.
    clean_df = (
        remover_df
        .withColumn(prepared_col, lemmatize_udf(f.col(filtered_col)))
        .drop(cleaned_col, tokens_col, filtered_col)
    )

    return clean_df
    
    
    
    

In [None]:
# Print a confirmation message when this cell runs; presumably it signals to a
# calling notebook that the helper functions are loaded — confirm how this
# notebook is consumed.
print("Data Preparation Functions Imported")