In [None]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.2'
spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [1]:
# Create (or reuse) the Spark session used throughout the notebook
import os
import sys

from pyspark.sql import SparkSession

# Use the interpreter running this kernel (sys.executable) for both PySpark Python settings
os.environ.update(
    dict.fromkeys(('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'), sys.executable)
)

spark = (
    SparkSession.builder
    .appName("Hashing")
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
import nltk
nltk.download('averaged_perceptron_tagger')
import pandas as pd
from collections import Counter
from  itertools import chain
import numpy as np
import ast as ast
import string
import re
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
nltk.download('wordnet')
stopword = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Pre-Processing


In [3]:
# Read the job-listing CSV into a Spark DataFrame, taking column names from the header row
file_path = "job_list_scientist.csv"
csv_df = spark.read.options(sep=",", header=True).csv(file_path)
csv_df.show()

+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|
+--------------------+--------------------+--------------------+--------------------+
|      Data Scientist|                 EOS|         Fremont. CA|Experience with d...|
|      Data Scientist|              TikTok|   Mountain View. CA|Lead data-driven ...|
|      Data Scientist|Environmental Ris...|Sunnyvale. CA+1 l...|Responsible for m...|
|Data Annotation S...|     Steady Platform|Playa Vista. CA•R...|We are looking fo...|
|Data Scientist. M...|             Discord|   San Francisco. CA|If you're passion...|
|Data Scientist. N...|              TikTok|   Mountain View. CA|Hands on experien...|
|      Data Scientist|             Zscaler|San Jose. CA 9513...|We are looking fo...|
|      Data Scientist|               Intel|    Folsom. CA 95630|Uses predictive m...|
|      Data Scientist|         Pocket Gems|   San Fran

In [4]:
# Split each posting's "summary" text into a new "tokens" column
tokenizer = Tokenizer().setInputCol("summary").setOutputCol("tokens")
wordsData2 = tokenizer.transform(dataset=csv_df)
wordsData2.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|              tokens|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|      Data Scientist|                 EOS|         Fremont. CA|Experience with d...|[experience, with...|
|      Data Scientist|              TikTok|   Mountain View. CA|Lead data-driven ...|[lead, data-drive...|
|      Data Scientist|Environmental Ris...|Sunnyvale. CA+1 l...|Responsible for m...|[responsible, for...|
|Data Annotation S...|     Steady Platform|Playa Vista. CA•R...|We are looking fo...|[we, are, looking...|
|Data Scientist. M...|             Discord|   San Francisco. CA|If you're passion...|[if, you're, pass...|
|Data Scientist. N...|              TikTok|   Mountain View. CA|Hands on experien...|[hands, on, exper...|
|      Data Scientist|             Zs

In [5]:
# Configure the stop-word remover: reads the "tokens" column, writes the "filtered" column
remover = StopWordsRemover()
remover.setInputCol("tokens")
remover.setOutputCol("filtered")

In [6]:
# Apply the remover to the tokenized postings; the result gains a "filtered" column
cleaneddf = remover.transform(dataset=wordsData2)
cleaneddf.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|              tokens|            filtered|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      Data Scientist|                 EOS|         Fremont. CA|Experience with d...|[experience, with...|[experience, data...|
|      Data Scientist|              TikTok|   Mountain View. CA|Lead data-driven ...|[lead, data-drive...|[lead, data-drive...|
|      Data Scientist|Environmental Ris...|Sunnyvale. CA+1 l...|Responsible for m...|[responsible, for...|[responsible, man...|
|Data Annotation S...|     Steady Platform|Playa Vista. CA•R...|We are looking fo...|[we, are, looking...|[looking, detail-...|
|Data Scientist. M...|             Discord|   San Francisco. CA|If you're passion...|[if, you're, pass..

In [7]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame and preview the first rows
pandasDF = cleaneddf.toPandas()
pandasDF.head(n=5)

Unnamed: 0,job_title,company_name,location,summary,tokens,filtered
0,Data Scientist,EOS,Fremont. CA,Experience with data visualization libraries s...,"[experience, with, data, visualization, librar...","[experience, data, visualization, libraries, m..."
1,Data Scientist,TikTok,Mountain View. CA,Lead data-driven projects from definition to e...,"[lead, data-driven, projects, from, definition...","[lead, data-driven, projects, definition, exec..."
2,Data Scientist,Environmental Risk Management Inc,Sunnyvale. CA+1 location•Temporarily Remote,Responsible for managing and developing the co...,"[responsible, for, managing, and, developing, ...","[responsible, managing, developing, companies,..."
3,Data Annotation Specialist (part-time),Steady Platform,Playa Vista. CA•Remote,We are looking for detail-oriented individuals...,"[we, are, looking, for, detail-oriented, indiv...","[looking, detail-oriented, individuals, collab..."
4,Data Scientist. Machine Learning,Discord,San Francisco. CA,If you're passionate about data. impact. and w...,"[if, you're, passionate, about, data., impact....","[passionate, data., impact., working, amazing,..."


In [8]:
#turn filtered text into a dataframe
countable = pandasDF.drop(columns=["job_title", "company_name", "location", "summary", "tokens"])

# Characters removed from every token: ( ) ? . / &
_PUNCT_TABLE = str.maketrans("", "", "()?./&")


def strip_punctuation(tokens):
    """Return ``tokens`` as a list with ``( ) ? . / &`` removed from each token.

    The tokens are cleaned directly instead of converting the whole list to its
    string form, running ``str.replace`` on it, and rebuilding it with ``eval()``.
    That round trip executed text derived from scraped data, depended on the
    pandas-version-specific regex default of ``str.replace``, and only worked
    when the cell value printed as a Python list literal.

    Parameters
    ----------
    tokens : iterable of str or None
        The stop-word-filtered tokens of one posting.

    Returns
    -------
    list of str
        Cleaned tokens in their original order. Tokens made up solely of the
        removed characters become empty strings (they are kept here, as
        before). A missing token list (``None``) yields an empty list.
    """
    if tokens is None:
        return []
    return [token.translate(_PUNCT_TABLE) for token in tokens]


#remove punctuation from every token; the column keeps holding lists of words
countable["filtered"] = countable["filtered"].apply(strip_punctuation)
countable

Unnamed: 0,filtered
0,"[experience, data, visualization, libraries, m..."
1,"[lead, data-driven, projects, definition, exec..."
2,"[responsible, managing, developing, companies,..."
3,"[looking, detail-oriented, individuals, collab..."
4,"[passionate, data, impact, working, amazing, t..."
...,...
1048,"[apply, advanced, data, mining, machine, learn..."
1049,"[we’re, looking, hardworking, data, scientists..."
1050,"[collaborates, machine, learning, data, scienc..."
1051,"[develops, technical, tools, programming, leve..."


## Part-of-Speech and Word Count

In [9]:
# Flatten every posting's word list into one Series, then use value_counts to
# count how many times each word appears.
a = pd.Series(list(chain.from_iterable(countable.filtered)))

# Build the word/count table as a single chain (no in-place mutation):
#   - sort_index + rename_axis/reset_index give a 'filtered' (word) and 'count' column,
#   - rows are ordered by count, most frequent first,
#   - empty-string words are dropped,
#   - .copy() makes the result an independent frame, so the column assignments
#     in the following cells do not raise SettingWithCopyWarning.
counted_df = (
    a.value_counts()
    .sort_index()
    .rename_axis('filtered')
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
    .loc[lambda frame: frame['filtered'] != ""]
    .copy()
)
counted_df.head()


Unnamed: 0,filtered,count
517,data,1122
1148,learning,545
1197,machine,460
771,experience,357
139,analytics,205


In [10]:
# Tag each distinct word on its own, as a one-word "sentence", so tags carry no
# sentence context (same result as calling nltk.pos_tag([word]) per row).
# pos_tag_sents handles the whole batch with a single tagger, whereas
# nltk.pos_tag builds its tagger again on every call.
tagged_words = nltk.pos_tag_sents([[word] for word in counted_df['filtered']])

# Each entry is [(word, tag)]; keep only the tag. The list is assigned positionally.
counted_df['pos'] = [tagged[0][1] for tagged in tagged_words]
counted_df.head()

Unnamed: 0,filtered,count,pos
517,data,1122,NNS
1148,learning,545,VBG
1197,machine,460,NN
771,experience,357,NN
139,analytics,205,NNS


In [11]:
def lemmatizing(tnp):
    """Return the lemma of the single word ``tnp``.

    Delegates to the notebook-level ``wn`` lemmatizer. No part-of-speech is
    passed, so the lemmatizer's default applies.
    """
    return wn.lemmatize(tnp)
# Add the lemma of every counted word as a new "lemmatize" column
counted_df["lemmatize"] = counted_df["filtered"].map(lemmatizing)
counted_df.head()

Unnamed: 0,filtered,count,pos,lemmatize
517,data,1122,NNS,data
1148,learning,545,VBG,learning
1197,machine,460,NN,machine
771,experience,357,NN,experience
139,analytics,205,NNS,analytics


In [12]:
# Sum the counts of all words that share a lemma and POS tag, most frequent first
total_df = (
    counted_df
    .groupby(["lemmatize", "pos"])["count"]
    .sum()
    .to_frame("Total_Count")
    .sort_values(by=["Total_Count"], ascending=False)
)

# Persist the aggregated table, then preview the top rows
total_df.to_csv('pos_count_scientist.csv')
total_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total_Count
lemmatize,pos,Unnamed: 2_level_1
data,NNS,1122
learning,VBG,545
machine,NN,460
experience,NN,357
analytics,NNS,205
