In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os

In [3]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [6]:
!kaggle datasets download -d asaniczka/1-3m-linkedin-jobs-and-skills-2024 --unzip

Dataset URL: https://www.kaggle.com/datasets/asaniczka/1-3m-linkedin-jobs-and-skills-2024
License(s): ODC Attribution License (ODC-By)
Downloading 1-3m-linkedin-jobs-and-skills-2024.zip to /content
 99% 1.87G/1.88G [00:27<00:00, 117MB/s] 
100% 1.88G/1.88G [00:27<00:00, 72.6MB/s]


In [6]:
data = spark.read.csv('/content/job_skills.csv', header=True, inferSchema=True)

In [7]:
data = data.sample(0.01, seed=123)

In [8]:
data.take(5)

[Row(job_link='https://www.linkedin.com/jobs/view/assistant-center-operations-director-at-concentra-3800509465', job_skills='Service Mentality, Attention to Detail, Sense of Urgency, Initiative, Flexibility, Logic, Problemsolving, Customer service, Tact, Professionalism, HIPAA, PHI, Federal and state laws, Strong service mentality, Telephone etiquette, Personal etiquette, Warm, Positive, Energetic, Professional, Oral communication, Written communication, Tactful communication, Diplomatic communication, Personnel recruitment, Performance management, Performance assessment, Selfdevelopment, Proficient in computer applications, Word, Excel, Coordinating, Prioritizing, Multitasking, Opportunity identification, Plan of action development, Plan implementation, Plan evaluation, Conflict resolution, 401(k) Retirement Plan, Medical Plan, Vision Plan, Prescription Plan, Telehealth Plan, Dental Plan, Life and Disability Insurance, Paid Time Off, Extended Illness Days, Colleague Referral Bonus, Tu

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_skills(skill):
    if skill is None or not isinstance(skill, str):
      return []

    tokens = word_tokenize(skill)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words and word.isalnum()]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return list(set(lemmatized_tokens))

In [11]:
baskets = data.rdd.map(lambda x: preprocess_skills(x[1]))

In [12]:
baskets.take(5)

[['development',
  'logic',
  'oral',
  'warm',
  'disability',
  'referral',
  'insurance',
  'resolution',
  'law',
  'extended',
  'state',
  'attention',
  'k',
  'initiative',
  'proficient',
  'assessment',
  'customer',
  'management',
  'urgency',
  'employee',
  'account',
  'energetic',
  'phi',
  'tactful',
  'dependent',
  'identification',
  'selfdevelopment',
  'sense',
  'tuition',
  'prescription',
  'time',
  'implementation',
  'diplomatic',
  'word',
  'medical',
  'telehealth',
  'written',
  'problemsolving',
  'performance',
  'opportunity',
  'illness',
  'plan',
  'life',
  'flexibility',
  'hipaa',
  'communication',
  'tact',
  'detail',
  'positive',
  '401',
  'personnel',
  'application',
  'dental',
  'multitasking',
  'strong',
  'service',
  'benefit',
  'conflict',
  'personal',
  'telephone',
  'etiquette',
  'commuter',
  'reimbursement',
  'professional',
  'recruitment',
  'professionalism',
  'excel',
  'bonus',
  'action',
  'spending',
  'vision'

In [13]:
#total baskets
baskets.count()

12868

In [14]:
#taking the whole items in a list
skills = baskets.flatMap(lambda line: line)

In [15]:
skills.take(50)

['development',
 'logic',
 'oral',
 'warm',
 'disability',
 'referral',
 'insurance',
 'resolution',
 'law',
 'extended',
 'state',
 'attention',
 'k',
 'initiative',
 'proficient',
 'assessment',
 'customer',
 'management',
 'urgency',
 'employee',
 'account',
 'energetic',
 'phi',
 'tactful',
 'dependent',
 'identification',
 'selfdevelopment',
 'sense',
 'tuition',
 'prescription',
 'time',
 'implementation',
 'diplomatic',
 'word',
 'medical',
 'telehealth',
 'written',
 'problemsolving',
 'performance',
 'opportunity',
 'illness',
 'plan',
 'life',
 'flexibility',
 'hipaa',
 'communication',
 'tact',
 'detail',
 'positive',
 '401']

In [16]:
#total number of skills
skills.count()

457130

In [17]:
#total number of distinct items
dis_skills = skills.distinct().collect()
len(dis_skills)

19230

In [18]:
#transforming each unique skill to an integer
def encoding(item,basket):

  unique_skills = item.distinct().collect()

  to_int = {item: index for index, item in enumerate(unique_skills)}
  to_str = {index: item for item, index in to_int.items()}

  skill_en = item.map(lambda x: to_int[x])

  baskets_en = basket.map(lambda x: [to_int[item] for item in x])

  return baskets_en, skill_en, to_int, to_str

In [19]:
baskets_en, encoded_skills, to_int, to_str = encoding(skills, baskets)

In [20]:
encoded_skills.take(10)

[0, 9573, 6349, 9574, 12776, 12777, 12778, 15955, 12779, 9575]

In [21]:
baskets_en.take(2)

[[0,
  9573,
  6349,
  9574,
  12776,
  12777,
  12778,
  15955,
  12779,
  9575,
  1,
  3169,
  9576,
  3170,
  12780,
  15956,
  6350,
  2,
  6351,
  3171,
  3,
  3172,
  6352,
  15957,
  12781,
  15958,
  15959,
  3173,
  15960,
  4,
  15961,
  3174,
  3175,
  9577,
  5,
  6353,
  15962,
  12782,
  12783,
  6354,
  3176,
  12784,
  3177,
  15963,
  15964,
  15965,
  9578,
  12785,
  6355,
  3178,
  15966,
  15967,
  12786,
  6,
  9579,
  12787,
  7,
  15968,
  9580,
  12788,
  9581,
  15969,
  6356,
  8,
  9582,
  6357,
  9583,
  12789,
  9,
  6358,
  10,
  15970,
  12790,
  9584,
  15971,
  3179,
  3180,
  12791,
  3181,
  12792,
  6359,
  9585,
  3182],
 [11, 12, 0, 9586, 3183, 9587, 13, 15972, 15973]]

In [22]:
#computing the frequecy for each skill
def support(rdd):
  return (rdd.map(lambda item: (item, 1))\
            .reduceByKey(lambda a,b: a+b)\
            .sortBy(lambda a: a[1], False))

In [23]:
skills_support = support(encoded_skills)

In [24]:
skills_support.take(100)

[(15965, 6981),
 (2, 6551),
 (3193, 4565),
 (12787, 4398),
 (15972, 3688),
 (6350, 3654),
 (12805, 2773),
 (6394, 2731),
 (6382, 2477),
 (12810, 2417),
 (3205, 2360),
 (9584, 2342),
 (15961, 2230),
 (15991, 2220),
 (0, 2213),
 (12824, 2193),
 (15975, 2143),
 (3185, 2117),
 (5, 2114),
 (6376, 2081),
 (3245, 2055),
 (15986, 2040),
 (3216, 1953),
 (3189, 1943),
 (15992, 1897),
 (6371, 1776),
 (3224, 1736),
 (12782, 1714),
 (16011, 1676),
 (6510, 1675),
 (6385, 1647),
 (55, 1629),
 (6389, 1588),
 (36, 1564),
 (3243, 1562),
 (53, 1557),
 (9590, 1551),
 (9616, 1510),
 (12785, 1495),
 (12834, 1443),
 (12784, 1406),
 (12778, 1397),
 (12795, 1395),
 (48, 1391),
 (3169, 1385),
 (16026, 1375),
 (15971, 1375),
 (9600, 1358),
 (9606, 1356),
 (9611, 1333),
 (12812, 1331),
 (93, 1317),
 (15985, 1291),
 (6367, 1248),
 (12854, 1243),
 (12850, 1232),
 (88, 1230),
 (12833, 1197),
 (8, 1153),
 (9605, 1145),
 (12847, 1123),
 (9625, 1120),
 (6369, 1109),
 (104, 1064),
 (72, 1059),
 (6368, 1051),
 (10, 1048)

In [25]:
threshold = 12868 * 0.1

In [26]:
#adding a threshold to the supports and normalizing them by the total number of baskets
filtered_support = skills_support.filter(lambda item: item[1] >= threshold )\
                                  .map(lambda item: (item[0], item[1] / 13056))\
                                  .sortBy(lambda item: item[1], False)

In [27]:
filtered_support.take(10)

[(15965, 0.5346966911764706),
 (2, 0.5017616421568627),
 (3193, 0.34964767156862747),
 (12787, 0.3368566176470588),
 (15972, 0.2824754901960784),
 (6350, 0.27987132352941174),
 (12805, 0.21239276960784315),
 (6394, 0.20917585784313725),
 (6382, 0.18972120098039216),
 (12810, 0.18512561274509803)]

In [28]:
frequents = filtered_support.map(lambda item: item[0]).collect()

In [29]:
#finding binary and triple itemsets with their support
from itertools import combinations

def items_comb(c):

  candidate_itemsets = list(combinations(frequents, c))

  def itemsets(basket):
    return [(itemset, 1) for itemset in candidate_itemsets if all(item in basket for item in itemset)]

  candidates = baskets_en.flatMap(itemsets)\
                               .reduceByKey(lambda a, b: a + b)\
                               .filter(lambda itemset: itemset[1] >= threshold)\
                               .map(lambda item: (item[0], item[1] / 13056))\
                               .sortBy(lambda itemset: itemset[1], False)


  return candidates.collect()

In [30]:
binary_items = items_comb(2)
binary_items

[((15965, 2), 0.34329044117647056),
 ((15965, 3193), 0.28921568627450983),
 ((12787, 6350), 0.2424938725490196),
 ((15965, 12787), 0.23253676470588236),
 ((2, 3193), 0.22035845588235295),
 ((15965, 6350), 0.2115502450980392),
 ((2, 12787), 0.20235906862745098),
 ((2, 6350), 0.17769607843137256),
 ((15965, 15972), 0.17164522058823528),
 ((15965, 6382), 0.16881127450980393),
 ((15965, 12805), 0.16337316176470587),
 ((2, 12805), 0.16207107843137256),
 ((2, 15972), 0.1565563725490196),
 ((2, 15961), 0.14238664215686275),
 ((15965, 6394), 0.13840379901960784),
 ((3193, 12787), 0.13671875),
 ((15965, 15961), 0.13097426470588236),
 ((2, 12810), 0.13074448529411764),
 ((3193, 15972), 0.12913602941176472),
 ((15965, 12810), 0.12706801470588236),
 ((3185, 6376), 0.12277879901960784),
 ((3193, 6350), 0.12231924019607843),
 ((2, 6371), 0.12147671568627451),
 ((15965, 12782), 0.12055759803921569),
 ((3224, 53), 0.11925551470588236),
 ((15965, 3185), 0.1174938725490196),
 ((2, 6382), 0.1174938725490

In [31]:
len(binary_items)

50

In [32]:
binary_decoded =[(([to_str[item] for item in pair], support)) for pair, support in binary_items]
binary_decoded

[(['communication', 'management'], 0.34329044117647056),
 (['communication', 'skill'], 0.28921568627450983),
 (['service', 'customer'], 0.2424938725490196),
 (['communication', 'service'], 0.23253676470588236),
 (['management', 'skill'], 0.22035845588235295),
 (['communication', 'customer'], 0.2115502450980392),
 (['management', 'service'], 0.20235906862745098),
 (['management', 'customer'], 0.17769607843137256),
 (['communication', 'experience'], 0.17164522058823528),
 (['communication', 'teamwork'], 0.16881127450980393),
 (['communication', 'leadership'], 0.16337316176470587),
 (['management', 'leadership'], 0.16207107843137256),
 (['management', 'experience'], 0.1565563725490196),
 (['management', 'time'], 0.14238664215686275),
 (['communication', 'work'], 0.13840379901960784),
 (['skill', 'service'], 0.13671875),
 (['communication', 'time'], 0.13097426470588236),
 (['management', 'team'], 0.13074448529411764),
 (['skill', 'experience'], 0.12913602941176472),
 (['communication', 'te

In [33]:
triple_items = items_comb(3)
triple_items

[((15965, 2, 3193), 0.19010416666666666),
 ((15965, 12787, 6350), 0.18857230392156862),
 ((15965, 2, 12787), 0.1526501225490196),
 ((2, 12787, 6350), 0.15104166666666666),
 ((15965, 2, 6350), 0.1404718137254902),
 ((15965, 2, 12805), 0.13151041666666666),
 ((15965, 2, 15961), 0.12071078431372549),
 ((15965, 3193, 12787), 0.11795343137254902),
 ((15965, 2, 15972), 0.11144301470588236),
 ((15965, 2, 6382), 0.1094515931372549),
 ((15965, 3193, 15972), 0.10723039215686274),
 ((15965, 3193, 6350), 0.10677083333333333),
 ((15965, 3224, 53), 0.10631127450980392),
 ((3193, 12787, 6350), 0.10493259803921569)]

In [34]:
len(triple_items)

14

In [35]:
triple_decoded =[(([to_str[item] for item in triple], support)) for triple, support in triple_items]
triple_decoded

[(['communication', 'management', 'skill'], 0.19010416666666666),
 (['communication', 'service', 'customer'], 0.18857230392156862),
 (['communication', 'management', 'service'], 0.1526501225490196),
 (['management', 'service', 'customer'], 0.15104166666666666),
 (['communication', 'management', 'customer'], 0.1404718137254902),
 (['communication', 'management', 'leadership'], 0.13151041666666666),
 (['communication', 'management', 'time'], 0.12071078431372549),
 (['communication', 'skill', 'service'], 0.11795343137254902),
 (['communication', 'management', 'experience'], 0.11144301470588236),
 (['communication', 'management', 'teamwork'], 0.1094515931372549),
 (['communication', 'skill', 'experience'], 0.10723039215686274),
 (['communication', 'skill', 'customer'], 0.10677083333333333),
 (['communication', 'problem', 'solving'], 0.10631127450980392),
 (['skill', 'service', 'customer'], 0.10493259803921569)]

In [37]:
supports = filtered_support.collect()
supports = [(to_str[item], support) for item, support in supports]

In [38]:
supports

[('communication', 0.5346966911764706),
 ('management', 0.5017616421568627),
 ('skill', 0.34964767156862747),
 ('service', 0.3368566176470588),
 ('experience', 0.2824754901960784),
 ('customer', 0.27987132352941174),
 ('leadership', 0.21239276960784315),
 ('work', 0.20917585784313725),
 ('teamwork', 0.18972120098039216),
 ('team', 0.18512561274509803),
 ('license', 0.18075980392156862),
 ('care', 0.1793811274509804),
 ('time', 0.17080269607843138),
 ('safety', 0.17003676470588236),
 ('development', 0.16950061274509803),
 ('training', 0.16796875),
 ('degree', 0.16413909313725492),
 ('office', 0.16214767156862744),
 ('medical', 0.16191789215686275),
 ('microsoft', 0.15939031862745098),
 ('health', 0.15739889705882354),
 ('system', 0.15625),
 ('certification', 0.14958639705882354),
 ('data', 0.14882046568627452),
 ('analysis', 0.14529718137254902),
 ('project', 0.13602941176470587),
 ('problem', 0.1329656862745098),
 ('problemsolving', 0.13128063725490197),
 ('patient', 0.1283700980392156

In [47]:
supports_dict = {item: count for item, count in supports}
rules1 = []

for items, supp in binary_decoded:
  item1, item2 = items
  if item1 in supports_dict and item2 in supports_dict:
    antecdent= item1
    ante_supp = supports_dict[item1]
    confidence = (supp) / (ante_supp)
    consequent_supp = supports_dict[item2]
    consequent = item2
    lift = confidence / (consequent_supp)
    if confidence >= 0.7:
     rules1.append((item1, item2, round(supp,4), round(confidence, 4), round(lift, 4)))


  if item1 in supports_dict and item2 in supports_dict:
    antecedent = item2
    ante_supp = supports_dict[item2]
    confidence = (supp) / (ante_supp)
    consequent_supp = supports_dict[item1]
    consequent = item1
    lift = confidence / (consequent_supp)
    if confidence >= 0.7:
     rules1.append((item2, item1, round(supp,4), round(confidence, 4), round(lift, 4)))

In [48]:
rules1

[('skill', 'communication', 0.2892, 0.8272, 1.547),
 ('service', 'customer', 0.2425, 0.7199, 2.5722),
 ('customer', 'service', 0.2425, 0.8664, 2.5722),
 ('customer', 'communication', 0.2116, 0.7559, 1.4137),
 ('teamwork', 'communication', 0.1688, 0.8898, 1.6641),
 ('leadership', 'communication', 0.1634, 0.7692, 1.4386),
 ('leadership', 'management', 0.1621, 0.7631, 1.5208),
 ('time', 'management', 0.1424, 0.8336, 1.6614),
 ('time', 'communication', 0.131, 0.7668, 1.4341),
 ('team', 'management', 0.1307, 0.7062, 1.4075),
 ('office', 'microsoft', 0.1228, 0.7572, 4.7506),
 ('microsoft', 'office', 0.1228, 0.7703, 4.7506),
 ('project', 'management', 0.1215, 0.893, 1.7798),
 ('problemsolving', 'communication', 0.1206, 0.9183, 1.7175),
 ('problem', 'solving', 0.1193, 0.8969, 7.5207),
 ('solving', 'problem', 0.1193, 1.0, 7.5207),
 ('office', 'communication', 0.1175, 0.7246, 1.3552),
 ('microsoft', 'communication', 0.115, 0.7218, 1.3499),
 ('problem', 'communication', 0.1149, 0.8641, 1.616),
 (

In [49]:
len(rules1)

29

In [50]:
binary_dict = {tuple(item): count for item, count in binary_decoded}
rules2 = []

for items, supp in triple_decoded:
  item1, item2, item3 = items
  if (item1, item2) in binary_dict and item3 in supports_dict:
    antecdent= (item1, item2)
    ante_supp = binary_dict[(item1, item2)]
    confidence = supp / ante_supp
    consequent_supp = supports_dict[item3]
    consequent = item3
    lift = confidence / consequent_supp
    if confidence >= 0.7:
     rules2.append(((item1, item2), item3, round(supp,4), round(confidence, 4), round(lift, 4)))


  if item3 in supports_dict and (item1, item2) in binary_dict:
    antecedent = item3
    ante_supp = supports_dict[item3]
    confidence = supp / ante_supp
    consequent_supp = binary_dict[(item1, item2)]
    consequent = (item1, item2)
    lift = confidence / consequent_supp
    if confidence >= 0.7:
     rules2.append((item3, (item1, item2), round(supp,4), round(confidence, 4), round(lift, 4)))


In [51]:
rules2

[(('communication', 'service'), 'customer', 0.1886, 0.8109, 2.8975),
 (('management', 'service'), 'customer', 0.151, 0.7464, 2.667),
 ('time', ('communication', 'management'), 0.1207, 0.7067, 2.0587),
 (('communication', 'problem'), 'solving', 0.1063, 0.9253, 7.7592),
 ('solving', ('communication', 'problem'), 0.1063, 0.8915, 7.7592),
 (('skill', 'service'), 'customer', 0.1049, 0.7675, 2.7424)]

In [52]:
len(rules2)

6

In [53]:
import pandas as pd

df_rules1 = pd.DataFrame(rules1, columns=['Antecedent', 'Consequent', 'Support','Confidence', 'Lift'])

df_rules2 = pd.DataFrame(rules2, columns=['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift'])

rules = pd.concat([df_rules1, df_rules2], ignore_index=True)

In [55]:
rules_shuffled = rules.sample(frac=1).reset_index(drop=True)

rules_shuffled

Unnamed: 0,Antecedent,Consequent,Support,Confidence,Lift
0,time,management,0.1424,0.8336,1.6614
1,patient,care,0.1097,0.8544,4.7631
2,"(skill, service)",customer,0.1049,0.7675,2.7424
3,office,communication,0.1175,0.7246,1.3552
4,school,high,0.1059,0.8397,7.0097
5,office,microsoft,0.1228,0.7572,4.7506
6,solving,problem,0.1193,1.0,7.5207
7,solving,"(communication, problem)",0.1063,0.8915,7.7592
8,skill,communication,0.2892,0.8272,1.547
9,detail,attention,0.1052,0.9191,8.6637
