In [19]:
import numpy as np
import pandas as pd

In [20]:
df = pd.read_csv('linkedin_job_postings.csv')

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1348454 entries, 0 to 1348453
Data columns (total 14 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   job_link             1348454 non-null  object
 1   last_processed_time  1348454 non-null  object
 2   got_summary          1348454 non-null  object
 3   got_ner              1348454 non-null  object
 4   is_being_worked      1348454 non-null  object
 5   job_title            1348454 non-null  object
 6   company              1348443 non-null  object
 7   job_location         1348435 non-null  object
 8   first_seen           1348454 non-null  object
 9   search_city          1348454 non-null  object
 10  search_country       1348454 non-null  object
 11  search_position      1348454 non-null  object
 12  job_level            1348454 non-null  object
 13  job_type             1348454 non-null  object
dtypes: object(14)
memory usage: 144.0+ MB


In [22]:
df = df[['job_title','company','job_link']]
df.head(3)

Unnamed: 0,job_title,company,job_link
0,Account Executive - Dispensing (NorCal/Norther...,BD,https://www.linkedin.com/jobs/view/account-exe...
1,Registered Nurse - RN Care Manager,Trinity Health MI,https://www.linkedin.com/jobs/view/registered-...
2,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,https://www.linkedin.com/jobs/view/restaurant-...


In [23]:
df = df.sample(n=1000,random_state=42)
df.shape

(1000, 3)

In [24]:
import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [25]:
ps = PorterStemmer()

In [26]:
def cleaning(txt):
    cleaned_txt = re.sub(r'[^a-zA-Z0-9\s]','',txt)
    tokens = nltk.word_tokenize(cleaned_txt.lower())
    stemming = [ ps.stem(word)
        for word in tokens if word not in stopwords.words('english')]
    
    return " ".join(stemming) 

In [27]:
cleaning('THis is a good Java Developer, \nSir')

'good java develop sir'

In [28]:
df['job_title'] = df['job_title'].astype(str).apply(lambda x:cleaning(x)) 
df['job_link'] = df['job_link'].astype(str).apply(lambda x:cleaning(x))

In [29]:
df['new_col'] = df['job_title'] +" "+ df["company"]+ " " + df['job_link']
df.head(3)

Unnamed: 0,job_title,company,job_link,new_col
766615,regist nurs rn aveanna,Health eCareers,httpswwwlinkedincomjobsviewregisterednursernat...,regist nurs rn aveanna Health eCareers httpsww...
158813,locum physician obstetr gynecolog,Weatherby Healthcare,httpswwwlinkedincomjobsviewlocumphysicianobste...,locum physician obstetr gynecolog Weatherby He...
697436,technolog sale recruit busi develop manag,Proven Recruiting,httpswwwlinkedincomjobsviewtechnologysalesrecr...,technolog sale recruit busi develop manag Prov...


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
tfif = TfidfVectorizer()
matrix = tfif.fit_transform(df['new_col'])
similarity = cosine_similarity(matrix)

In [32]:
similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.02358749,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.02358749, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [33]:
enumerate(similarity[0])

<enumerate at 0x13df1fcf100>

In [34]:
def recommendation(title):
    idx = df[df['job_title'] == title].index[0]
    idx = df.index.get_loc(idx)
    distances = sorted(list(enumerate(similarity[idx])),key = lambda x:x[1], reverse=False)[1:20]

    jobs = []

    for i in distances:
        jobs.append(df.iloc[i[0]].Title)
        return jobs

In [35]:
df['job_title']

766615                                regist nurs rn aveanna
158813                     locum physician obstetr gynecolog
697436             technolog sale recruit busi develop manag
294621          travel rn oncolog 186200week 23731031expplat
654709     floor supervisor full time tommi hilfig la veg...
                                 ...                        
1316917                                       master teacher
530197                                  famili law solicitor
1224052     armi cybersecur strategi specialist secret clear
81954                            manag threat model assessor
1088132                       2nd shift custom servic repres
Name: job_title, Length: 1000, dtype: object

In [36]:
import pickle 
pickle.dump(df,open('df.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))