In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [2]:
"""THE PURPOSE OF THIS PROJECT IS TO BUILD A JOB RECOMMENDATION SYSTEM USING US jobs datasets,
--https://www.kaggle.com/datasets/JobsPikrHQ/usa-based-job-data-set-from-300-companies--
the dataset comprises of several job listings,the companies and the last dates to apply for the jobs and the job description
The dataset contains 24,376 rows and 8 columns
First I checked for the null values in the dataset and these were the results


company_name              0
geo                       0
JobTitle                  0
JobText                 233
JobLocation            8062
JobPostDate           18134
last_date_to_apply    2378

"""
df = pd.read_csv("C:\\Datasets\\US-based jobs.csv")
df=df.head(1000)
df.isnull().sum()


url                     0
company_name            0
geo                     0
JobTitle                0
JobText                 0
JobLocation           616
JobPostDate           688
last_date_to_apply    999
dtype: int64

In [3]:
# The method of data cleaning will be filling  with the most common elements 
df['last_date_to_apply'].value_counts().head(10)

2018-04-03T12:00:00+00:00    1
Name: last_date_to_apply, dtype: int64

In [4]:
df['last_date_to_apply'] = df['last_date_to_apply'].fillna('2018-01-01T12:00:00+00:00')

In [5]:
df['JobPostDate'].value_counts().head(10)

2017-10-06T12:00:00+00:00    69
2017-10-12T12:00:00+00:00    65
2017-10-09T12:00:00+00:00    44
2017-10-10T12:00:00+00:00    40
2017-10-02T12:00:00+00:00    40
2017-10-04T12:00:00+00:00    16
2017-10-03T12:00:00+00:00    12
2017-10-05T12:00:00+00:00     4
2017-09-28T12:00:00+00:00     4
2017-10-13T12:00:00+00:00     3
Name: JobPostDate, dtype: int64

In [6]:
df['JobPostDate'] = df['JobPostDate'].fillna('2017-10-10T12:00:00+00:00')

In [7]:
df.isnull().sum()

url                     0
company_name            0
geo                     0
JobTitle                0
JobText                 0
JobLocation           616
JobPostDate             0
last_date_to_apply      0
dtype: int64

In [8]:
df['JobLocation'].value_counts().head()

US-FL-Orlando                                 45
California-Los Gatos-Terraces of Los Gatos    14
US-FL-Altamonte Springs                       13
Multiple Locations                            12
Washington DC Metro Area                      11
Name: JobLocation, dtype: int64

In [9]:
df['JobLocation'] = df['JobLocation'].fillna('Multiple Locations')

In [10]:
df['JobText'] = df['JobText'].fillna('A good job to start with ,requires adequate experience')

In [11]:
df.isnull().sum()

url                   0
company_name          0
geo                   0
JobTitle              0
JobText               0
JobLocation           0
JobPostDate           0
last_date_to_apply    0
dtype: int64

In [12]:
  """Now I will create a function that returns most appearing items in a column"""
def most_common(column,df=df):
    """This function returns top ten appearing elements in a column"""
    print(df[column].value_counts().head(20))

In [13]:
most_common('company_name')
#Amazon is the leading company ,interms of Job Applications

Amazon.com                    590
Adventist Health System       106
Airgas Inc.                    98
AllianceData                   68
ABHOW                          39
Advantage SCI                  17
Acelero Learning               16
Adient                         16
21Tech                         12
Addx                           11
Aimco                           9
Aero Communications Inc         6
AAR Corp.                       4
Alere                           4
ALTOUR                          2
Ace Mart Restaurant Supply      1
Air Canada                      1
Name: company_name, dtype: int64


In [14]:
# Now for all the columns,first drop url since it is not of much help in our recommender
df.drop(['url'],axis=1,inplace=True)

In [15]:
amazon = df[df['company_name']=='Amazon.com'].value_counts()
most_common('company_name')

Amazon.com                    590
Adventist Health System       106
Airgas Inc.                    98
AllianceData                   68
ABHOW                          39
Advantage SCI                  17
Acelero Learning               16
Adient                         16
21Tech                         12
Addx                           11
Aimco                           9
Aero Communications Inc         6
AAR Corp.                       4
Alere                           4
ALTOUR                          2
Ace Mart Restaurant Supply      1
Air Canada                      1
Name: company_name, dtype: int64


In [16]:
amazon = dict(df[df['company_name']=='Amazon.com']['company_name'].value_counts())
Matson= dict(df[df['company_name']=='MATSON NAVIGATION COMPANY, INC.']['company_name'].value_counts())
Northshore = dict(df[df['company_name']=='North Shore Medical Center ']['company_name'].value_counts())
photogenic = dict(df[df['company_name']=='Photogenic Inc.']['company_name'].value_counts())
Petco = dict(df[df['company_name']=='Petco']['company_name'].value_counts())
Mondelez = dict(df[df['company_name']=='Petco']['company_name'].value_counts())
Marriot =dict(Petco = df[df['company_name']=='Mariot']['company_name'].value_counts())
companies =[amazon,Matson,Northshore,photogenic,Petco,Mondelez,Marriot]


In [17]:
df.columns

Index(['company_name', 'geo', 'JobTitle', 'JobText', 'JobLocation',
       'JobPostDate', 'last_date_to_apply'],
      dtype='object')

In [18]:
df.drop(['last_date_to_apply'],axis=1,inplace=True)

In [19]:
df.columns

Index(['company_name', 'geo', 'JobTitle', 'JobText', 'JobLocation',
       'JobPostDate'],
      dtype='object')

In [20]:
most_common('JobTitle')

Software Development Engineer                                  23
Yard Specialist – Weekend Part Time – 16 Hours                 22
Product Design Engineer                                        13
Control Systems Technician                                     12
Controls Engineer                                              12
Sr. Human Resources Assistant                                  11
Maintenance Technician II - Military Veterans                  11
Current Acelero Employees, Please Apply From the Link Below    10
Safety Specialist                                              10
Maintenance Technician II                                      10
Seattle Hiring Event - Technical Program Manager               10
IT Support Engineer                                             9
Search Openings                                                 9
Global IT Support Specialist                                    9
MS SQL Consultant                                               9
Maintenanc

In [21]:
# A function to get important features
def get_important_features(df):
    important_features = []
    for i in range(0,df.shape[0]):
                   important_features.append(df['company_name'][i]+df['JobTitle'][i]+df['JobText'][i]+df['JobLocation'][i])
    return  important_features
    

In [65]:
df['important_features'] = get_important_features(df)

In [66]:
"""Building a Job Recommendation engine using tfidf  and cosine similarity"""
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix=tfidf.fit_transform(df['important_features'])
tfidf_matrix.shape

(1000, 1514)

In [67]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix)
indices =pd.Series(df.index,index= df['JobTitle'])

In [68]:
my_important_features = df['important_features']

In [84]:

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    book_indices = [i[0] for i in sim_scores]

    return df['company_name'].iloc[indices]

In [88]:
get_recommendations('MS SQL Consultant')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()