In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [7]:
apps = pd.read_csv('apps.csv')
jobs = pd.read_csv('jobs.csv')
users = pd.read_csv('users.csv')

In [9]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25511 entries, 0 to 25510
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UserID                25511 non-null  int64  
 1   WindowID              25511 non-null  int64  
 2   Split                 25511 non-null  object 
 3   City                  25511 non-null  object 
 4   State                 25511 non-null  object 
 5   Country               25511 non-null  object 
 6   ZipCode               25475 non-null  object 
 7   DegreeType            22133 non-null  object 
 8   Major                 25511 non-null  object 
 9   GraduationDate        19865 non-null  object 
 10  WorkHistoryCount      25511 non-null  int64  
 11  TotalYearsExperience  25511 non-null  float64
 12  CurrentlyEmployed     25511 non-null  object 
 13  ManagedOthers         25511 non-null  object 
 14  ManagedHowMany        25511 non-null  int64  
dtypes: float64(1), int6

In [10]:
users['DegreeType'] = users['DegreeType'].fillna('')
users['Major'] = users['Major'].fillna('')
users['TotalYearsExperience'] = users['TotalYearsExperience'].fillna('').astype(str)

users['DegreeType'] = users['DegreeType'] + ' ' + users['Major'] + ' ' + users['TotalYearsExperience']


In [11]:
users.head()

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,13,6,Test,Philadelphia,PA,US,19143,Bachelor's Psychological & Social Sciences 5.0,Psychological & Social Sciences,2011-12-01 00:00:00,6,5.0,Yes,No,0
1,64,6,Train,Columbus,OH,US,43230,Master's Business Administration and Managemen...,"Business Administration and Management, General",2011-12-01 00:00:00,3,22.0,Yes,No,0
2,101,6,Train,Brick,NJ,US,8724,High School Not Applicable 2.0,Not Applicable,,1,2.0,No,Yes,4
3,133,6,Train,Wilmington,DE,US,19802,Bachelor's Sports Management 9.0,Sports Management,2003-05-01 00:00:00,6,9.0,Yes,Yes,6
4,151,6,Train,Norwich,CT,US,6360,Business 24.0,Business,1987-01-01 00:00:00,4,24.0,No,Yes,10


In [12]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(users['DegreeType'])

In [13]:
tfidf_matrix.shape

(25511, 13013)

In [14]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [15]:
cosine_sim.shape

(25511, 25511)

In [24]:
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       ...,
       [0., 0., 1., ..., 1., 1., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:
user_based_approach = users.reset_index()
userid = user_based_approach['UserID']
indices = pd.Series(user_based_approach.index, index=user_based_approach['UserID'])

In [17]:
def get_recommendations_userwise(userid):
    idx = indices[userid]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    user_indices = [i[0] for i in sim_scores]
    user_indices_top = user_indices[1:12]
    similar_users = list(user_based_approach.loc[indices.iloc[user_indices_top]]['UserID'].values)
    return similar_users

In [25]:
print ("-----Top 10 Similar users with userId: 554------")
get_recommendations_userwise(13)

-----Top 10 Similar users with userId: 554------


[363411,
 1296070,
 465442,
 1121264,
 109317,
 1199131,
 56406,
 1209293,
 1299650,
 1306730,
 695542]

In [26]:
apps['UserID'].unique()

array([     13,      64,     101, ..., 1471948, 1472019, 1472066])

In [27]:
def get_job_id(usrid_list):
    jobs_userwise = apps['UserID'].isin(usrid_list) #
    df1 = pd.DataFrame(data = apps[jobs_userwise], columns=['JobID'])
    joblist = df1['JobID'].tolist()
    Job_list = jobs['JobID'].isin(joblist) #[1083186, 516837, 507614, 754917, 686406, 1058896, 335132])
    df_temp = pd.DataFrame(data = jobs[Job_list], columns=['JobID','Title','Description','City','State'])
    return df_temp

In [29]:
get_job_id(get_recommendations_userwise(13))

Unnamed: 0,JobID,Title,Description,City,State
1835,17497,Community Living Specialist - Direct Care Posi...,,Wayne,NJ
5347,51907,Restaurant Team Member - Crew,,Chevy Chase,MD
6007,58250,Physical Therapist - IU Health Morgan Hospital,,Bethany,IN
6545,63095,Human Capital Principal Financial Serv Emp Ben...,,Glen Allen,VA
11901,111105,Program Administrative Coordinator,,Covina,CA
17311,162856,Store Manager Off-site,,Rialto,CA
22560,216510,$$55K Receptionist - Flexible Hours,,San Marcos,CA
23084,221335,Youth Counselor Direct Care Residential,,Fitchburg,MA
26242,253478,"Specialist, Walk (Service Areas: AL & FL)",,Pensacola,FL
26401,255080,State Farm Insurance and Financial Services Agent,,Council Bluffs,IA
