In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Directory prefix for the CSV files read and written below. The trailing slash
# is required because filenames are appended with plain string concatenation.
path = 'sample_ds/'


In [13]:
# Load the two raw tables from the data directory: universities first, then jobs.
df_universities, df_jobs = (
    pd.read_csv(path + filename) for filename in ('unis.csv', 'jobs.csv')
)
                              

In [14]:
def clean_structure(text):
    """Normalise one course-description string.

    The substitutions run in the order listed below, then the result is
    lower-cased and stripped of leading/trailing whitespace:

    1. backslash-escaped single quotes become plain single quotes;
    2. backslash-escaped double quotes become plain double quotes;
    3. "<digits> credit(s)" annotations are removed (this runs before
       lower-casing, so the match is case-sensitive);
    4. colons become spaces;
    5. every run of whitespace collapses to a single space.

    Parameters
    ----------
    text : str
        Raw course text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # (pattern, replacement) pairs; order is significant.
    substitutions = (
        (r'\\\'', "'"),
        (r'\\\"', '"'),
        (r'\d+\scredit\(s\)', ''),
        (r':', ' '),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower().strip()


def clean_structure_field(text):
    """Clean a whole ``structure`` cell holding a bracketed, comma-separated list.

    Leading/trailing square brackets are stripped, the remainder is split on
    every comma, each piece is passed through ``clean_structure``, and the
    cleaned pieces are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw field value.

    Returns
    -------
    str
        Space-joined cleaned pieces.
    """
    courses = text.strip('[]').split(',')
    return ' '.join(map(clean_structure, courses))


In [15]:
# Clean the text columns: missing values become empty strings, and every
# non-empty course structure is normalised with clean_structure_field.
df_universities['structure'] = (
    df_universities['structure']
    .fillna('')
    .apply(lambda raw: clean_structure_field(raw) if raw else '')
)
df_jobs['skills'] = df_jobs['skills'].fillna('')


In [16]:
# Text inputs for the similarity step: one document per DataFrame row.
university_structures = df_universities['structure']
job_responsibilities = df_jobs['skills']

In [17]:
# Use Model to Identify Similarities
# The vocabulary and IDF weights are learned from the job text only; the
# university text is then projected onto that same vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_jobs = vectorizer.fit_transform(job_responsibilities)
tfidf_matrix_universities = vectorizer.transform(university_structures)

# Rows correspond to jobs, columns to universities.
similarity_matrix = cosine_similarity(
    X=tfidf_matrix_jobs,
    Y=tfidf_matrix_universities,
)

def match_jobs_to_universities(similarity_matrix, df_jobs, df_universities):
    """Return, for every job, the university row with the highest similarity.

    Parameters
    ----------
    similarity_matrix : array-like of shape (n_jobs, n_universities)
        Row ``i`` holds the similarity of job ``i`` to every university row.
    df_jobs : pandas.DataFrame
        Not used by the matching itself; kept so existing callers keep working.
    df_universities : pandas.DataFrame
        Candidate rows, addressed positionally by the column index of
        ``similarity_matrix``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``similarity_matrix``, with a fresh 0..n-1 index and
        the columns and dtypes of ``df_universities`` (also when there are
        zero jobs).

    Raises
    ------
    ValueError
        Propagated from ``np.argmax`` when the matrix has no university
        columns to choose from.

    Notes
    -----
    ``np.argmax`` returns the first maximum, so ties, including a job whose
    similarities are all zero, resolve to the lowest university position.
    """
    # A single vectorised argmax over the university axis replaces the
    # per-row Python loop.
    best_positions = np.argmax(np.asarray(similarity_matrix), axis=1)
    # Positional indexing selects all matched rows at once and keeps the
    # source columns and dtypes, instead of rebuilding a frame from a list of
    # object-dtype row Series.
    return df_universities.iloc[best_positions].reset_index(drop=True)

# Pick the best-matching university row for every job.
df_universities_matched = match_jobs_to_universities(
    similarity_matrix, df_jobs, df_universities
)

# Put each job next to its matched university. Both frames get a fresh
# 0..n-1 index first so the column-wise concat aligns them row by row.
df_jobs_with_universities = pd.concat(
    [
        df_jobs.reset_index(drop=True),
        df_universities_matched.reset_index(drop=True),
    ],
    axis='columns',
)

In [18]:
# Highest job-to-university cosine similarity found anywhere in the matrix.
np.max(similarity_matrix)

np.float64(0.789967634429278)

In [19]:
# Show the merged jobs + matched-university frame as the cell output.
df_jobs_with_universities

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,tution_2_money,tution_2_type,tuition_price_specification,start_date,ielts_score,structure,academic_req,facts,city,program_url
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,,,,2017-11-13 00:00:00,6.5,'community participation' 'social media for or...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2017-11-13 00:00:00 You can...,"['Toowoomba', 'Springfield Central']",http://www.mastersportal.eu/studies/50229/grad...
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,6344.0,National,Tuition (Year),2018-02-01 00:00:00,6.5,'introduction to teaching and research in the ...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-02-01 00:00:00 You can...,['Adelaide'],http://www.mastersportal.eu/studies/110288/tea...
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,,,,,6.0,'introduction to quality health services' 'eva...,"<section id=""AcademicRequirements""> <h2>Academ...",['Deadline and start date Applications are ac...,['Hobart'],http://www.mastersportal.eu/studies/80343/qual...
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,31680.0,National,Tuition (Year),2018-06-24 00:00:00,6.5,'internet engineering' 'mobile and wireless ne...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-06-24 00:00:00 You can...,['Melbourne'],http://www.mastersportal.eu/studies/74170/tele...
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,,,Tuition (Year),2018-02-01 00:00:00,6.5,'content marketing and advertising' 'external ...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-02-01 00:00:00 You can...,['Melbourne'],http://www.mastersportal.eu/studies/62913/mark...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8074,1003521221117999,4 to 9 Years,MCA,$65K-$118K,Nairobi,Kenya,-1.2864,36.8172,Temporary,48819,...,25000.0,National,Tuition (Year),2018-02-27 00:00:00,6.5,'advanced consumer behaviour' 'strategic marke...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-02-27 00:00:00 You can...,['Brisbane'],http://www.mastersportal.eu/studies/62284/mark...
8075,2303241016029011,1 to 14 Years,M.Tech,$59K-$97K,Majuro,Marshall Islands,7.1315,171.1845,Full-Time,115877,...,46604.0,National,Tuition (Full programme),2018-10-31 00:00:00,6.5,'procurement and logistics' 'supply chain anal...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-10-31 00:00:00 You can...,['Melbourne'],http://www.mastersportal.eu/studies/62906/supp...
8076,662454142875177,1 to 14 Years,B.Com,$63K-$82K,Kingstown,St. Vincent and the Grenadines,12.9843,-61.2872,Contract,46282,...,897.0,National,Tuition (Full programme),2018-02-03 00:00:00,,'attendance component a weekend of intensive l...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-02-03 00:00:00 You can...,['Melbourne'],https://www.preparationcoursesportal.com/studi...
8077,2594337392461923,1 to 10 Years,MBA,$60K-$99K,Jakarta,Indonesia,-0.7893,113.9213,Part-Time,129532,...,,,,2018-03-01 00:00:00,6.5,'information security' 'security fundamentals'...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-03-01 00:00:00 You can...,,http://www.mastersportal.eu/studies/157278/inf...


In [20]:
# Persist the merged frame. index=True (the pandas default) also writes the
# row index as the first, unnamed CSV column.
df_jobs_with_universities.to_csv(path_or_buf=path + 'merged_ds.csv', index=True)