# Importing packages and data

In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
columns_to_drop = ['index', 'Easy Apply', 'Competitors', 'Revenue', 'Type of ownership', 'Rating']
data = pd.read_csv('Data Scientist Jobs/DataScientist.csv', index_col = 0).drop(columns_to_drop, axis=1)
data

Unnamed: 0,Job Title,Salary Estimate,Job Description,Company Name,Location,Headquarters,Size,Founded,Industry,Sector
0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Travel Agencies,Travel & Tourism
1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,"Health, Beauty, & Fitness",Consumer Services
2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,-1,-1
3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019,Internet,Information Technology
4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription...",United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007,Advertising & Marketing,Business Services
...,...,...,...,...,...,...,...,...,...,...
3904,AWS Data Engineer,$55K-$112K (Glassdoor est.),About Us\n\nTachyon Technologies is a Digital ...,Tachyon Technologies\n4.4,"Dublin, OH","Irving, TX",201 to 500 employees,2011,IT Services,Information Technology
3905,Data Analyst â Junior,$55K-$112K (Glassdoor est.),"Job description\nInterpret data, analyze resul...","Staffigo Technical Services, LLC\n5.0","Columbus, OH","Woodridge, IL",51 to 200 employees,2008,IT Services,Information Technology
3906,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),Job DescriptionThe Security Analytics Data Eng...,"PDS Tech, Inc.\n3.8","Dublin, OH","Irving, TX",5001 to 10000 employees,1977,Staffing & Outsourcing,Business Services
3907,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),The Security Analytics Data Engineer will inte...,Data Resource Technologies\n4.0,"Dublin, OH","Omaha, NE",1 to 50 employees,-1,Accounting,Accounting & Legal


When exploring the dataset I saw that there is too much long descriptions and for computation reasons I want to keep only short descriptions 

In [3]:
# Keeping only descriptions with <=500 words
indexes = []
for index in data.index:
    if len(data.loc[index, 'Job Description'].split())<=350:
        indexes.append(index)

data = data.loc[indexes]
data

Unnamed: 0,Job Title,Salary Estimate,Job Description,Company Name,Location,Headquarters,Size,Founded,Industry,Sector
7,Quantitative Research Associate,$111K-$181K (Glassdoor est.),Seeking a quant to work with senior researcher...,Enlightenment Research,"New York, NY","New York, NY",1 to 50 employees,-1,-1,-1
8,AI Scientist,$111K-$181K (Glassdoor est.),Paige is a software company helping pathologis...,Paige\n5.0,"New York, NY","New York, NY",1 to 50 employees,2018,Enterprise Software & Network Solutions,Information Technology
9,Quantitative Researcher,$111K-$181K (Glassdoor est.),"About the Position\n\n\nAt Jane Street, we con...",Jane Street\n4.8,"New York, NY","New York, NY",501 to 1000 employees,2000,Investment Banking & Asset Management,Finance
19,"VP, Data Science",$111K-$181K (Glassdoor est.),"We are looking for a VP, Data Science to lead ...",7Park Data\n3.9,"New York, NY","New York, NY",51 to 200 employees,2012,Research & Development,Business Services
22,Quantitative Researcher – Intern (US),$111K-$181K (Glassdoor est.),Job Description\n\nQuantitative Researchers at...,Citadel Securities\n4.1,"New York, NY","Chicago, IL",201 to 500 employees,2002,Brokerage Services,Finance
...,...,...,...,...,...,...,...,...,...,...
3897,Senior Data Engineer,$55K-$112K (Glassdoor est.),Job Responsibility:\nBased on business strateg...,Kognetics\n3.6,"Gahanna, OH","New York, NY",1 to 50 employees,-1,-1,-1
3901,Principal Data Engineer,$55K-$112K (Glassdoor est.),ComResource is looking for a Principal Data En...,ComResource\n4.6,"Columbus, OH","Columbus, OH",201 to 500 employees,1991,IT Services,Information Technology
3902,Columbus Data Science Tutor Jobs,$55K-$112K (Glassdoor est.),#Columbus Data Science Tutor Jobs\n\nVarsity T...,Varsity Tutors\n4.2,"Columbus, OH","Saint Louis, MO",201 to 500 employees,2007,Internet,Information Technology
3904,AWS Data Engineer,$55K-$112K (Glassdoor est.),About Us\n\nTachyon Technologies is a Digital ...,Tachyon Technologies\n4.4,"Dublin, OH","Irving, TX",201 to 500 employees,2011,IT Services,Information Technology


I can see that the `-1` value represent the null value

In [4]:
data.replace('-1', np.nan, inplace=True)
data.replace(-1, np.nan, inplace=True)

In [5]:
data.isna().sum()

Job Title            0
Salary Estimate      0
Job Description      0
Company Name         0
Location             0
Headquarters       122
Size               117
Founded            482
Industry           268
Sector             268
dtype: int64

In [6]:
data['Founded'] = data['Founded'].fillna(data['Founded'].max())
data['Headquarters'] = data['Headquarters'].fillna(data['Location'])

In [7]:
# Check how much rows has at least one null value
len(data) - len(data.dropna())

268

In [8]:
# Keeping only samples with no null values to create the best prompts
data.dropna(inplace=True)
data.shape

(1028, 10)

I noticed that there is some anomalies in the `Company Name` and `Salary Estimate` columns

In [9]:
data['Company Name'] = data['Company Name'].str.replace('\n', '').str.translate(str.maketrans('', '', string.digits))
data['Salary Estimate'] = data['Salary Estimate'].str.split('(').str.get(0)
data

Unnamed: 0,Job Title,Salary Estimate,Job Description,Company Name,Location,Headquarters,Size,Founded,Industry,Sector
8,AI Scientist,$111K-$181K,Paige is a software company helping pathologis...,Paige.,"New York, NY","New York, NY",1 to 50 employees,2018.0,Enterprise Software & Network Solutions,Information Technology
9,Quantitative Researcher,$111K-$181K,"About the Position\n\n\nAt Jane Street, we con...",Jane Street.,"New York, NY","New York, NY",501 to 1000 employees,2000.0,Investment Banking & Asset Management,Finance
19,"VP, Data Science",$111K-$181K,"We are looking for a VP, Data Science to lead ...",Park Data.,"New York, NY","New York, NY",51 to 200 employees,2012.0,Research & Development,Business Services
22,Quantitative Researcher – Intern (US),$111K-$181K,Job Description\n\nQuantitative Researchers at...,Citadel Securities.,"New York, NY","Chicago, IL",201 to 500 employees,2002.0,Brokerage Services,Finance
23,Senior Data Engineer (Healthcare Domain experi...,$111K-$181K,"Key Responsibilities\n\n- Architect, build, an...",Enterprise Integration.,"New York, NY","Jacksonville, FL",51 to 200 employees,1998.0,IT Services,Information Technology
...,...,...,...,...,...,...,...,...,...,...
3894,"JPSC-7975 - Data Analyst Lead- Columbus, OH (L...",$55K-$112K,Overview\n\nRole: Data Analyst Lead – Informat...,Avani Technology Solutions.,"Columbus, OH","Rochester, NY",501 to 1000 employees,2008.0,IT Services,Information Technology
3901,Principal Data Engineer,$55K-$112K,ComResource is looking for a Principal Data En...,ComResource.,"Columbus, OH","Columbus, OH",201 to 500 employees,1991.0,IT Services,Information Technology
3902,Columbus Data Science Tutor Jobs,$55K-$112K,#Columbus Data Science Tutor Jobs\n\nVarsity T...,Varsity Tutors.,"Columbus, OH","Saint Louis, MO",201 to 500 employees,2007.0,Internet,Information Technology
3904,AWS Data Engineer,$55K-$112K,About Us\n\nTachyon Technologies is a Digital ...,Tachyon Technologies.,"Dublin, OH","Irving, TX",201 to 500 employees,2011.0,IT Services,Information Technology


In [10]:
data.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Company Name',
       'Location', 'Headquarters', 'Size', 'Founded', 'Industry', 'Sector'],
      dtype='object')

# Prompt Engineering
For modeling reasons I opted for engineering prompts which the answers are the descriptions

In [11]:
# Example of feature engineering
job_title = data.iloc[1]['Job Title']
company_name = data.iloc[1]['Company Name']
year = data.iloc[1]['Founded']
size = data.iloc[1]['Size']
industry = data.iloc[1]['Industry']
sector = data.iloc[1]['Sector']
location = data.iloc[1]['Location']
headquarter = data.iloc[1]['Headquarters']
salary_estimate = data.iloc[1]['Salary Estimate']
print('The prompt:')
print(f'Write a job description for {job_title} role proposed by "{company_name}", a company founded on {year} that has now a size of {size} and works on {industry} industry in the {sector} sector. The job proposed is in {location} meanwhile the headquarter of the company is in {headquarter}. The salary estimate is {salary_estimate}')
print('-'*127)
print('')
print('The job description:')
print(data.iloc[1]['Job Description'])

The prompt:
Write a job description for Quantitative Researcher role proposed by "Jane Street.", a company founded on 2000.0 that has now a size of 501 to 1000 employees and works on Investment Banking & Asset Management industry in the Finance sector. The job proposed is in New York, NY meanwhile the headquarter of the company is in New York, NY. The salary estimate is $111K-$181K 
-------------------------------------------------------------------------------------------------------------------------------

The job description:
About the Position


At Jane Street, we consider trading and programming to be two ends of a continuum. As both a trading firm and a tech firm, we have room for people who love to trade, people who love to program, and people everywhere in between. Nearly all of our traders write code, and many of our developers trade. The role you carve out for yourself will be largely dependent on your strengths and the types of problems you enjoy thinking about.

Researcher

In [12]:
# Creating prompt for every sample
data['Prompt'] = [' ' for i in range(len(data))]
for index in data.index:
    job_title = data.loc[index,'Job Title']
    company_name = data.loc[index,'Company Name']
    year = data.loc[index,'Founded']
    size = data.loc[index,'Size']
    industry = data.loc[index,'Industry']
    sector = data.loc[index,'Sector']
    location = data.loc[index,'Location']
    headquarter = data.loc[index,'Headquarters']
    salary_estimate = data.loc[index,'Salary Estimate']
    data.loc[index, 'Prompt'] = f'Write a job description for {job_title} role proposed by "{company_name}", a company founded on {year} that has now a size of {size} and works on {industry} industry in the {sector} sector. The job proposed is in {location} meanwhile the headquarter of the company is in {headquarter}. The salary estimate is {salary_estimate}'
    
data['Prompt']

8       Write a job description for AI Scientist role ...
9       Write a job description for Quantitative Resea...
19      Write a job description for VP, Data Science r...
22      Write a job description for Quantitative Resea...
23      Write a job description for Senior Data Engine...
                              ...                        
3894    Write a job description for JPSC-7975 - Data A...
3901    Write a job description for Principal Data Eng...
3902    Write a job description for Columbus Data Scie...
3904    Write a job description for AWS Data Engineer ...
3905    Write a job description for Data Analyst â Jun...
Name: Prompt, Length: 1028, dtype: object

In [13]:
# Modeling my dataframe into prompts and targets
data['Target'] = data['Job Description']
data = data[['Prompt', 'Target']]
data

Unnamed: 0,Prompt,Target
8,Write a job description for AI Scientist role ...,Paige is a software company helping pathologis...
9,Write a job description for Quantitative Resea...,"About the Position\n\n\nAt Jane Street, we con..."
19,"Write a job description for VP, Data Science r...","We are looking for a VP, Data Science to lead ..."
22,Write a job description for Quantitative Resea...,Job Description\n\nQuantitative Researchers at...
23,Write a job description for Senior Data Engine...,"Key Responsibilities\n\n- Architect, build, an..."
...,...,...
3894,Write a job description for JPSC-7975 - Data A...,Overview\n\nRole: Data Analyst Lead – Informat...
3901,Write a job description for Principal Data Eng...,ComResource is looking for a Principal Data En...
3902,Write a job description for Columbus Data Scie...,#Columbus Data Science Tutor Jobs\n\nVarsity T...
3904,Write a job description for AWS Data Engineer ...,About Us\n\nTachyon Technologies is a Digital ...


In [14]:
data.to_csv('Data Scientist Jobs/Processed_data.csv', index=False)