## Import Libraries and Dataset

In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
df = pd.read_csv('Hotel_Job_Requirements.csv')

## Shape of Data 

In [3]:
df.shape

(20, 8)

## First five rows of the data 

In [4]:
df.head()

Unnamed: 0,area,job_title,company_name,experience,salary,location_list,desc,key_skills
0,Hotel Jobs,Operations Manager @ Hotel,ABC Groups,3 - 6 years,"? 32,50,000 - 45,00,000 P.A.","Pune, Bengaluru",Job descriptionoversee the entire operations o...,"['back office management', 'Operations Managem..."
1,Hotel Jobs,Operation Manager (Resorts/Hotel),The Lake Resort,5 - ?6 years,"? 25,00,000 - 40,00,000 P.A.",Gurgaon,Job Description: \r\r\n Maintaining positive p...,"['Operations management', 'Hospitality', 'Publ..."
2,Hotel Jobs,Hotel Manager Required in Malta (europe),Horizon services,5 - 7 years,Not Disclosed,United Kingdom (U.K),"Job descriptionRecruiting, training and superv...","['hotel manager', 'hospitality manager', 'hote..."
3,Hotel Jobs,Hotel Manager (germany) - No IELTS Required,Dreamland Pvt Ltd,0 - 4 yr,"? 25,00,000 - 40,00,000 P.A.",Germany,Job descriptionHotel managers are responsible ...,"['assistant restaurant manager', 'hotel genera..."
4,Hotel Jobs,Front Office Manager,Hotel Riptide,0 - 2 years!,Not Disclosed,Pune,Job descriptionDeliver strong functional exper...,"['Leadership', 'Kitchen Manager', 'Hotel Manag..."


## Basic summary of the data 

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
area             20 non-null object
job_title        20 non-null object
company_name     20 non-null object
experience       20 non-null object
salary           20 non-null object
location_list    20 non-null object
desc             20 non-null object
key_skills       20 non-null object
dtypes: object(8)
memory usage: 1.3+ KB


## Number of unique values in each column 

In [6]:
for i in df.columns:
    print(i, " : ", df[i].nunique())

area  :  1
job_title  :  15
company_name  :  14
experience  :  16
salary  :  5
location_list  :  20
desc  :  20
key_skills  :  19


## Lowercase
Converting characters to lowercase

In [7]:
df = df.apply(lambda x: x.astype(str).str.lower())
df.columns = df.columns.str.lower()

In [8]:
df.head()

Unnamed: 0,area,job_title,company_name,experience,salary,location_list,desc,key_skills
0,hotel jobs,operations manager @ hotel,abc groups,3 - 6 years,"? 32,50,000 - 45,00,000 p.a.","pune, bengaluru",job descriptionoversee the entire operations o...,"['back office management', 'operations managem..."
1,hotel jobs,operation manager (resorts/hotel),the lake resort,5 - ?6 years,"? 25,00,000 - 40,00,000 p.a.",gurgaon,job description: \r\r\n maintaining positive p...,"['operations management', 'hospitality', 'publ..."
2,hotel jobs,hotel manager required in malta (europe),horizon services,5 - 7 years,not disclosed,united kingdom (u.k),"job descriptionrecruiting, training and superv...","['hotel manager', 'hospitality manager', 'hote..."
3,hotel jobs,hotel manager (germany) - no ielts required,dreamland pvt ltd,0 - 4 yr,"? 25,00,000 - 40,00,000 p.a.",germany,job descriptionhotel managers are responsible ...,"['assistant restaurant manager', 'hotel genera..."
4,hotel jobs,front office manager,hotel riptide,0 - 2 years!,not disclosed,pune,job descriptiondeliver strong functional exper...,"['leadership', 'kitchen manager', 'hotel manag..."


In [9]:
for i in df.columns:
    print(i, " : ", df[i].nunique())

area  :  1
job_title  :  15
company_name  :  14
experience  :  16
salary  :  5
location_list  :  20
desc  :  19
key_skills  :  19


## Data Cleaning 

### Column job_title 

In [10]:
df[['job_title']]

Unnamed: 0,job_title
0,operations manager @ hotel
1,operation manager (resorts/hotel)
2,hotel manager required in malta (europe)
3,hotel manager (germany) - no ielts required
4,front office manager
5,hotel manager
6,front office duty manager
7,hotel manager
8,hotel manager
9,hotel manager


In [11]:
df['job_title_clean'] = df['job_title'].str.replace(r"[^a-zA-Z\d\_\&]+", " ")

df[['job_title_clean']]

Unnamed: 0,job_title_clean
0,operations manager hotel
1,operation manager resorts hotel
2,hotel manager required in malta europe
3,hotel manager germany no ielts required
4,front office manager
5,hotel manager
6,front office duty manager
7,hotel manager
8,hotel manager
9,hotel manager


In [12]:
stopwords = ['the', 'and', 'or', 'in', 'for', 'of', 'from', 'apply', 'pr', 'to', 'pvt', 'ltd', 'under', 'llp']

In [13]:
for i in stopwords :
    df['job_title_clean'] = df['job_title_clean'].replace(to_replace=r'\b%s\b'%i, value=" ",regex=True)

In [14]:
df[['job_title_clean']]

Unnamed: 0,job_title_clean
0,operations manager hotel
1,operation manager resorts hotel
2,hotel manager required malta europe
3,hotel manager germany no ielts required
4,front office manager
5,hotel manager
6,front office duty manager
7,hotel manager
8,hotel manager
9,hotel manager


### Column company_name 

In [15]:
df[['company_name']]

Unnamed: 0,company_name
0,abc groups
1,the lake resort
2,horizon services
3,dreamland pvt ltd
4,hotel riptide
5,xyz hotel
6,paradise hotel
7,random solutions pvt ltd
8,yes solutions
9,oakbranch hotel chain


In [16]:
df['company_name_clean'] = df['company_name'].str.replace(r"[^a-zA-Z\d\_\&]+", " ")

df[['company_name_clean']]

Unnamed: 0,company_name_clean
0,abc groups
1,the lake resort
2,horizon services
3,dreamland pvt ltd
4,hotel riptide
5,xyz hotel
6,paradise hotel
7,random solutions pvt ltd
8,yes solutions
9,oakbranch hotel chain


In [17]:
for i in stopwords :
    df['company_name_clean'] = df['company_name_clean'].replace(to_replace=r'\b%s\b'%i, value=" ",regex=True)

In [18]:
df[['company_name_clean']]

Unnamed: 0,company_name_clean
0,abc groups
1,lake resort
2,horizon services
3,dreamland
4,hotel riptide
5,xyz hotel
6,paradise hotel
7,random solutions
8,yes solutions
9,oakbranch hotel chain


### column experience 

In [19]:
df[['experience']]

Unnamed: 0,experience
0,3 - 6 years
1,5 - ?6 years
2,5 - 7 years
3,0 - 4 yr
4,0 - 2 years!
5,- 1 years
6,2 - 4 years
7,3 - 5 years
8,2 - 5 years
9,5 - 8 years


In [20]:
df['experience_clean'] = df['experience'].str.replace(r"[^\d\ \-]+", " ")

df[['experience_clean']]

Unnamed: 0,experience_clean
0,3 - 6
1,5 - 6
2,5 - 7
3,0 - 4
4,0 - 2
5,- 1
6,2 - 4
7,3 - 5
8,2 - 5
9,5 - 8


### column salary 

In [21]:
df[['salary']]

Unnamed: 0,salary
0,"? 32,50,000 - 45,00,000 p.a."
1,"? 25,00,000 - 40,00,000 p.a."
2,not disclosed
3,"? 25,00,000 - 40,00,000 p.a."
4,not disclosed
5,not disclosed
6,"? 2,25,000 - 3,00,000 p.a."
7,not disclosed
8,not disclosed
9,"$ 55,000 - 75,000 p.a."


In [22]:
df['salary_clean'] = df['salary'].str.replace(r"[^\d\ \-]+", "")

df[['salary_clean']]

Unnamed: 0,salary_clean
0,3250000 - 4500000
1,2500000 - 4000000
2,
3,2500000 - 4000000
4,
5,
6,225000 - 300000
7,
8,
9,55000 - 75000


### Column location 

In [23]:
df[['location_list']]

Unnamed: 0,location_list
0,"pune, bengaluru"
1,gurgaon
2,united kingdom (u.k)
3,germany
4,pune
5,kanyakumari
6,mumbai
7,"thane, kalyan-dombivali"
8,guwahati
9,pan india


In [24]:
df['location_list_clean'] = df['location_list'].str.replace(r"[^a-zA-Z\_\,]+", " ")

df[['location_list_clean']]

Unnamed: 0,location_list_clean
0,"pune, bengaluru"
1,gurgaon
2,united kingdom u k
3,germany
4,pune
5,kanyakumari
6,mumbai
7,"thane, kalyan dombivali"
8,guwahati
9,pan india


### Column desc 

In [25]:
df['desc'][1]

'job description: \r\r\n maintaining positive public relations and pleasant attitude with all guests and associates \r\r\n manage the administrative staff and train new hires on office policies and procedures.\r\n coordinate and manage communication between guest and staff and follow up to ensure complete service recovery. \r\r\ngood analytical ability and communication skills,  proficient in ms office.'

In [26]:
df['desc_clean'] = df['desc'].str.replace(r"[^a-zA-Z\.\,]+", " ")
df['desc_clean'] = df['desc_clean'].str.replace("job description", "")

df[['desc_clean']]

Unnamed: 0,desc_clean
0,oversee the entire operations of a lodging est...
1,maintaining positive public relations and ple...
2,"recruiting, training and supervising staff.man..."
3,hotel managers are responsible for managing em...
4,"deliver strong functional expertise,creativity..."
5,hiring hotel manager for deluxe hotel in kany...
6,. greets the vip guests of the hotel. as dire...
7,we required urgently hotel or order manager fo...
8,job profile restaurant manger
9,in charge of the hotel operations of the unit...


In [27]:
df['desc_clean'][1]

' maintaining positive public relations and pleasant attitude with all guests and associates manage the administrative staff and train new hires on office policies and procedures. coordinate and manage communication between guest and staff and follow up to ensure complete service recovery. good analytical ability and communication skills, proficient in ms office.'

### column key_skills 

In [28]:
df[['key_skills']]

Unnamed: 0,key_skills
0,"['back office management', 'operations managem..."
1,"['operations management', 'hospitality', 'publ..."
2,"['hotel manager', 'hospitality manager', 'hote..."
3,"['assistant restaurant manager', 'hotel genera..."
4,"['leadership', 'kitchen manager', 'hotel manag..."
5,['interpersonal skills']
6,"['front office staff', 'public relations', 'fr..."
7,"['hotel manager', 'restaurant manager', 'order..."
8,['hotel manager restaurant manager company pro...
9,"['incharge', 'sales', 'performance management'..."


In [29]:
df['key_skills_clean'] = df['key_skills'].str.replace(r"[^a-zA-Z\ \,]+", " ")
df[['key_skills_clean']]

Unnamed: 0,key_skills_clean
0,"back office management , operations management"
1,"operations management , hospitality , publi..."
2,"hotel manager , hospitality manager , hotel..."
3,"assistant restaurant manager , hotel general..."
4,"leadership , kitchen manager , hotel manage..."
5,interpersonal skills
6,"front office staff , public relations , fro..."
7,"hotel manager , restaurant manager , order ..."
8,hotel manager restaurant manager company prof...
9,"incharge , sales , performance management ,..."


## Creation of New Columns 

### Number of Years of Experience 

In [30]:
df[['minimum_year_exp', 'maximum_year_exp']] = df['experience_clean'].str.split('-', expand=True)
df['minimum_year_exp'] = df['minimum_year_exp'].str.strip().replace('', 0).astype('int64')
df['maximum_year_exp'] = df['maximum_year_exp'].str.strip().replace('', 0).astype('int64')

In [31]:
df['number_of_years_exp'] = df['maximum_year_exp'] - df['minimum_year_exp']

df[['minimum_year_exp', 'maximum_year_exp', 'number_of_years_exp']]

Unnamed: 0,minimum_year_exp,maximum_year_exp,number_of_years_exp
0,3,6,3
1,5,6,1
2,5,7,2
3,0,4,4
4,0,2,2
5,0,1,1
6,2,4,2
7,3,5,2
8,2,5,3
9,5,8,3


### Average Salary 

In [32]:
df[['minimum_salary', 'maximum_salary']] = df['salary_clean'].str.split('-', expand=True)
df['minimum_salary'] = df['minimum_salary'].str.strip().replace({'': 0}).astype('float64')
df['maximum_salary'] = df['maximum_salary'].str.strip().replace({'': 0}).astype('float64')

df['average_salary'] = (df['maximum_salary'] - df['minimum_salary'])/2

df[['maximum_salary', 'maximum_salary', 'average_salary']]

Unnamed: 0,maximum_salary,maximum_salary.1,average_salary
0,4500000.0,4500000.0,625000.0
1,4000000.0,4000000.0,750000.0
2,,,
3,4000000.0,4000000.0,750000.0
4,,,
5,,,
6,300000.0,300000.0,37500.0
7,,,
8,,,
9,75000.0,75000.0,10000.0


### Number of Key Skills 

In [33]:
df['number_of_key_skills'] = df['key_skills_clean'].str.split(',').str.len()

df[['key_skills_clean', 'number_of_key_skills']]

Unnamed: 0,key_skills_clean,number_of_key_skills
0,"back office management , operations management",2
1,"operations management , hospitality , publi...",10
2,"hotel manager , hospitality manager , hotel...",7
3,"assistant restaurant manager , hotel general...",11
4,"leadership , kitchen manager , hotel manage...",5
5,interpersonal skills,1
6,"front office staff , public relations , fro...",8
7,"hotel manager , restaurant manager , order ...",5
8,hotel manager restaurant manager company prof...,1
9,"incharge , sales , performance management ,...",9
