-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_job_title.py
88 lines (60 loc) · 2.65 KB
/
clean_job_title.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import enchant
import nltk
from nltk.corpus import words
import re
import html
import re
# Module-level pyenchant dictionary for the "en_US" locale, created once at
# import time. NOTE(review): none of the functions visible in this section
# reference it — confirm whether another part of the project still uses it.
dict_en = enchant.Dict("en_US")
def remove_special_character(word):
    """Return ``word`` stripped of everything except alphanumerics and spaces.

    Characters are kept when ``str.isalnum()`` is true for them (this includes
    non-ASCII letters and digits) or when they are a plain space ``' '``.
    Every other character, including punctuation, tabs and newlines, is
    dropped.

    Args:
        word: The string to clean.

    Returns:
        The cleaned string; an empty input yields an empty string.
    """
    # A single filtering pass is enough: punctuation such as "!@#$%^&*()" is
    # never alphanumeric, so no additional regex clean-up is required.
    return ''.join(ch for ch in word if ch.isalnum() or ch == ' ')
def remove_after_character(word, special_character):
    """Return the part of ``word`` that precedes ``special_character``.

    Only the first occurrence of ``special_character`` is considered; it and
    everything after it are discarded.

    Args:
        word: The string to truncate.
        special_character: The separator to cut at.

    Returns:
        The text before the first occurrence of ``special_character``, or
        ``word`` unchanged when the separator does not occur.
    """
    cut_at = word.find(special_character)
    if cut_at == -1:
        return word
    # Slice the argument itself; the function must not depend on any variable
    # of its caller.
    return word[:cut_at]
def add_suggest_word_to_dict(original_word, suggest_word):
    """Record ``suggest_word`` as the replacement for ``original_word``.

    Reads ``words_not_in_en.csv`` from the current working directory, drops
    its ``Unnamed: 0`` column and looks up the lower-cased ``original_word``
    in the ``word`` column. For the first matching row it

    * increments ``frequency`` by one,
    * sets ``score`` to 200, and
    * sets ``suggest_word`` to the given replacement.

    A word that is not already listed is left alone; no new row is appended.
    The resulting table is written to ``words_not_in_en_2.csv``; the input
    file itself is not modified.

    Args:
        original_word: The word to be replaced, e.g. ``"Dev."`` or ``"Sr."``.
        suggest_word: The replacement to store, e.g. ``"Developer"`` or
            ``"Senior"``.
    """
    table = pd.read_csv("words_not_in_en.csv")
    # "Unnamed: 0" is the column name pandas gives to an unnamed leading
    # column when reading a CSV (such as an index written by ``to_csv``).
    table = table.drop(columns='Unnamed: 0')

    lookup = original_word.lower()
    matches = table[table.word == lookup]
    if not matches.empty:
        row_label = matches.index[0]
        # ``.at`` is the supported scalar accessor; ``get_value`` and
        # ``set_value`` no longer exist in current pandas releases.
        table.at[row_label, 'frequency'] = table.at[row_label, 'frequency'] + 1
        table.at[row_label, 'score'] = 200
        table.at[row_label, 'suggest_word'] = suggest_word

    # NOTE(review): the table is written out even when no row matched —
    # confirm that an unchanged copy is the intended result in that case.
    table.to_csv("words_not_in_en_2.csv")
def standard_job_title(original_job_title, suggest_words_dict):
    """Build a cleaned-up suggestion for a raw job title and score it.

    The title is lower-cased and HTML entities are unescaped. From that text
    two strings are derived:

    * the *reference* title: the text with everything but alphanumerics and
      spaces removed (via ``remove_special_character``);
    * the *suggested* title: the text with the first parenthesised group
      removed, then truncated at the first "/" and at the first "-", and
      finally with known abbreviations expanded.

    An abbreviation is expanded when a word of the reference title equals an
    entry in the ``word`` column of ``suggest_words_dict`` and that entry's
    ``score`` is at least 95; the entry's ``suggest_word`` is substituted.

    Args:
        original_job_title: The raw job title string.
        suggest_words_dict: A pandas DataFrame with ``word``,
            ``suggest_word`` and ``score`` columns.

    Returns:
        A ``(suggest_title, score)`` tuple where ``score`` is
        ``fuzz.ratio`` between the reference title and the suggested title.
    """
    unescaped_title = html.unescape(original_job_title.lower())
    suggest_title = unescaped_title
    original_title = remove_special_character(unescaped_title)

    # Remove the first "(...)" group. The closing bracket is searched from
    # the opening one onward so that a stray ")" earlier in the title cannot
    # produce an overlapping, garbled slice.
    open_bracket = suggest_title.find("(")
    if open_bracket != -1:
        close_bracket = suggest_title.find(")", open_bracket)
        if close_bracket != -1:
            suggest_title = (
                suggest_title[:open_bracket] + suggest_title[close_bracket + 1:]
            )

    # Keep only what comes before the first "/" and the first "-".
    suggest_title = remove_after_character(suggest_title, "/")
    suggest_title = remove_after_character(suggest_title, "-")

    for word in original_title.split():
        matches = suggest_words_dict[suggest_words_dict.word == word]
        if matches.empty:
            continue
        entry = matches.iloc[0]
        if entry['score'] >= 95:
            replacement = entry['suggest_word']
            # Replace whole words only, so that expanding e.g. "it" does not
            # also rewrite the "it" inside "security". A callable replacement
            # keeps any backslashes in the suggestion literal.
            pattern = r'\b' + re.escape(word) + r'\b'
            suggest_title = re.sub(
                pattern, lambda _match: replacement, suggest_title
            )

    return (suggest_title, fuzz.ratio(original_title, suggest_title))