# Keyword extraction from text

- Phai Phongthiengtham

In [1]:
import os
import re
import json
import pandas as pd

import nltk # requires pip install
from nltk import word_tokenize

In [2]:
# read in dataset
ad_data = pd.read_csv('ad_sample.csv')
ad_data

Unnamed: 0,id,text
0,1,*Job Summary: Reporting to the General Manager...
1,2,*Job Summary:Reporting to the designated super...
2,3,*Job Summary: Reporting to the facility design...


In [3]:
# define lists of keywords
list_keywords = dict()
list_keywords['t_problem_solving'] = ['cognitive', 'research', 'analytical', 'math', 'statistics','problem solving', 'critical thinking']
list_keywords['t_social'] = ['social', 'communication', 'teamwork', 'collaboration', 'negotiation', 'presentation']
list_keywords['t_character'] = ['character', 'energetic', 'multi-tasking','detail oriented', 'multi tasking', 'time management', 'meeting deadlines']
list_keywords['t_writing'] = ['writing']
list_keywords['t_customer_service'] = ['customer', 'sales', 'client', 'patient','customer service']
list_keywords['t_project_management'] = ['project management']
list_keywords['t_people_management'] = ['supervisory', 'leadership', 'mentoring', 'staff','people management']
list_keywords['t_financial'] = ['financial', 'budgeting', 'accounting', 'finance', 'cost']
list_keywords['t_computer'] = ['computer', 'spreadsheets', 'software']

- see https://occupationdata.github.io/
- the list is downloaded from http://occupationdata.github.io/apst_mapping.xlsx

In [4]:
list(list_keywords.keys())

['t_problem_solving',
 't_social',
 't_character',
 't_writing',
 't_customer_service',
 't_project_management',
 't_people_management',
 't_financial',
 't_computer']

In [5]:
def clean_raw_text(text):
    '''
    function to clean raw text and compute length (by number of words).
    '''
    
    words = [w.lower() for w in word_tokenize(str(text)) if len(w)>1] #ignore words with length of 1 or 0
    length_ad = len(words)
    
    clean_text = ' '.join(words)
    
    # replace the following characters with white spaces 
    replace_character = ['(',')','[',']','.',',',':','*']
    
    for character in replace_character:
        clean_text = clean_text.replace(character,' ')
        
    # remove extra white spaces
    clean_text = ' '.join([w for w in re.split(' ', clean_text) if not w ==''])
    
    return clean_text, length_ad

def extract_keyword(clean_text, keywords):
    '''
    function to extract keywords and count number of keywords found.
    '''
    
    regex = re.compile('|'.join(['\\b' + w + '\\b' for w in keywords]))
    
    if re.findall(regex, clean_text): # if found
        keyword_found = re.findall(regex, clean_text)
        n_keyword = len(keyword_found)
    else: # else, return 0
        keyword_found = ['']
        n_keyword = 0
    
    return ','.join(keyword_found), n_keyword

In [6]:
# apply functions to the dataset
for row_index, row in ad_data.iterrows():
    text = row['text']
    clean_text, length_ad = clean_raw_text(text)
    
    ad_data.loc[row_index, 'length_ad'] = int(length_ad)
    ad_data.loc[row_index, 'clean_text'] = clean_text
    
    for t in list(list_keywords.keys()):
        keyword_found, n_keyword = extract_keyword(clean_text, list_keywords[t])
        
        ad_data.loc[row_index, t + '_found'] = keyword_found
        ad_data.loc[row_index, t + '_count'] = int(n_keyword)
        
ad_data

Unnamed: 0,id,text,length_ad,clean_text,t_problem_solving_found,t_problem_solving_count,t_social_found,t_social_count,t_character_found,t_character_count,...,t_customer_service_found,t_customer_service_count,t_project_management_found,t_project_management_count,t_people_management_found,t_people_management_count,t_financial_found,t_financial_count,t_computer_found,t_computer_count
0,1,*Job Summary: Reporting to the General Manager...,353.0,job summary reporting to the general manager o...,,0.0,teamwork,1.0,,0.0,...,"sales,customer,customer,customer,customer,cust...",7.0,,0.0,,0.0,,0.0,,0.0
1,2,*Job Summary:Reporting to the designated super...,569.0,job summary reporting to the designated superv...,,0.0,teamwork,1.0,,0.0,...,"customer,customer,customer,customer",4.0,,0.0,,0.0,,0.0,,0.0
2,3,*Job Summary: Reporting to the facility design...,396.0,job summary reporting to the facility designat...,,0.0,teamwork,1.0,,0.0,...,"customer,customer",2.0,,0.0,,0.0,,0.0,,0.0


In [7]:
# print first ad raw text
ad_data.iloc[0]['text']

'*Job Summary: Reporting to the General Manager or designated manager, plan, implement and coordinate the outside operations and functions of one or more of the following departments: Consignment, Factory, Fleet/Lease, Commercial Account or E-Commerce in accordance with corporate guidelines to ensure maximum dollar sales volume in the most profitable way possible. Perform all duties assigned by the General Manager or designated manager. Must know, practice and ensure that company policies and procedures and state or federal laws are followed at all times. Job Responsibilities and Duties:  1.       Perform duties in compliance with all contractual customer, contractor and supplier agreements. 2.       Assist in assuring the inventory is in the proper designated areas of the lot at all times. Track missing units and help with a physical inventory on a periodic basis.    3.       Assist the Operations Manager. 4.       Operate wrecker as needed. 5.       Basic mechanical duties. 6.   Any 

In [8]:
# print first ad clean text
ad_data.iloc[0]['clean_text']

'job summary reporting to the general manager or designated manager plan implement and coordinate the outside operations and functions of one or more of the following departments consignment factory fleet/lease commercial account or e-commerce in accordance with corporate guidelines to ensure maximum dollar sales volume in the most profitable way possible perform all duties assigned by the general manager or designated manager must know practice and ensure that company policies and procedures and state or federal laws are followed at all times job responsibilities and duties perform duties in compliance with all contractual customer contractor and supplier agreements assist in assuring the inventory is in the proper designated areas of the lot at all times track missing units and help with physical inventory on periodic basis assist the operations manager operate wrecker as needed basic mechanical duties any other duties assigned by the general manager or designated manager general emp