In [1]:
import pandas as pd
import numpy as np
import random
import scipy.sparse
from clean import clean_txt
from sklearn.feature_extraction.text import TfidfVectorizer

## Đọc dữ liệu

In [2]:
# Load the raw job postings from CSV and preview the first two rows.
raw_jobs_csv = './data/Combined_Jobs_Final.csv'
combined_job_df = pd.read_csv(raw_jobs_csv)
combined_job_df.head(n=2)

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,...,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,...,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,...,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC


In [3]:
# Summarize the raw frame: column names, non-null counts and dtypes.
combined_job_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84090 entries, 0 to 84089
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Job.ID              84090 non-null  int64  
 1   Provider            84090 non-null  int64  
 2   Status              84090 non-null  object 
 3   Slug                84090 non-null  object 
 4   Title               84090 non-null  object 
 5   Position            84090 non-null  object 
 6   Company             81819 non-null  object 
 7   City                83955 non-null  object 
 8   State.Name          83919 non-null  object 
 9   State.Code          83919 non-null  object 
 10  Address             36 non-null     object 
 11  Latitude            84090 non-null  float64
 12  Longitude           84090 non-null  float64
 13  Industry            267 non-null    object 
 14  Job.Description     84034 non-null  object 
 15  Requirements        0 non-null      float64
 16  Sala

In [4]:
# Count the missing values in each column of the raw frame.
combined_job_df.isna().sum()

Job.ID                    0
Provider                  0
Status                    0
Slug                      0
Title                     0
Position                  0
Company                2271
City                    135
State.Name              171
State.Code              171
Address               84054
Latitude                  0
Longitude                 0
Industry              83823
Job.Description          56
Requirements          84090
Salary                83861
Listing.Start           683
Listing.End             167
Employment.Type          10
Education.Required      267
Created.At                0
Updated.At                0
dtype: int64

## Tiền xử lý

In [5]:
# Keep only the columns that the preprocessing cells below work with.
cols = [
    'Job.ID',
    'Title',
    'Position',
    'Company',
    'City',
    'Job.Description',
    'Employment.Type',
]
df_jobs = combined_job_df.loc[:, cols]

In [6]:
# Isolate the postings that have no city recorded and preview the first few.
missing_city_mask = df_jobs['City'].isna()
df_jobs_nan_city = df_jobs.loc[missing_city_mask]
df_jobs_nan_city.head()

Unnamed: 0,Job.ID,Title,Position,Company,City,Job.Description,Employment.Type
3425,142044,Registered Nurse (RN) @ St. Francis Hospital,Registered Nurse (RN),St. Francis Hospital,,Under the direction of the Clinical Coordinato...,Full-Time/Part-Time
3433,142054,Sales Representative - Business Development Op...,Sales Representative - Business Development Op...,CHI Payment Systems,,"If you&rsquo;re energetic, motivated, hardwork...",Full-Time/Part-Time
3434,142055,New Business Executive @ CHI Payment Systems,New Business Executive,CHI Payment Systems,,"If you&rsquo;re energetic, motivated, hardwork...",Full-Time/Part-Time
3435,142056,Outside Sales Representative (Business Develop...,Outside Sales Representative (Business Develop...,CHI Payment Systems,,"If you&rsquo;re energetic, motivated, hardwork...",Full-Time/Part-Time
3436,142057,Outside Sales Representative @ CHI Payment Sys...,Outside Sales Representative,CHI Payment Systems,,"If you&rsquo;re energetic, motivated, hardwork...",Full-Time/Part-Time


In [7]:
# Distinct employers (in order of first appearance) among the postings without a city.
company_of_cityless_jobs = df_jobs_nan_city['Company']
companies_with_nan_city = company_of_cityless_jobs.drop_duplicates().tolist()
companies_with_nan_city

['St. Francis Hospital',
 'CHI Payment Systems',
 'Genesis Health Systems',
 'Driveline Retail',
 'Volvo Group',
 'Home Instead Senior Care',
 'Genesis Health System',
 'Academic Year In America',
 'Educational Testing Services',
 'CBS Healthcare Services and Staffing']

In [8]:
## Fill missing City values by looking up where the corresponding hiring company is located.
# Work on an independent copy so the assignments below never write into a view of combined_job_df.
df_jobs = df_jobs.copy()

# Two spellings of the same employer appear in the data; merge them so both get the same city.
df_jobs['Company'] = df_jobs['Company'].replace(['Genesis Health Systems'], 'Genesis Health System')

# Manually looked-up location for each company that has postings without a city.
# NOTE(review): some values ('Illinois', 'New Jersey', 'Nebraska', 'Washington') look like
# state names rather than cities; they are kept as originally entered - confirm.
city_by_company = {
    'CHI Payment Systems': 'Illinois',
    'Academic Year In America': 'Stamford',
    'CBS Healthcare Services and Staffing ': 'Urbandale',  # trailing-space variant kept from the original cell
    'CBS Healthcare Services and Staffing': 'Urbandale',
    'Driveline Retail': 'Coppell',
    'Educational Testing Services': 'New Jersey',
    'Genesis Health System': 'Davenport',  # was misspelled 'Davennport'
    'Home Instead Senior Care': 'Nebraska',
    'St. Francis Hospital': 'New York',
    'Volvo Group': 'Washington',
}

# Only rows whose City is actually missing are filled. The previous per-company
# `.loc[...] = city` assignments overwrote City on *every* posting of these
# companies, discarding valid city values that were already present.
df_jobs['City'] = df_jobs['City'].fillna(df_jobs['Company'].map(city_by_company))

In [9]:
# Show how many postings fall under each employment type.
df_jobs['Employment.Type'].value_counts()

Part-Time              33228
Seasonal/Temp          27609
Full-Time/Part-Time    17626
Per Diem                4644
Intern                   921
Full-Time                 37
Contract                  14
Temporary/seasonal         1
Name: Employment.Type, dtype: int64

In [10]:
# Replace missing values in every column that feeds the combined text, so the
# string concatenation below can never produce NaN for a posting.
# A single DataFrame.fillna with a per-column mapping is used instead of
# `df_jobs[col].fillna(..., inplace=True)`: the in-place call on a column
# selection is chained assignment, which emits a warning in pandas 2.x and no
# longer updates the parent frame when Copy-on-Write is enabled.
fill_values = {
    'Employment.Type': 'Full-Time/Part-Time',
    'Job.Description': '',
    'Company': '',
    'City': '',  # safeguard: a leftover NaN city would otherwise turn the whole text into NaN
}
df_jobs = df_jobs.fillna(value=fill_values)

# Build one free-text field per posting: position, company, city, employment type, description.
df_jobs['text'] = (
    df_jobs['Position'] + " "
    + df_jobs['Company'] + " "
    + df_jobs['City'] + " "
    + df_jobs['Employment.Type'] + " "
    + df_jobs['Job.Description']
)

# Keep only the identifier, the combined text and the display title.
df_jobs_2 = df_jobs[['Job.ID', 'text', 'Title']].copy()

# Sanity check: every count should be zero.
df_jobs_2.isna().sum()

Job.ID    0
text      0
Title     0
dtype: int64

In [11]:
# Display the reduced frame (Job.ID, text, Title) before cleaning.
df_jobs_2

Unnamed: 0,Job.ID,text,Title
0,111,Server Tacolicious Palo Alto Part-Time Tacolic...,Server @ Tacolicious
1,113,Kitchen Staff/Chef Claude Lane San Francisco P...,Kitchen Staff/Chef @ Claude Lane
2,117,Bartender Machka Restaurants Corp. San Francis...,Bartender @ Machka Restaurants Corp.
3,121,Server Teriyaki House Brisbane Part-Time ● Se...,Server @ Teriyaki House
4,127,Kitchen Staff/Chef Rosa Mexicano - Sunset Los ...,Kitchen Staff/Chef @ Rosa Mexicano - Sunset
...,...,...,...
84085,82,Book Keeper National Japanese American Histori...,Book Keeper @ National Japanese American Histo...
84086,83,Kitchen Staff/Chef Emporio Rulli Larkspur Part...,Kitchen Staff/Chef @ Emporio Rulli
84087,84,Driver Onigilly San Francisco Part-Time ONIGIL...,Driver @ Onigilly
84088,88,Line Cook Machka Restaurants Corp. San Francis...,Line Cook @ Machka Restaurants Corp.


In [12]:
# Build the cleaned frame: same rows and columns as df_jobs_2, with the 'text'
# column passed row by row through clean_txt (imported from the local `clean` module).
# df_jobs_2 itself is left untouched so raw and cleaned text can be compared later.
final_df_jobs = df_jobs_2.assign(text=lambda jobs: jobs['text'].apply(clean_txt))

In [13]:
# Persist the cleaned postings to CSV (the index is not written), then display the frame.
final_df_jobs.to_csv('./clean_data/final_df_jobs.csv', index=False)
final_df_jobs

Unnamed: 0,Job.ID,text,Title
0,111,server tacolicious palo alto part time tacolic...,Server @ Tacolicious
1,113,kitchen staff chef claude lane san francisco p...,Kitchen Staff/Chef @ Claude Lane
2,117,bartender machka restaurants corp san francisc...,Bartender @ Machka Restaurants Corp.
3,121,server teriyaki house brisbane part time serve...,Server @ Teriyaki House
4,127,kitchen staff chef rosa mexicano sunset los an...,Kitchen Staff/Chef @ Rosa Mexicano - Sunset
...,...,...,...
84085,82,book keeper national japanese american histori...,Book Keeper @ National Japanese American Histo...
84086,83,kitchen staff chef emporio rulli larkspur part...,Kitchen Staff/Chef @ Emporio Rulli
84087,84,driver onigilly san francisco part time onigil...,Driver @ Onigilly
84088,88,line cook machka restaurants corp san francisc...,Line Cook @ Machka Restaurants Corp.


In [14]:
# Spot-check the cleaning on one randomly chosen posting.
# random.randint includes BOTH endpoints, so randint(0, len(df)) could return
# len(df), which is one past the last row and raises KeyError on lookup.
# randrange excludes the upper bound.
i = random.randrange(len(final_df_jobs))
# .iloc selects by position, so the lookup stays valid even if the index is not 0..n-1.
print(f'Original text: \n{df_jobs_2["text"].iloc[i]}')
print(f'Cleaned text: \n{final_df_jobs["text"].iloc[i]}')

Original text: 
Front Desk Receptionist - English & Spanish OfficeTeam Miami Seasonal/Temp Ref ID: 01130-9732299Classification: Office/Admin Supervisor/MgrCompensation: DOESeeking front desk receptionist who is able to assist with general office duties. Including answering busy phones, providing excellent customer service, filing, scanning, etc. Please respond with a copy of your resume [Click Here to Email Your Resumé]
Cleaned text: 
front desk receptionist english spanish officeteam miami seasonal temp ref classification office admin supervisor mgrcompensation doeseeking front desk receptionist able assist general office duties include answer busy phone provide excellent customer service file scan etc please respond copy resume click email resumé
