# NLP - IF-IDF - Document Classifier Model
Goal: Classify Career levels

In [149]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
import re
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression


In [138]:
def filter_location(str):
    pattern = r"\,\s[A-Z]{2}$"
    result = re.findall(pattern,str)
    if len(result) == 1:
        return result[0][2:]
    else: 
        return str
filter_location("Boston, MA")

'MA'

In [139]:
# Làm gọn location
data = pd.read_excel("../datasets/final_project.ods", engine='odf',dtype="string")
data['location'] = data['location'].apply(filter_location)
data.head()


Unnamed: 0,title,location,description,function,industry,career_level
0,Technical Professional Lead - Process,TX,"Responsible for the study, design, and specifi...",production_manufacturing,Machinery and Industrial Facilities Engineering,senior_specialist_or_project_manager
1,Cnslt - Systems Eng- Midrange 1,WA,"Participates in design, development and implem...",information_technology_telecommunications,Financial Services,senior_specialist_or_project_manager
2,SharePoint Developers and Solution Architects,TX,We are currently in need of Developers who can...,consulting,IT Consulting,senior_specialist_or_project_manager
3,Business Information Services - Strategic Acco...,North Carolina,Experian is seeking an experienced Account Exe...,sales,"Security, Risk, Restructuring Consulting",senior_specialist_or_project_manager
4,Strategic Development Director (procurement),TX,Â Want to join a world-class global procuremen...,procurement_materials_logistics,Information Technology,bereichsleiter


In [91]:
data['career_level'].unique()

<StringArray>
[  'senior_specialist_or_project_manager',
                         'bereichsleiter',
                    'manager_team_leader',
          'director_business_unit_leader',
                             'specialist',
 'managing_director_small_medium_company']
Length: 6, dtype: string

## Preprocessing

### Split data

In [140]:
# Split data into features and target
target = 'career_level'
X = data.drop(labels=target,axis=1)
y = data[target]

# Split for train & test
# stratify=y ensures the same label proportion across all subsets.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [93]:
print(y_train.value_counts())
print('--------------------------------------')
print(y_test.value_counts())

career_level
senior_specialist_or_project_manager      3470
manager_team_leader                       2138
bereichsleiter                             768
director_business_unit_leader               56
specialist                                  24
managing_director_small_medium_company       3
Name: count, dtype: Int64
--------------------------------------
career_level
senior_specialist_or_project_manager      868
manager_team_leader                       534
bereichsleiter                            192
director_business_unit_leader              14
specialist                                  6
managing_director_small_medium_company      1
Name: count, dtype: Int64


### "title" column

In [94]:
# "Title"
vectorizer = TfidfVectorizer(stop_words='english')

result = vectorizer.fit_transform(X_train['title'])
print(vectorizer.vocabulary_)
print(len(vectorizer.vocabulary_))
print(result.shape)

{'laboratory': 1542, 'manager': 1650, 'savannah': 2404, 'business': 407, 'dynamics': 887, 'consultants': 631, 'new': 1865, 'york': 3009, 'sr': 2570, 'systems': 2681, 'engineer': 958, 'head': 1264, 'sales': 2390, 'elements': 929, 'software': 2527, 'engineering': 960, 'senior': 2458, 'program': 2178, 'officer': 1919, 'chemistry': 497, 'manufacturing': 1661, 'controls': 653, 'regional': 2287, 'western': 2957, 'canada': 424, 'enterprise': 972, 'executive': 1023, 'midwest': 1763, 'security': 2444, 'solutions': 2533, 'architect': 223, 'associate': 253, 'benefits': 326, 'consulting': 635, 'itc': 1482, 'enhancements': 967, 'inside': 1408, 'development': 794, 'project': 2185, 'ii': 1356, 'digital': 809, 'analytics': 187, 'director': 818, 'training': 2771, 'quality': 2241, 'corporate': 669, 'major': 1643, 'accounts': 86, 'team': 2698, 'west': 2955, 'planning': 2090, 'analysis': 183, 'management': 1649, 'safety': 2386, 'commercial': 584, 'excellence': 1018, 'asia': 242, 'pacific': 1995, 'marketin

In [95]:
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(result.toarray(), columns=feature_names)
df

Unnamed: 0,00,01ef,04052,0442,05200,0656,0672,0675,0762,0810,...,year,years,yield,york,youtube,zone,zp,zpic,zv,zynga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.431547,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
data['location'].unique()

array(['TX', 'WA', 'North Carolina', 'USA', 'NY', 'DC', 'IL', 'DE', 'CT',
       'LA', 'CA', 'NH', 'New Jersey', 'CO', 'OR', 'Connecticut', 'VA',
       'MA', 'MI', 'FL', 'GA', 'AZ', 'AK', 'NJ', 'New York', 'MO', 'OK',
       'IN', 'PA', 'MD', 'MN', 'Virginia', 'AL', 'WI', 'ME',
       'Pennsylvania', 'NE', 'OH', 'HI', 'Arkansas', 'Minnesota',
       'Nebraska', 'Massachusetts', 'Washington', 'NC', 'SD', 'UT', 'NM',
       'Illinois', 'Tennessee', 'ND', 'RI', 'PR', 'California',
       'Missouri', 'Georgia', 'Texas', 'Utah', 'WV', 'Maryland', 'KS',
       'KY', 'Ohio', 'South Carolina', 'ID', 'SC', 'IA', 'Hawaii', 'NV',
       'TN', 'Michigan', 'Indiana', 'Florida', 'VT', 'Maine', 'AR',
       'Colorado', 'Arizona', 'Kansas', 'MS', 'Mississippi', 'Wisconsin',
       'West Virginia', 'Nevada', 'Niue', 'Oregon', 'New Hampshire',
       'Calgary', 'Canada', 'New Mexico', 'Alabama', 'Iowa', 'Delaware',
       'Toronto', 'Edmonton', 'North Dakota', 'Kentucky'], dtype=object)

In [None]:
# Location
# Rút gọn, chỉ lấy mã bang -> Có nhiều cách để lọc mã bang
# Lọc khi đọc dữ liệu, hoặc sau đó (Xem ở trên)
# Sau khi làm gọn, chỉ còn lại vài chục giá trị mã bang
encoder = OneHotEncoder()
result = encoder.fit_transform(X_train[['location']])
result.shape

(6459, 94)

### "description" column

In [None]:

# Với title, dùng unigram là vừa. Nhưng với description, nên kết hợp uni and bigram
X_train['description'] = X_train['description'].fillna('')
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2))

result = vectorizer.fit_transform(X_train['description'])
# print(vectorizer.vocabulary_)
print(len(vectorizer.vocabulary_))
print(result.shape) 
# unigrams  shape: (6459, 66943)
# unigrams + bigrams gram shape: (6459, 848945)

848945
(6459, 848945)


### "function" column

In [None]:
data['function'].nunique()
# 19 => dung onehot đơn giản hơn

19

### industry column

In [121]:
data['industry'].nunique()
# 352 => TF-IDF

352

# Conclusion
- Các columns trên đều dùng Onehot và IFIDF được
- TFIDF: text dài, text ngắn nhưng nhiều giá trị khác nhau
- Onehot: 

## Self-build (not included in lesson 10)

In [144]:
# Split types of columns for onehot or tf-idf
text_columns = ['title', 'description', 'function']
# short text or lesser unique values
categorical_columns = ['location', 'industry']

In [152]:
# TF-IDF transformer, use Functiontransformer
tf_idf_transformer = Pipeline(steps=[
    ('fillna', FunctionTransformer(lambda: x.fillna(''))),
    ('vectorizer', TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=10000))
])

In [151]:
# Onehot Transfomer
one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [162]:
preprocessor = ColumnTransformer(transformers=[
    ('tf_idf_transformer',tf_idf_transformer, text_columns),
    ('one_hot_transformer', one_hot_transformer,categorical_columns)
])

In [163]:
logis = Pipeline(steps=[
    ('preprocessor', preprocessor),
    'classifier', LogisticRegression()
])