# Lesson 11: Document classifier (cont)
Classify career levels

In [58]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTEN

In [4]:
def filter_location(str):
    pattern = r"\,\s[A-Z]{2}$"
    result = re.findall(pattern,str)
    if len(result) == 1:
        return result[0][2:]
    else: 
        return str
filter_location("Boston, MA")

'MA'

In [5]:
data = pd.read_excel("../datasets/final_project.ods", engine='odf',dtype="string")
data['location'] = data['location'].apply(filter_location)
data.head(2)

Unnamed: 0,title,location,description,function,industry,career_level
0,Technical Professional Lead - Process,TX,"Responsible for the study, design, and specifi...",production_manufacturing,Machinery and Industrial Facilities Engineering,senior_specialist_or_project_manager
1,Cnslt - Systems Eng- Midrange 1,WA,"Participates in design, development and implem...",information_technology_telecommunications,Financial Services,senior_specialist_or_project_manager


In [6]:
text_columns = ['title', 'description', 'industry']
cat_columns = ['location', 'function']

# Fill missing text with empty strings
data[text_columns] = data[text_columns].fillna('')

# Fill missing categories with a placeholder
data[cat_columns] = data[cat_columns].fillna('unknown')

### Split data

In [7]:
target = "career_level"
X = data.drop(labels=target, axis=1)
y = data[target]
X.head(2)

Unnamed: 0,title,location,description,function,industry
0,Technical Professional Lead - Process,TX,"Responsible for the study, design, and specifi...",production_manufacturing,Machinery and Industrial Facilities Engineering
1,Cnslt - Systems Eng- Midrange 1,WA,"Participates in design, development and implem...",information_technology_telecommunications,Financial Services


In [23]:
# stratify=y ensures the same label proportion across all subsets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42, stratify=y)
print(X_train.shape)
print(y_test.shape)

(6459, 5)
(1615,)


### Balance the data

In [None]:
# sampling theo số lượng mong muốn
ros = RandomOverSampler(random_state=42, sampling_strategy={
    "bereichsleiter":1000,
"director_business_unit_leader":500,
"specialist":    500,
"managing_director_small_medium_company": 500
})
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
# Chaỵ thử với SMOTE -> error, chỉ áp dụng numerical features, nhưng có một vài biến thể hoạt động được
ros = SMOTE(random_state=42, sampling_strategy={
    "bereichsleiter":1000,
"director_business_unit_leader":500,
"specialist":    500,
"managing_director_small_medium_company": 500
})
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

ValueError: could not convert string to float: 'Manager-Manufacturing Engineering'

In [39]:
# Chaỵ thử với SMOTEN, nhưng cần đảm bảo số lượng samples
ros = SMOTEN(random_state=42, k_neighbors=2, sampling_strategy={
    "bereichsleiter":1000,
"director_business_unit_leader":500,
"specialist":    500,
"managing_director_small_medium_company": 500
})
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [31]:
print("X train",X_train.shape)
print("X test",X_test.shape)
print("X resampled ",X_resampled.shape)
print("Y resampled", y_resampled.shape)

X train (6459, 5)
X test (1615, 5)
X resampled  (8108, 5)
Y resampled (8108,)


In [42]:
y_train.value_counts()

career_level
senior_specialist_or_project_manager      3470
manager_team_leader                       2138
bereichsleiter                             768
director_business_unit_leader               56
specialist                                  24
managing_director_small_medium_company       3
Name: count, dtype: Int64

In [41]:
y_resampled.value_counts()

career_level
senior_specialist_or_project_manager      3470
manager_team_leader                       2138
bereichsleiter                            1000
specialist                                 500
director_business_unit_leader              500
managing_director_small_medium_company     500
Name: count, dtype: Int64

In [None]:
# Improvement 1: loại bỏ một số token có tf-idf quá cao hoặc quá thấp (min_df, max_df)
preprocessor = ColumnTransformer(transformers=[
    ("title_feature", TfidfVectorizer(stop_words='english'), 'title'), # Tf-idf, n_gram=(1,1)
    ('location_feature', OneHotEncoder(handle_unknown='ignore'), ['location']),  # Onehot
    ('description_feature', TfidfVectorizer(ngram_range=(1,2), stop_words='english', min_df=0.01, max_df=0.95),'description'),     # unigrams + bigrams
    ('function_feature', OneHotEncoder(handle_unknown='ignore'), ['function']),  # Onehot
    ('industry_feature', TfidfVectorizer(ngram_range=(1,2), stop_words='english'), 'industry')     # unigrams + bigrams
])

In [63]:
# Improvement 2: Chỉ chọn một số features quan trọng nhất (SelectKBest)
cls = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("features", SelectKBest(chi2, k=200)), # k=1000, 200, ...
    
    ("model", RandomForestClassifier())
])

In [64]:
# Chỉ Chạy preprocessor (improvement 1 testing)
processed_data = cls.fit_transform(X_train)
print(processed_data.shape)

AttributeError: This 'Pipeline' has no attribute 'fit_transform'

In [65]:
cls.fit(X_train,y_train)
# y_pred = cls.predict(X_test)
# X_train

In [66]:
y_pred = cls.predict(X_test)
y_pred

array(['senior_specialist_or_project_manager',
       'senior_specialist_or_project_manager',
       'senior_specialist_or_project_manager', ..., 'manager_team_leader',
       'manager_team_leader', 'manager_team_leader'], dtype=object)

In [67]:
print(classification_report(y_test,y_pred))

                                        precision    recall  f1-score   support

                        bereichsleiter       0.52      0.34      0.42       192
         director_business_unit_leader       0.67      0.29      0.40        14
                   manager_team_leader       0.61      0.64      0.62       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.81      0.87      0.84       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.72      1615
                             macro avg       0.44      0.36      0.38      1615
                          weighted avg       0.71      0.72      0.71      1615



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
