# Lesson 11: Document classifier (cont)
Classify the career level of each record from its title, location, description, function and industry.

In [101]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [73]:
def filter_location(str):
    """Reduce a ``"City, ST"`` style location to its trailing two-letter code.

    Args:
        str: The location value. The parameter name shadows the builtin
            ``str``; it is kept unchanged so existing callers keep working.

    Returns:
        The two uppercase ASCII letters at the end of the value when it ends
        with a comma, one whitespace character and exactly two uppercase
        letters (e.g. ``"Boston, MA"`` -> ``"MA"``). Any other value is
        returned unchanged, including non-text values such as ``None`` or
        ``pd.NA``, which previously raised ``TypeError``.
    """
    pattern = r"\,\s([A-Z]{2})$"
    try:
        match = re.search(pattern, str)
    except TypeError:
        # Missing / non-text cells cannot be searched; pass them through
        # untouched so a later fillna() can deal with them.
        return str
    if match is None:
        return str
    # Group 1 is just the two letters, without the leading ", ".
    return match.group(1)
filter_location("Boston, MA")

'MA'

In [74]:
# Load the ODS workbook, reading every column with pandas' "string" dtype.
data = pd.read_excel(
    "../datasets/final_project.ods",
    engine='odf',
    dtype="string",
)
# Replace each location with whatever filter_location returns for it
# (the trailing two-letter code when the value looks like "City, ST").
data = data.assign(location=data['location'].apply(filter_location))
data.head(2)

Unnamed: 0,title,location,description,function,industry,career_level
0,Technical Professional Lead - Process,TX,"Responsible for the study, design, and specifi...",production_manufacturing,Machinery and Industrial Facilities Engineering,senior_specialist_or_project_manager
1,Cnslt - Systems Eng- Midrange 1,WA,"Participates in design, development and implem...",information_technology_telecommunications,Financial Services,senior_specialist_or_project_manager


In [75]:
text_columns = ['title', 'description', 'industry']
cat_columns = ['location', 'function']

# Missing text becomes an empty string; missing categories get the
# 'unknown' placeholder so they form their own category.
for columns, filler in ((text_columns, ''), (cat_columns, 'unknown')):
    data[columns] = data[columns].fillna(filler)

### Split data

In [76]:
# The label to predict; every other column stays in the feature frame.
target = "career_level"
y = data[target]
X = data.drop(columns=target)
X.head(2)

Unnamed: 0,title,location,description,function,industry
0,Technical Professional Lead - Process,TX,"Responsible for the study, design, and specifi...",production_manufacturing,Machinery and Industrial Facilities Engineering
1,Cnslt - Systems Eng- Midrange 1,WA,"Participates in design, development and implem...",information_technology_telecommunications,Financial Services


In [77]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in the train and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(X_train.shape, y_test.shape, sep="\n")

(6459, 5)
(1615,)


In [93]:
# Per-column feature extraction. Columns of X that are not listed here are
# dropped by ColumnTransformer's default remainder='drop'.
preprocessor = ColumnTransformer(transformers=[
    ("title_feature", TfidfVectorizer(stop_words='english'), 'title'), # Tf-idf, unigrams only (default ngram_range)
    ('location_feature', OneHotEncoder(handle_unknown='ignore'), ['location']),  # One-hot; unseen categories encode as all zeros
    ('description_feature', TfidfVectorizer(ngram_range=(1,2), stop_words='english'),'description'),     # unigrams + bigrams
    ('function_feature', OneHotEncoder(handle_unknown='ignore'), ['function']),  # One-hot; unseen categories encode as all zeros
    ('industry_feature', TfidfVectorizer(ngram_range=(1,2), stop_words='english'), 'industry')     # unigrams + bigrams
])
# random_state pins the forest's bootstrap and feature sampling so that a
# Restart & Run All reproduces the same fitted model and predictions
# (same seed as the train/test split).
cls = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42))
])

In [94]:
# Fit the preprocessing steps and the classifier on the training split only.
cls.fit(X_train, y_train)

In [95]:
# Predict a career-level label for every held-out row using the fitted pipeline.
y_pred = cls.predict(X_test)
# Bare last expression: displays the (truncated) array of predicted labels.
y_pred

array(['manager_team_leader', 'senior_specialist_or_project_manager',
       'senior_specialist_or_project_manager', ...,
       'senior_specialist_or_project_manager',
       'senior_specialist_or_project_manager',
       'senior_specialist_or_project_manager'], dtype=object)

In [104]:
# Per-class precision / recall / F1 on the held-out split.
# zero_division=0 reports 0.0 for a class whose metric is undefined (for
# example, a label that is never predicted) without emitting an
# UndefinedMetricWarning; the reported numbers are the same as the default.
print(classification_report(y_test, y_pred, zero_division=0))

                                        precision    recall  f1-score   support

                        bereichsleiter       0.64      0.05      0.09       192
         director_business_unit_leader       1.00      0.29      0.44        14
                   manager_team_leader       0.62      0.53      0.57       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.72      0.95      0.82       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.69      1615
                             macro avg       0.50      0.30      0.32      1615
                          weighted avg       0.68      0.69      0.64      1615



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
