In [533]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import matplotlib.pyplot as plt

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import re

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,  plot_confusion_matrix, classification_report

In [534]:
DATASET_PATH = 'Job titles and industries.csv'

## EDA

In [535]:
df = pd.read_csv(DATASET_PATH)

### Data shape

In [536]:
print('Number of instances, features', df.shape)

Number of instances, features (8586, 2)


### Data Peek

In [537]:
print(df.head())

                                           job title industry
0  technical support and helpdesk supervisor - co...       IT
1                  senior technical support engineer       IT
2                                head of it services       IT
3                              js front end engineer       IT
4                   network and telephony controller       IT


### Data Information

In [538]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8586 entries, 0 to 8585
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job title  8586 non-null   object
 1   industry   8586 non-null   object
dtypes: object(2)
memory usage: 134.3+ KB
None


### Data Description

In [539]:
print(df.describe())

                  job title industry
count                  8586     8586
unique                 3890        4
top     marketing executive       IT
freq                     91     4746


### Class Distribution


In [540]:
print(df.groupby('industry').size())

industry
Accountancy     374
Education      1435
IT             4746
Marketing      2031
dtype: int64


## Data Preparation

### Clean Job Titles Text

In [541]:
def clean_text(txt):
    """
    Tokenize text and remove stopwords.
    
    Parameters
    ----------
    txt : str
        String to be preprocessed.
    
    Returns
    -------
    res
        Clean string.
    """
    
    res = ""
    
    for token in simple_preprocess(txt):
        if token not in STOPWORDS and len(token) >= 2:
            res+=token+" "
            
    return res

In [542]:
def remove_special_chars(df):
    """
    Remove special characters from job title column.
    
    Parameters
    ----------
    df : pandas dataframe
        Dataframe to be preprocessed.
    
    Returns
    -------
    df
        Dataframe without duplicates.
    """
    
    clean_jobs = []
    
    for job in df['job title']:
        job = re.sub("[^A-Za-z]+"," ",job)
        clean_jobs.append(job)

    df['job title'] = clean_jobs
    
    return df

In [543]:
def preprocess_data(df):  
    """
    Preprocess data in a series of steps using pre-defined functions.
    
    Parameters
    ----------
    df : pandas dataframe
        Dataframe to be preprocessed.
    
    Returns
    -------
    df
        Preprocessed dataframe.
    """
    df = remove_special_chars(df)
    df['job title'] = df['job title'].map(clean_text)
    
    return df

### Check Preprocessing Output

In [544]:
df = preprocess_data(df)
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8586 entries, 0 to 8585
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job title  8586 non-null   object
 1   industry   8586 non-null   object
dtypes: object(2)
memory usage: 134.3+ KB
None
                                           job title industry
0  technical support helpdesk supervisor county b...       IT
1                 senior technical support engineer        IT
2                                     head services        IT
3                                   js end engineer        IT
4                      network telephony controller        IT


### Resampling to Balance Data

In [545]:
"""
Marketing = len(df[df['industry'] == 'Marketing'])
Education = len(df[df['industry'] == 'Education'])
Accountancy = len(df[df['industry'] == 'Accountancy'])
Education = len(df[df['industry'] == 'Education'])

# Majority class indices
majority_class_indicies = df[df['industry'] == 'IT'].index 

sampling_arr = [Marketing , Education , Accountancy]
average = np.mean(sampling_arr)

random_majority_indicies = np.random.choice(majority_class_indicies , int(average) , replace = False)

Marketing_class_indicies = df[df['industry'] == 'Marketing'].index 
Education_class_indicies = df[df['industry'] == 'Education'].index 
Accountancy_class_indicies = df[df['industry'] == 'Accountancy'].index 

under_sample_indicies = np.concatenate([random_majority_indicies , Marketing_class_indicies , Education_class_indicies , Accountancy_class_indicies ])
under_sampling = df.loc[under_sample_indicies]

df = under_sampling
"""

"\nMarketing = len(df[df['industry'] == 'Marketing'])\nEducation = len(df[df['industry'] == 'Education'])\nAccountancy = len(df[df['industry'] == 'Accountancy'])\nEducation = len(df[df['industry'] == 'Education'])\n\n# Majority class indices\nmajority_class_indicies = df[df['industry'] == 'IT'].index \n\nsampling_arr = [Marketing , Education , Accountancy]\naverage = np.mean(sampling_arr)\n\nrandom_majority_indicies = np.random.choice(majority_class_indicies , int(average) , replace = False)\n\nMarketing_class_indicies = df[df['industry'] == 'Marketing'].index \nEducation_class_indicies = df[df['industry'] == 'Education'].index \nAccountancy_class_indicies = df[df['industry'] == 'Accountancy'].index \n\nunder_sample_indicies = np.concatenate([random_majority_indicies , Marketing_class_indicies , Education_class_indicies , Accountancy_class_indicies ])\nunder_sampling = df.loc[under_sample_indicies]\n\ndf = under_sampling\n"

### Splitting Data to Train and Test Sets

In [546]:
X = df['job title']
y = df['industry']

In [547]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# To overcome imbalance in data
weights = compute_sample_weight("balanced", y_train)

## Evaluating Models

In [548]:
models = []
models.append(('Logistic Regression:', LogisticRegression()))
models.append(('Multinomial Naive Bayse:', MultinomialNB()))
models.append(('SVM:', SVC()))
models.append(('SGD Classifier:', SGDClassifier()))
models.append(('Random Forest:', RandomForestClassifier()))
models.append(('KNN:', KNeighborsClassifier()))

# Features Extraction
pipeline = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer())])
X_train_eval = pipeline.fit_transform(X_train)
                    
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=9)
    cv_results = cross_val_score(model, X_train_eval, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

Logistic Regression: 0.925451 (0.006465)
Multinomial Naive Bayse: 0.915404 (0.003727)
SVM: 0.934771 (0.007136)
SGD Classifier: 0.930548 (0.005727)
Random Forest: 0.932732 (0.008603)
KNN: 0.911036 (0.013220)


### Train and Test Best Model (SVC Classifier)

In [549]:
# Some hyper-parameter tuning, Train
final_model = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('svc', SVC(kernel='linear'))])
final_model.fit(X_train, y_train, **{'svc__sample_weight': weights})

# Test                       
train_result = final_model.predict(X_train)
test_result = final_model.predict(X_test)

# Evaluate
print('Training score:', accuracy_score(y_train, train_result))
print('Testing score:', accuracy_score(y_test, test_result))

Training score: 0.9509318578916716
Testing score: 0.9254947613504074


### Further Evaluation

### Classification Report

In [550]:
print(classification_report(y_test, test_result))

              precision    recall  f1-score   support

 Accountancy       0.81      0.86      0.83        69
   Education       0.90      0.90      0.90       309
          IT       0.97      0.94      0.96       942
   Marketing       0.87      0.91      0.89       398

    accuracy                           0.93      1718
   macro avg       0.89      0.90      0.89      1718
weighted avg       0.93      0.93      0.93      1718



## Save Final Model for the API

In [551]:
from joblib import dump, load
dump(final_model, 'model.joblib')

['model.joblib']