<center>
<h1>
Name: Nader Mohamed Elhadedy<br>
Task: Job title Classification by industry<br>
Live Host: <a href="https://job-title-classifier.herokuapp.com/">Link</a><br>


</h1>

</center>

---

> # Loading Packages

In [None]:
# import helping libraries

import numpy as np
import pandas as pd
import seaborn as sns
import nltk, re
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [None]:
# fetch dataset from github
url = 'https://raw.githubusercontent.com/naderelhadedy/job_titles_dataset/main/Job%20titles%20and%20industries.csv'
df = pd.read_csv(url)

> # Doing some EDA

In [None]:
# print head of data
df.head()

Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor - co...,IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT


In [None]:
# print tail of data
df.tail()

Unnamed: 0,job title,industry
8581,data entry clerk,Marketing
8582,content creator,Marketing
8583,sales & marketing manager,Marketing
8584,marketing & digital marketing consultant,Marketing
8585,creative copywriter (arabic/english),Marketing


In [None]:
# print shape of data
df.shape

(8586, 2)

In [None]:
# check null values
df.isna().sum()

job title    0
industry     0
dtype: int64

In [None]:
# check duplicates in 'job title' column
df.duplicated(subset='job title').sum()

4696

In [None]:
# drop duplicates
df.drop_duplicates(subset='job title', inplace=True)
df.shape

(3890, 2)

In [None]:
# get some info about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3890 entries, 0 to 8585
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job title  3890 non-null   object
 1   industry   3890 non-null   object
dtypes: object(2)
memory usage: 91.2+ KB


In [None]:
# get some statistics about dataset
df.describe().T

Unnamed: 0,count,unique,top,freq
job title,3890,3890,geography graduate harlow september 2019,1
industry,3890,4,IT,1528


In [None]:
# check balance of data
df['industry'].value_counts()

IT             1528
Marketing      1151
Education       953
Accountancy     258
Name: industry, dtype: int64

In [None]:
# visualize above results in bar chart

ser = df.industry.value_counts()
fig = px.bar(ser, x=ser.index, y=ser.values, height=500, width=1200, labels={'x':'Industry', 'y':'Count'}, title='Industry Count', template='ggplot2')
fig.update_layout(
    title_font_family="Times New Roman",
    font=dict(
    family="Courier New, monospace",
    size=18,
    color="RebeccaPurple"
    )
)
fig.show()

In [None]:
# visualize percentage of label classes in pie chart

fig = px.pie(ser, names=ser.index, values=ser.values, template='ggplot2', hole=0.1)
fig.update_layout(
    title_font_family="Times New Roman",
    font=dict(
    family="Courier New, monospace",
    size=22,
    color="RebeccaPurple",
    )
)
fig.show()

In [None]:
# store unique classes
industries = df['industry'].unique()

In [None]:
# print row of data by index
def print_plot(index):
    example = df[df.index == index][['job title', 'industry']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Industry:', example[1])

In [None]:
# check index 6
print_plot(6)

devops engineers x 3 - global brand
Industry: IT


In [None]:
# check index 9
print_plot(9)

php web developer £45,000 based in london
Industry: IT


> # Text preprocessing

# Cleaning the job titles

For our data set, the text cleaning step includes removing stop words, change text to lower case, remove punctuation, remove bad characters, and so on.<br>This will be done in 3 steps:<br>


```
- Symbols to be replaced > "REPLACE_BY_SPACE_RE"

- Symbols to be deleted > "BAD_SYMBOLS_RE"

- Predefined stop words in 'english' like common pronouns ("a", "the", ...) > "BAD_SYMBOLS_RE"
```

In [None]:
# download stopwords from NLTK library
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# clean 'job title' column

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;0-9x£]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace by space
    text = BAD_SYMBOLS_RE.sub('', text) # delete from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text

In [None]:
# apply clean_text function on the desired column
df['job title'] = df['job title'].apply(clean_text)

## Checking the same rows above after cleaning

In [None]:
print_plot(6)

devops engineers global brand
Industry: IT


In [None]:
print_plot(9)

php web developer based london
Industry: IT


In [None]:
df['job title'].apply(lambda x: len(x.split(' '))).sum()

15558

## After text preprocessing, we have over **15 thousand** words to work with!

---

> # Importing Models related functions from Sklearn library

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# split all dataset into train set and test set
X = df['job title']
y = df['industry']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # test set: 30%

> # Feature Engineering

After splitting the dataset, the next steps include feature engineering.<br>To do that, We will convert our text 'job titles' to a matrix of token counts (CountVectorizer), then transform a count matrix to a normalized tf-idf representation (tf-idf transformer). After that, we will train several classifiers and check results. We will combine these steps in a **pipeline**.

## **Model 1) Naive Bayes Classifier**

In [None]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=industries))

accuracy 0.8680377035132819
              precision    recall  f1-score   support

          IT       0.92      0.46      0.61        76
   Marketing       0.96      0.85      0.90       274
   Education       0.89      0.90      0.89       486
 Accountancy       0.78      0.93      0.85       331

    accuracy                           0.87      1167
   macro avg       0.89      0.79      0.81      1167
weighted avg       0.88      0.87      0.86      1167



## **Model 2) Logistic Regression Classifier**

In [None]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=100)),
               ])

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=industries))

accuracy 0.8586118251928021
              precision    recall  f1-score   support

          IT       0.83      0.64      0.73        76
   Marketing       0.90      0.85      0.88       274
   Education       0.87      0.90      0.89       486
 Accountancy       0.81      0.85      0.83       331

    accuracy                           0.86      1167
   macro avg       0.85      0.81      0.83      1167
weighted avg       0.86      0.86      0.86      1167



## **Model 3) Linear SVM Classifier**

In [None]:
linearsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
               ])

linearsvc.fit(X_train, y_train)

y_pred = linearsvc.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=industries))

accuracy 0.897172236503856
              precision    recall  f1-score   support

          IT       0.88      0.79      0.83        76
   Marketing       0.95      0.89      0.92       274
   Education       0.92      0.91      0.91       486
 Accountancy       0.84      0.91      0.87       331

    accuracy                           0.90      1167
   macro avg       0.90      0.87      0.88      1167
weighted avg       0.90      0.90      0.90      1167



> # Comparison between models (applying cross validation)

In [None]:
models = [
    MultinomialNB(),
    LogisticRegression(n_jobs=1, C=1e5, max_iter=100),
    LinearSVC(),
]

# 5 Cross-validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))


pipe = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
              ])

Xtr = pipe.fit_transform(X, y)

entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, Xtr, y, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
          ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc.sort_values('Mean Accuracy', ascending=False)

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearSVC,0.874807,0.064464
MultinomialNB,0.855784,0.067523
LogisticRegression,0.834961,0.062637


In [None]:
fig = px.box(cv_df, x='model_name', y='accuracy', title='MEAN ACCURACY (cv = 5)', height=500, width=1200, template='ggplot2')
fig.update_layout(
    title_font_family="Times New Roman",
    font=dict(
    family="Courier New, monospace",
    size=18,
    color="RebeccaPurple",
    )
)
fig.show()

> # As shown above from the table and the figure, **Linear SVC classifier** performs very well and it's the best overall because it has `high mean accuracy and high f1-score` in all classes.



> # Classification metrics

#### There are many metrics to use or focus on while testing the algorithm on our data like precision, recall, f1-score, etc ... These metrics depend on the application whether to choose recall as a metric or f1-score, so, in our application we focused on **f1-score** as it depends on `precision and recall` and these two metrics depend on `FN and FP` values and both are important here in our application as we don't want the model to predict a job title as IT for example and it's not, and either don't want it to say it's not IT and it is, that's all.



> # Plotting confusion matrix for train & test sets using LinearSVC

In [None]:
# confusion matrix on train set

conf_mat = confusion_matrix(y_train, linearsvc.predict(X_train), normalize="true")

colorscale=[[0.0, 'rgb(255,255,255)'], [.2, 'rgb(255, 255, 153)'],
            [.4, 'rgb(153, 255, 204)'], [.6, 'rgb(179, 217, 255)'],
            [.8, 'rgb(240, 179, 255)'],[1.0, 'rgb(255, 77, 148)']]

fig = ff.create_annotated_heatmap(conf_mat.round(2), x=list(industries), y=list(industries), colorscale=colorscale)
fig.update_layout(
    title_font_family="Times New Roman",
    title='CONFUSION MATRIX - LinearSVC - Train Set',
    height=500,
    width=1000,
    xaxis=go.layout.XAxis(
    title=go.layout.xaxis.Title(
        text='Predicted',
    )),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Actual',
        )
    ),
    font=dict(
    family="Courier New, monospace",
    size=22,
    color="RebeccaPurple",
    )
)
fig.show()

In [None]:
# confusion matrix on test set

conf_mat = confusion_matrix(y_test, linearsvc.predict(X_test), normalize="true")

colorscale=[[0.0, 'rgb(255,255,255)'], [.2, 'rgb(255, 255, 153)'],
            [.4, 'rgb(153, 255, 204)'], [.6, 'rgb(179, 217, 255)'],
            [.8, 'rgb(240, 179, 255)'],[1.0, 'rgb(255, 77, 148)']]

fig = ff.create_annotated_heatmap(conf_mat.round(2), x=list(industries), y=list(industries), colorscale=colorscale)
fig.update_layout(
    title_font_family="Times New Roman",
    title='CONFUSION MATRIX - LinearSVC - Test Set',
    height=500,
    width=1000,
    xaxis=go.layout.XAxis(
    title=go.layout.xaxis.Title(
        text='Predicted',
    )),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Actual',
        )
    ),
    font=dict(
    family="Courier New, monospace",
    size=22,
    color="RebeccaPurple",
    )
)
fig.show()

> We got good results in the confusion matrix as the True values percentages are high and False values are low but there is only an issue in classifying some job titles between IT and Education as 0.14 is not a small number but still acceptable.

> # Testing model on various job titles

In [None]:
linearsvc.predict(['full stack developer'])

array(['IT'], dtype=object)

In [None]:
linearsvc.predict(['accountant'])

array(['Accountancy'], dtype=object)

In [None]:
linearsvc.predict(['content creator'])

array(['Marketing'], dtype=object)

In [None]:
linearsvc.predict(['mentor'])

array(['Education'], dtype=object)

In [None]:
linearsvc.predict(['Chief Technology Officer $680000 + 10%'])

array(['IT'], dtype=object)

In [None]:
linearsvc.predict(['CTO'])

array(['IT'], dtype=object)

In [None]:
linearsvc.predict(['CEO'])

array(['Marketing'], dtype=object)

In [None]:
linearsvc.predict(['Manager'])

array(['Marketing'], dtype=object)

> # Saving model and loading again

In [None]:
from joblib import dump, load

In [None]:
dump(linearsvc, 'model.joblib')

['model.joblib']

In [None]:
model_in = load('model.joblib')

model_in.predict(['Cyber Security'])

array(['IT'], dtype=object)

---