In [1]:
import pandas as pd
import io
import re
import string
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\medo_\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Reading Data

In [2]:
# from google.colab import files
# uploaded = files.upload()
# data = pd.read_csv('Job titles and industries.csv')

Saving Job titles and industries.csv to Job titles and industries.csv


In [5]:
# Load the raw job-title dataset. Forward slashes avoid the invalid "\l", "\d"
# and "\J" escape sequences a backslash path produces in a normal string, and
# they resolve on every OS (Windows included).
data = pd.read_csv("lib/data/Job titles and industries.csv")

# Data preprocessing

## text cleaning

Remove additional details about the job that appear in the title (address, salary, etc.) and strip punctuation that carries no information. Stop words are not removed: the titles already contain very few words and almost no stop words.

In [6]:
cleanD = data.copy()
# Matches everything that is not part of the title itself:
#   * additional work info following "- " or ", " (address, location, ...)
#   * salary info starting at "£" or "$" -- the "$" must be escaped, otherwise
#     it is the end-of-string anchor and dollar salaries are never matched
#   * parentheses containing a digit or "part" (experience time, part time, ...)
pattern = re.compile(r'((-\s.+)|(,\s.+)|((£|\$).+))|(\(.*(\d|part).*\))')
# Rows yielded by iterrows() may be copies, so assigning to them is not
# guaranteed to update cleanD. Clean the first column (assumed to be the
# 'job title' text) in one vectorized pass and write it back explicitly.
title_column = cleanD.columns[0]
cleanD[title_column] = cleanD[title_column].str.replace(pattern, "", regex=True)

In [7]:
# Remove punctuation that carries no signal, but keep "." and "+" because they
# are part of technology names such as ".net" and "c++".
pattern2 = re.compile(r"""[-!()&,/:'"@_$%*]""")
# Assigning to the rows yielded by iterrows() is not guaranteed to write back
# to cleanD, so apply the substitution to the whole first column (assumed to be
# the 'job title' text) and store the result explicitly.
title_column = cleanD.columns[0]
cleanD[title_column] = cleanD[title_column].str.replace(pattern2, "", regex=True)

In [8]:
#stemming doesn't improve accuracy so it isn't used
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# Tokenizer/stemmer pair, created on the first call to stemming() and reused.
_STEMMING_TOOLS = {}


def stemming(inp):
    """Return `inp` with every token replaced by its English Snowball stem.

    The text is split with NLTK's WordPunctTokenizer, each token is passed
    through a SnowballStemmer created with ``ignore_stopwords=True``, and the
    stemmed tokens are joined back together with single spaces.

    Args:
        inp: The text to stem.

    Returns:
        One string containing the stemmed tokens separated by spaces.
    """
    # Constructing the stemmer with ignore_stopwords=True reads the stopword
    # corpus, so build both helpers once rather than on every call (this
    # function is meant to run per row through Series.apply).
    if not _STEMMING_TOOLS:
        _STEMMING_TOOLS.update(
            tokenizer=nltk.tokenize.WordPunctTokenizer(),
            stemmer=SnowballStemmer("english", ignore_stopwords=True),
        )
    tokens = _STEMMING_TOOLS["tokenizer"].tokenize(inp)
    stem = _STEMMING_TOOLS["stemmer"].stem
    return ' '.join(stem(token) for token in tokens)

In [9]:
#cleanD['job title'] = cleanD['job title'].apply(stemming)

## Balancing Data

To solve the problem of imbalanced data I will use upsampling (oversampling): the number of data points is too small for downsampling, and imblearn's SMOTE (Synthetic Minority Oversampling) isn't preferable for text features.

In [10]:
# Hold out 20% of the cleaned rows for testing; the seed keeps the split fixed.
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

X_train, X_test, y_train, y_test = train_test_split(
    cleanD['job title'],
    cleanD['industry'],
    test_size=0.2,
    random_state=1,
)

# Re-join the training titles with their labels so whole rows are resampled.
X = pd.concat([X_train, y_train], axis=1)

# One frame per industry; IT is the majority class the others are grown to.
df_it = X[X['industry'] == 'IT']
df_marketing = X[X['industry'] == 'Marketing']
df_education = X[X['industry'] == 'Education']
df_accountancy = X[X['industry'] == 'Accountancy']

majority_size = len(df_it)


def _upsample(frame, seed):
    """Draw `majority_size` rows from `frame` with replacement, seeded by `seed`."""
    return resample(frame, replace=True, n_samples=majority_size, random_state=seed)


# Each minority class keeps its own seed so the draws are reproducible.
df_marketing_upsampled = _upsample(df_marketing, 123)
df_education_upsampled = _upsample(df_education, 27)
df_accountancy_upsampled = _upsample(df_accountancy, 56)

# Majority rows first, followed by the upsampled minority classes.
df_upsampled = pd.concat([
    df_it,
    df_marketing_upsampled,
    df_education_upsampled,
    df_accountancy_upsampled,
])

# Show the per-class row counts after balancing.
df_upsampled['industry'].value_counts()

Accountancy    3797
IT             3797
Education      3797
Marketing      3797
Name: industry, dtype: int64

Now we have balanced data, and all classes have the same number of training points.

## Text Vectorization

In [11]:
# TF-IDF features: fit the vocabulary (capped at 1000 terms) on the balanced
# training titles only, then map both training and held-out titles onto it.
from sklearn.feature_extraction.text import TfidfVectorizer

train_titles = df_upsampled['job title']
vectorizer = TfidfVectorizer(max_features=1000).fit(train_titles)

x_train = vectorizer.transform(train_titles)
x_test = vectorizer.transform(X_test)

# Labels aligned with the rows of x_train.
y_train = df_upsampled['industry']

In [12]:
# Bare expression: the notebook displays the sparse test matrix summary
# (shape and number of stored elements) as the cell output.
x_test

<1718x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4857 stored elements in Compressed Sparse Row format>

# Training

In [16]:
# Fit a 1000-tree random forest on the densified TF-IDF training matrix, then
# predict an industry for every held-out title (scored in the next cell).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# fit() returns the estimator itself, so construction and training can chain.
classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=1,
).fit(x_train.toarray(), y_train)

prediction = classifier.predict(x_test.toarray())

# View test results

In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print, in order: the confusion matrix, the per-class precision/recall report,
# and the overall accuracy of the predictions against the held-out labels.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, prediction))

[[ 66   6   1   5]
 [  3 241   4  12]
 [  4   2 918  25]
 [  3  23  30 375]]
              precision    recall  f1-score   support

 Accountancy       0.87      0.85      0.86        78
   Education       0.89      0.93      0.91       260
          IT       0.96      0.97      0.97       949
   Marketing       0.90      0.87      0.88       431

   micro avg       0.93      0.93      0.93      1718
   macro avg       0.90      0.90      0.90      1718
weighted avg       0.93      0.93      0.93      1718

0.9313154831199069


# Saving model and vectorizer for later use by Flask

In [18]:
import pickle

# Persist the fitted classifier and vectorizer. The context managers close
# (and flush) each file even if pickling raises, instead of leaving the handle
# open until garbage collection.
with open('lib/models/Classifier.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)
with open('lib/models/Vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)