In [1]:
import nltk
import string
import pandas as pd
import numpy as np
import sklearn
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the job-title CSV with pandas and order the rows by industry.
data = pd.read_csv("Job titles and industries.csv").sort_values("industry")

In [3]:
# Keep only the first row for every distinct job title.
data = data.drop_duplicates(subset=["job title"], keep="first")

In [4]:
# Remove every ASCII punctuation character from the job titles.
_punct_table = str.maketrans('', '', string.punctuation)
data['cleaned'] = data['job title'].map(lambda title: title.translate(_punct_table))

In [5]:
# Split each cleaned job title into word tokens with NLTK.
data['tokenized_sents'] = data['cleaned'].map(nltk.word_tokenize)

In [6]:
# Word embedding using GloVe
# Read the GloVe vectors from disk

def load_file_glove(path='glove.6B.300d.txt'):
    """Load GloVe word vectors from a text file into a dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated float components.

    Parameters
    ----------
    path : str, optional
        Location of the vector file. Defaults to ``'glove.6B.300d.txt'`` in
        the working directory, the file this notebook originally read.

    Returns
    -------
    dict
        Maps each token (str) to its vector as a list of floats.

    Raises
    ------
    ValueError
        If a component on a line cannot be parsed as a float.
    """
    embeddings = {}
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(path, 'r', encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # A blank line has no token; indexing parts[0] would raise.
                continue
            embeddings[parts[0]] = [float(value) for value in parts[1:]]
    return embeddings

In [7]:
# Load the GloVe vectors once; later cells reuse this dictionary.
glove_dict = load_file_glove()

In [8]:
# Look up each token of the data in the GloVe dictionary and collect its vector; tokens that are not found get an all-zero vector.

def search_dict(data, dict):
    """Convert tokenized sentences into lists of word vectors.

    Parameters
    ----------
    data : iterable of iterable of str
        One token sequence per sentence.
    dict : mapping of str to list of float
        Word-vector lookup table. The parameter name shadows the builtin but
        is kept so existing callers keep working.

    Returns
    -------
    list of list of list
        For every sentence, one vector per token, in token order.

    Notes
    -----
    A token is looked up exactly as written first and then lower-cased.
    Uncased vocabularies (the glove.6B files are distributed as uncased) only
    contain lower-case tokens, so capitalised words would otherwise all be
    treated as unknown. Tokens that are still not found get an all-zero
    vector whose length matches the vectors in ``dict`` (300 if ``dict`` is
    empty).
    """
    # Take the embedding width from the table instead of hard-coding it.
    first_vector = next(iter(dict.values()), None)
    dim = len(first_vector) if first_vector is not None else 300

    total_list = []
    for tokens in data:
        res_list = []
        for word in tokens:
            vector = dict.get(word)
            if vector is None and isinstance(word, str):
                # Fall back to the lower-cased spelling for uncased tables.
                vector = dict.get(word.lower())
            if vector is None:
                # Unknown token: contribute nothing to the sentence sum.
                vector = [0.0] * dim
            res_list.append(vector)
        total_list.append(res_list)
    return total_list

In [9]:
# Word vectors for every tokenized job title (one list of vectors per title).
all_glove_data = search_dict(data['tokenized_sents'],glove_dict)

In [10]:
# Sentence embedding: sum the word vectors of each sentence (job title)
# to get a single vector per sentence.

def sum_method(data, dim=300):
    """Sum the word vectors of each sentence into one sentence vector.

    Parameters
    ----------
    data : iterable of list of vectors
        For every sentence, the list of its word vectors.
    dim : int, optional
        Vector length used for an empty sentence when no other sentence in
        ``data`` reveals the width. Defaults to 300.

    Returns
    -------
    list of list of float
        One summed vector per sentence, in input order. A sentence without
        any word vectors yields an all-zero vector.
    """
    sentences = list(data)

    # Prefer the real embedding width over the default when it can be seen.
    for vectors in sentences:
        if len(vectors) > 0:
            dim = len(vectors[0])
            break

    sum_list = []
    for vectors in sentences:
        if len(vectors) == 0:
            # Builtin sum() over an empty array returns the int 0, which has
            # no .tolist(); emit an explicit zero vector instead of crashing.
            sum_list.append([0.0] * dim)
        else:
            summed = np.sum(np.asarray(vectors, dtype=float), axis=0)
            sum_list.append(summed.tolist())
    return sum_list

In [11]:
# One summed GloVe vector per job title; these are the model features.
all_sum = sum_method(all_glove_data)

In [12]:
# Integer label assigned to each class:
# Accountancy 0
# Education 1
# IT 2
# Marketing 3

def get_label(size_data,label):
    """Return a list that contains ``label`` repeated ``size_data`` times."""
    return [label for _ in range(size_data)]
    

In [13]:
# Rows per industry, read from the de-duplicated frame so the label lists always
# have the same total length as the feature rows. The counts previously
# hard-coded here were 263 (Accountancy), 972 (Education), 1514 (IT) and
# 1141 (Marketing).
# The per-class lists are concatenated in this order later, which lines up with
# the rows only because `data` is sorted by industry.
industry_counts = data['industry'].value_counts()

acc_label = get_label(int(industry_counts.get('Accountancy', 0)), 0)
ed_label = get_label(int(industry_counts.get('Education', 0)), 1)
it_label = get_label(int(industry_counts.get('IT', 0)), 2)
mark_label = get_label(int(industry_counts.get('Marketing', 0)), 3)


In [14]:
# This function concatenates two lists; it is used to put all labels in one list.

def concat_data(list1,list2):
    """Return a new list with the items of ``list1`` followed by those of ``list2``."""
    list_all = list(list1)
    list_all.extend(list2)
    return list_all

In [15]:
# Chain the per-class label lists in class order: Accountancy, Education, IT, Marketing.
list1 = concat_data(acc_label,ed_label)
list2 = concat_data(list1,it_label)

# Full label list, passed to the over-sampler together with the feature vectors.
all_label = concat_data(list2,mark_label)

In [16]:
# Deal with the imbalanced data by over-sampling it.
# NOTE(review): rows duplicated here can end up on both sides of a train/test
# split made from the resampled data, which would inflate the test accuracy;
# consider over-sampling the training portion only.

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(all_sum, all_label)


In [17]:
#function to split train and test

def Train_Test_Split(X, y, test_size=0.33, random_state=42):
    """Split features and labels into train and test parts.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    X : sequence
        Feature vectors.
    y : sequence
        Labels aligned with ``X``.
    test_size : float, optional
        Passed through as ``test_size``. Defaults to 0.33, the value that was
        previously hard-coded.
    random_state : int, optional
        Passed through as ``random_state``. Defaults to 42, the value that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [18]:
# Split the original samples first and over-sample only the training part.
# Splitting already over-sampled data puts copies of the same row in both the
# training and the test set, which makes the measured test accuracy optimistic.
# The test set therefore keeps the original class distribution.
X_train, X_test, y_train, y_test = Train_Test_Split(all_sum, all_label)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [19]:
# I tried logistic regression (multiclass) and a linear SVM; they give an accuracy of 0.905 but fail to converge, so it is better to use KNN, which is also very easy to apply to a multi-class problem.

In [20]:
# def classifier_log(X_train, X_test, y_train, y_test):
#     LogReg = LogisticRegression(class_weight='balanced',random_state=0,solver='sag',C=1e7, multi_class='multinomial',max_iter=10000)
#     LogReg.fit(X_train, y_train)
#     y_pred = LogReg.predict(X_test)
#     c_matrix = confusion_matrix(y_test, y_pred)
#     acc = accuracy_score(y_test, y_pred)
#     return LogReg,acc

In [21]:
# scalar = StandardScaler(copy=True, with_mean=True, with_std=True)
# scaler = StandardScaler()
# X_train_scalar = scaler.fit_transform(X_train)  # compute mean, std and transform training data as well
# X_test_scalar = scaler.transform(X_test)
# def classification_LSVM(X_train, X_test, y_train, y_test):
#     svm  = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,intercept_scaling=1, loss='squared_hinge', max_iter=1200,multi_class='ovr', penalty='l2' ,random_state=0, tol=1e-05,verbose=0)
#     svm.fit(X_train, y_train)
#     y_pred = svm.predict(X_test)
#     acc = accuracy_score(y_test, y_pred)
#     return svm,acc

In [22]:
# Possible improvement: normalize the data to a common scale; KNN is also not well suited to high-dimensional data.
# Evaluation metric: accuracy.
# Limitations: KNN does not perform well on imbalanced data, and it is important for the features to have the same scale.

def classify_KNN(X_train, X_test, y_train, y_test):
    """Fit a 5-neighbour KNN classifier and measure its accuracy.

    The classifier is fitted on ``(X_train, y_train)`` and scored with
    ``accuracy_score`` on its predictions for ``X_test`` against ``y_test``.

    Returns
    -------
    tuple
        ``(fitted_classifier, accuracy)``.
    """
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, knn_model.predict(X_test))
    return knn_model, test_accuracy

In [23]:
# Train the KNN model and keep its test accuracy.
model_knn,acc_knn = classify_KNN(X_train, X_test, y_train, y_test)
# accuracy noted by the author for their run: 0.89

In [24]:
#model_log,acc_log = classifier_log(X_train, X_test, y_train, y_test)

In [25]:
#svm_model,acc_svm = classification_LSVM(X_train_scalar, X_test_scalar, y_train, y_test)

In [26]:
# def predict_log(log_reg, test , dict ):
#     test_list= []
#     tokens =nltk.word_tokenize(test)
#     test_list.append(tokens)
#     x = search_dict(test_list,dict)
#     sum_list  =  sum_method(x)
#     y_pred = log_reg.predict(sum_list)
#     return y_pred

In [27]:
# def predict_svm(svm, test , dict ):
#     test_list= []
#     tokens =nltk.word_tokenize(test)
#     test_list.append(tokens)
#     x = search_dict(test_list,dict)
#     sum_list  =  sum_method(x)
#     y_pred = svm.predict(sum_list)
#     return y_pred

In [28]:
def predict_KNN(KNN_model, test , dict ):
    """Predict the class of a single job-title string.

    The title goes through the same steps as the training data: punctuation
    is removed, the text is tokenized with NLTK, tokens are turned into word
    vectors with ``search_dict`` and summed with ``sum_method``.

    Parameters
    ----------
    KNN_model : object
        Fitted classifier exposing ``predict``.
    test : str
        Raw job title to classify.
    dict : mapping of str to list of float
        Word-vector lookup table handed to ``search_dict``.

    Returns
    -------
    The value returned by ``KNN_model.predict`` for the one-row feature list.
    """
    # The training titles had punctuation stripped before tokenizing; do the
    # same here so e.g. "accountant," is looked up as "accountant".
    cleaned = test.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(cleaned)
    # Wrap in a list: the helpers work on a batch of sentences.
    word_vectors = search_dict([tokens], dict)
    features = sum_method(word_vectors)
    return KNN_model.predict(features)

In [29]:
def output(prediction):
    """Print the industry name that corresponds to a predicted label.

    0 prints 'Accountancy', 1 prints 'Education', 2 prints 'IT'; any other
    value prints 'Marketing'.
    """
    for code, industry in ((0, 'Accountancy'), (1, 'Education'), (2, 'IT')):
        if prediction == code:
            print(industry)
            return
    print('Marketing')

In [31]:
# Ask for a job title, classify it with the trained KNN model and print the industry.
input_test = input('enter test= ')
prediction= predict_KNN(model_knn,input_test,glove_dict)
output(prediction)

enter test= trainee accountant
Accountancy
