<a href="https://colab.research.google.com/github/milazudina/ds4a_team36/blob/main/extract_skills.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import spacy
import random
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

nlp = spacy.load("en_core_web_sm")
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Preprocess Job Descriptions

In [None]:
# read in all the JPs that will need to have skills extracted
df_uk = pd.read_csv('df_UK_20211017_combined.csv')
df_uk["Dataset"] = "web_scraped_uk"
df_us = pd.read_csv('df_CA_20211017_combined.csv')
df_us["Dataset"] = "web_scraped_ca"
df_kaggle = pd.read_csv('consolidated_df_kaggle.csv')
df_kaggle["Dataset"] = "kaggle"

print(df_kaggle.columns)
print(df_uk.columns)
print(df_us.columns)

In [50]:
df_kaggle = df_kaggle.rename({'Job_title': 'Job_Title', 
                      'No_of_Stars': 'Company_Rating', 
                      'No_of_Reviews': 'Number_of_Reviews_of_the_Company',
                      'Company_Employees': 'Company_Size',
                      'Adjusted_Industry': 'Industry',
                      'Link': 'Job_URL',
                      'Queried_Salary': 'Salary'}, axis='columns')

df_kaggle["Region"] = "USA"
df_kaggle = df_kaggle.drop(columns=['Date_Since_Posted', 'Company_Industry', 'Skill', 'No_of_Skills'])
df_us = df_us.drop(columns=['Company_Founded_Year', 'Company_URL'])
df_uk = df_uk.drop(columns=['Company_Founded_Year', 'Company_URL'])

df = pd.concat([df_uk, df_us, df_kaggle], axis = 0, ignore_index=True)

In [None]:
job_description = df['Description'][0].lower().replace("\n", "") # replacing 'next line' with nothing (rather than space) works best
#print(job_description)

text = nlp(job_description) 

# we will extract nouns into this list
text_nouns = list()

# this outer loop cuts the job description text into sentences and extracts the nouns
for text in text.sents:
  
    sentence = nlp(text.string.strip())
    sentence_nouns = [chunk.text for chunk in sentence.noun_chunks]
    # this next loop is "cleaning" the noun phrases
    sentence_nouns_clean = list()
    for i in range(0,len(sentence_nouns)):
      # let's remove all the articles (a, an, the)
      temp2 = re.sub("^a ", "", sentence_nouns[i])
      temp3 = re.sub("^an ", "", temp2)
      temp4 = temp3.replace("the ", "").replace(")", "").replace("(", "").replace("e.g.", "eg").replace("£", "").replace("$", "").replace("%", "").replace("e g", "eg").replace(".", "").replace(",", "").replace(":", "").replace(";", "").replace("?", "").replace("*", "")
      # let's remove the numbers and %,£,$ and all of that
      temp4 = re.sub("[0-9]", "", temp4)
      print(temp4)
      # remove the stop words (It would be faster to remove it from the text directly, but I don't know if it will affect how it parses the text into noun phrases) 
      temp5 = [w for w in temp4.split(" ") if not w in STOPWORDS]
      temp5 = " ".join(temp5)
      #print(temp5)

      if len(temp5) != 0:
        sentence_nouns_clean = sentence_nouns_clean + temp5.split("blah") # this last split is just to convert the string to a list
        text_nouns = text_nouns + sentence_nouns_clean

print(text_nouns)
print(len(text_nouns))

In [None]:
# loop through all the job postings to extract all the nouns - for every noun put a number corresponding to where it was extracted from
# takes a couple of minutes to run

all_nouns = pd.DataFrame(columns=['Number', 'Region', 'Job_title', 'Noun'])

for i in range(0,df.shape[1]):

  job_description = df['Description'][i].lower().replace("\n", "") # I determined experimentally that replacing 'next line' with nothing (rather than space) works best

  ## Clean the text
  job_description = re.sub("-", " ", job_description)
  job_description = re.sub("/", " or ", job_description)
  job_description = job_description.lower().replace("\n", "").replace("</b>", "").replace("</p>", "").replace("<b>", "").replace("<p>", "").replace("</li>", "").replace("</ul>", "").replace("<li>", "").replace("<ul>", "").replace("<i>", "").replace("</i>", "").replace("\r", "").replace("<div>", "").replace("h2", "").replace("h3","").replace("h1","")
  job_description = job_description.replace("</h1>", "").replace("</div>", "").replace("/h2", "").replace("/h3","")
  job_description = re.sub("-", " ", job_description)
  job_description = re.sub("/", " ", job_description)
  job_description = re.sub(" a ", " ", job_description)
  job_description = re.sub(" an ", " ", job_description)
  job_description = re.sub("[0-9]", "", job_description)
  job_description = job_description.replace(" the ", " ").replace(")", "").replace("(", "").replace("e.g.", "").replace("£", "").replace("$", "").replace("%", "").replace("e g", "").replace(".", " ").replace(",", " ").replace(":", "").replace(";", "").replace("?", "").replace("*", "").replace(" eg ", "").replace(">", "").replace("<", "")
  
  text = nlp(job_description)  

  text_nouns = list()
  for text in text.sents:
    sentence = text.string.strip()
    sentence = nlp(sentence)
    sentence_nouns = [chunk.text for chunk in sentence.noun_chunks]  
    sentence_nouns_clean = list()
    for j in range(0,len(sentence_nouns)):
      # let's remove all the articles (a, an, the)
      temp2 = re.sub("^a ", "", sentence_nouns[j])
      temp3 = re.sub("^an ", "", temp2)
      temp4 = temp3.replace("the ", "").replace(")", "").replace("(", "").replace("e.g.", "").replace("£", "").replace("$", "").replace("%", "").replace("e g", "").replace(".", "").replace(",", "").replace(":", "").replace(";", "").replace("?", "").replace("*", "").replace("eg", "")
      # let's remove the numbers and %,£,$ and all of that
      temp4 = re.sub("[0-9]", "", temp4)
      #print(temp4)
      # remove the stop words (It would be faster to remove it from the text directly, but I don't know if it will affect how it parses the text into noun phrases) 
      temp5 = [w for w in temp4.split(" ") if not w in STOPWORDS]
      temp5 = " ".join(temp5)
      #print(temp5)

      if len(temp5) != 0:
        sentence_nouns_clean = sentence_nouns_clean + temp5.split("blah") # this last split is just to convert the string to a list
        text_nouns = text_nouns + sentence_nouns_clean

  nouns = pd.DataFrame({'Noun':text_nouns})
  number = [i] * len(text_nouns)
  number = pd.DataFrame({'Number':number})
  region = [df['Region'][i]] * len(text_nouns)
  region = pd.DataFrame({'Region':region})
  job_title = [df['Job_title'][i]] * len(text_nouns)
  job_title = pd.DataFrame({'Job_title':job_title})

  temp = pd.concat([number, region, job_title, nouns], axis=1)
  #print(type(all_nouns))
  #print(type(temp))

  all_nouns_uk = pd.concat([all_nouns_uk, temp], axis = 0)


In [68]:
# loop through all the job postings to extract all the nouns - for every noun put a number corresponding to where it was extracted from
# takes a couple of minutes to run

all_nouns = pd.DataFrame(columns=['Number', 'Region', 'Job_Title', 'Noun'])

for i in range(0, df.shape[1]):

  job_description = df['Description'][i].lower().replace("\n", "") # I determined experimentally that replacing 'next line' with nothing (rather than space) works best

  ## Clean the text
  job_description = re.sub("’re", " are ", job_description)
  job_descriptionxt = re.sub("’s", " ", job_description)
  job_descriptionxt = re.sub("what’s", "what is ", job_description)
  job_description = re.sub("’ve", " have ", job_description)
  job_description = re.sub("n’t", " not ", job_description)
  job_description = re.sub("i’m", "i am ", job_description)
  job_description = re.sub("’d", " would ", job_description)
  job_description = re.sub("’ll", " will ", job_description)
  job_description = re.sub("-", " ", job_description)
  job_description = re.sub("/", " or ", job_description)
  
  text = nlp(job_description)  

  text_nouns = list()
  for text in text.sents:
    sentence = text.string.strip()
    sentence = nlp(sentence)
    sentence_nouns = [chunk.text for chunk in sentence.noun_chunks]  
    sentence_nouns_clean = list()
    for j in range(0,len(sentence_nouns)):
      # let's remove all the articles (a, an, the)
      temp2 = re.sub("^a ", "", sentence_nouns[j])
      temp3 = re.sub("^an ", "", temp2)
      temp4 = temp3.replace("the ", "").replace(")", "").replace("(", "").replace("e.g.", "").replace("£", "").replace("$", "").replace("%", "").replace("e g", "").replace(".", "").replace(",", "").replace(":", "").replace(";", "").replace("?", "").replace("*", "").replace("eg", "")
      # let's remove the numbers and %,£,$ and all of that
      temp4 = re.sub("[0-9]", "", temp4)
      #print(temp4)
      # remove the stop words (It would be faster to remove it from the text directly, but I don't know if it will affect how it parses the text into noun phrases) 
      temp5 = [w for w in temp4.split(" ") if not w in STOPWORDS]
      temp5 = " ".join(temp5)
      #print(temp5)

      if len(temp5) != 0:
        sentence_nouns_clean = sentence_nouns_clean + temp5.split("blah") # this last split is just to convert the string to a list
        text_nouns = text_nouns + sentence_nouns_clean

  nouns = pd.array(text_nouns)
  nouns = nouns.unique()
  nouns = pd.DataFrame({'Noun':nouns})
  number = [i] * len(text_nouns)
  number = pd.DataFrame({'Number':number})
  region = [df['Region'][i]] * len(text_nouns)
  region = pd.DataFrame({'Region':region})
  job_title = [df['Job_Title'][i]] * len(text_nouns)
  job_title = pd.DataFrame({'Job_Title':job_title})

  temp = pd.concat([number, region, job_title, nouns], axis=1)
  #print(type(all_nouns))
  #print(type(temp))

  all_nouns = pd.concat([all_nouns, temp], axis = 0)


In [None]:
all_nouns_unique = all_nouns.drop_duplicates(subset="Noun")
patterns_to_exclude = ['@','www'] 
pattern = '|'.join(patterns_to_exclude)
all_nouns_unique = all_nouns_unique[~all_nouns_unique["Noun"].str.contains(pattern)]
print(all_nouns_unique.shape)
#all_nouns_unique.tail(30)

In [None]:
all_nouns_unique.to_csv("all_nouns_unique_20211009.csv")
all_nouns.to_csv("all_nouns_20211009.csv")

In [None]:
skill_list = pd.read_csv("df_Elroy_skill_count.csv", index_col = None, header = 0)
type(skill_list["Skill"])
#all_kaggle_skills = df_kaggle_skills['Skill'].tolist()
skill_list["skill_lowerkey"] = [x.lower() for x in skill_list["Skill"]]
skill_list[0:10]

Unnamed: 0.1,Unnamed: 0,Skill,Count,skill_lowerkey
0,0,SAP,112,sap
1,1,SQL,3104,sql
2,2,MachineLearning,2297,machinelearning
3,3,R,2234,r
4,4,SAS,941,sas
5,5,Python,3325,python
6,6,DataMining,1059,datamining
7,7,DataManagement,121,datamanagement
8,8,STATA,90,stata
9,9,SPSS,278,spss


In [None]:
all_nouns_unique.head(10)
all_nouns_unique["noun_lowercase"] = [x.replace(' ', '') for x in all_nouns_unique["Noun"]]
all_nouns_unique[0:10]

In [None]:
all_nouns_unique["Skill"] = all_nouns_unique['noun_lowercase'].isin(skill_list["skill_lowerkey"].tolist())
all_nouns_unique.loc[all_nouns_unique["Skill"] == True, "Skill"] = 1
all_nouns_unique.loc[all_nouns_unique["Skill"] == False, "Skill"] = 0
all_nouns_unique.head(10)
all_nouns_unique.to_csv("all_nouns_labeled_20211009.csv")

In [None]:
# deprecated
#count = 0
#counter = 0
#for i in training_set_ids:
#  count = training_set[training_set['Number'] == i].count(0)["Number"] + count
#  print(counter, " ", count)
#  counter = counter + 1

In [None]:
#training_set.loc[training_set['Number'].isin(training_set_ids[:3])].to_csv("nouns_training_set_pt1.csv", index = False)
#training_set.loc[training_set['Number'].isin(training_set_ids[3:9])].to_csv("nouns_training_set_pt2.csv", index = False)
#training_set.loc[training_set['Number'].isin(training_set_ids[9:15])].to_csv("nouns_training_set_pt3.csv", index = False)
#training_set.loc[training_set['Number'].isin(training_set_ids[15:20])].to_csv("nouns_training_set_pt4.csv", index = False)


In [None]:
#print(training_set.loc[training_set['Number'].isin(training_set_ids[:3])].shape)
#print(len(training_set.loc[training_set['Number'].isin(training_set_ids[3:9])]))
#print(len(training_set.loc[training_set['Number'].isin(training_set_ids[9:15])]))
#print(len(training_set.loc[training_set['Number'].isin(training_set_ids[15:20])]))

# LSTM + Embedding


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
import tensorflow as tf
from tensorflow import keras

import pandas as pd
import seaborn as sns
import numpy as np
import re

In [None]:
# first let's get the labelled dataset in
# load all 4 csvs
#pt1 = pd.read_csv("/content/nouns_training_set_pt1_labeled.csv")
#pt2 = pd.read_csv("nouns_training_set_pt2_labeled.csv")
#pt3 = pd.read_csv("nouns_training_set_pt3_labeled.csv")
#pt4 = pd.read_csv("nouns_training_set_pt4_labeled.csv")

In [None]:
# one of the files has a slightly different format, let's make it same as others
#pt3 = pt3.rename({'label': 'Label', 'confidence': 'Confidence'}, axis='columns')
#pt3 = pt3.iloc[:,0:6]
#df = pd.concat([pt1, pt2, pt3, pt4], axis = 0, ignore_index=True)

# just a general check
#sns.countplot(x='Label',data=df)

all_nouns_labeled = pd.read_csv("all_nouns_labeled_20211009.csv")

training_set = all_nouns_labeled[0:2000]
training_set['labeled_skill'].value_counts()

0    1683
1     249
2      43
3      25
Name: labeled_skill, dtype: int64

In [None]:
# split into input (X) and output (y) - this will be later split into the testing and training set
all_nouns = np.asarray(all_nouns_labeled.loc[:,"Noun"])
y = all_nouns_labeled.iloc[0:2000,6] # labels

# to start with, I will only use 1s and 0s
y = y.replace({2:1, 3:1})
print(y.value_counts())
y = y.tolist()

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(all_nouns)
vocab_size = len(t.word_index)+1
print(vocab_size)

# integer encode the documents
encoded_all_nouns = t.texts_to_sequences(all_nouns)
print(len(encoded_all_nouns))

list_len = [len(i) for i in encoded_all_nouns]
print(max(list_len))
#print(np.argmax(np.array(list_len)))

# pad documents to a max length
max_length = max(list_len)
padded_all_nouns = pad_sequences(encoded_all_nouns, maxlen=max_length, padding = 'post')


0    1683
1     317
Name: labeled_skill, dtype: int64
11703
48765
21


In [None]:
def make_dummy_var(y):
  temp = np.zeros([len(y), 2])
  for i in range(0,len(y)):
    if y[i] == 0:
      temp[i,0] = 1
      temp[i,1] = 0
    elif y[i] == 1:
      temp[i,0] = 0
      temp[i,1] = 1
  return temp

y_onehot = make_dummy_var(y)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_all_nouns[0:len(y_onehot)], y_onehot, test_size=0.33)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1340, 21)
(660, 21)
(1340, 2)
(660, 2)


In [None]:
# class imbalance issue
print(1-len(y_train[y_train[:,0] == 1])/(len(y_train[y_train[:,0] == 0])+len(y_train[y_train[:,0] == 1])))
print(1-len(y_test[y_test[:,0] == 1])/(len(y_test[y_test[:,0] == 0])+len(y_test[y_test[:,0] == 1])))

0.1649253731343283
0.1454545454545455


In [None]:
embeddings_index = dict()
# can download from https://nlp.stanford.edu/projects/glove/
f = open('glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [None]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
count = 0
for word, i in t.word_index.items():
	#print(word)
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		count = count + 1
		embedding_matrix[i] = embedding_vector
print(count)
# so about 1,000 words is not found

10014


In [None]:
glove_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)

In [None]:
model = Sequential()
model.add(glove_layer)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(256))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softplus'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

# add learning rate parameter

# fit the model
model.fit(X_train, y_train, epochs=50, verbose=1, batch_size=32)

# evaluate the model
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('Test accuracy: %f' % (test_accuracy*100))

# first iteration (before a thourough QC) 
# Accuracy: 97.571427
# Test accuracy: 80.794168

In [None]:
predicted_y = model.predict(padded_all_nouns)

In [None]:
# this will show the predicted and manual labels for the training set 
predicted_class=np.argmax(predicted_y[0:len(y_onehot)],axis=1)

test_nouns_with_labels = np.vstack((np.asarray(all_nouns[0:len(y_onehot)]), predicted_class, y_onehot[:,1]))

predicted_vs_labeled = pd.DataFrame(data=test_nouns_with_labels.transpose(),
                  columns=["Noun", "Predicted label", "Manual label"])



In [None]:
predicted_vs_labeled["Predicted label"].value_counts()

0    1667
1     333
Name: Predicted label, dtype: int64

In [None]:
print(predicted_vs_labeled[480:500])

                                     Noun Predicted label Manual label
480                       data scientists               0            0
481                           data assets               0            0
482        truly data driven organisation               0            0
483                 successful candidates               0            0
484                   actionable insights               0            0
485              global data architecture               0            0
486              local business processes               0            0
487                             reporting               0            0
488            business data architecture               1            1
489                             pipelines               1            1
490                   financial reporting               1            1
491                       ad hoc analysis               0            1
492                       critical issues               0            0
493   

In [None]:
# this will show the predicted labels for the rest of them
predicted_class=np.argmax(predicted_y,axis=1)

test_nouns_with_labels = np.vstack((np.asarray(all_nouns_unique), predicted_class))

predicted = pd.DataFrame(data=test_nouns_with_labels.transpose(),
                  columns=["Noun", "Predicted label"])

predicted["Predicted label"].value_counts()
#print(predicted[550:600])


9        chosen dsp digital marketing operations
17                        technical capabilities
18                               data extraction
20                                   soft skills
22                                       holborn
                          ...                   
46741                     disparate data systems
46743                       strong data analysis
46744                 scientific thinking skills
46745       data exploration visualization tools
46746                       warehousing concepts
Name: Noun, Length: 9200, dtype: object


In [None]:
len(all_nouns_uk)

1082776

In [None]:
all_nouns_labeled.head(10)

Unnamed: 0.1,Unnamed: 0,Number,Region,Job_title,Noun,noun_lowercase,labeled_skill
0,0,0,UK,Machine Learning Associate,machine learning associate,machinelearningassociate,0
1,2,0,UK,Machine Learning Associate,central technology group,centraltechnologygroup,0
2,5,0,UK,Machine Learning Associate,boundaries,boundaries,0
3,9,0,UK,Machine Learning Associate,games,games,0
4,14,0,UK,Machine Learning Associate,artificial intelligence,artificialintelligence,1
5,20,0,UK,Machine Learning Associate,processes,processes,0
6,27,0,UK,Machine Learning Associate,problems,problems,0
7,35,0,UK,Machine Learning Associate,solutions,solutions,0
8,44,0,UK,Machine Learning Associate,machine learning challenges,machinelearningchallenges,0
9,54,0,UK,Machine Learning Associate,players,players,0


In [None]:
# extract the skills from all of them
#predicted_class=np.argmax(predicted_y,axis=1)

#test_nouns_with_labels = np.vstack((np.asarray(all_nouns), predicted_class))

#predicted = pd.DataFrame(data=test_nouns_with_labels.transpose(), columns=["Noun", "Predicted label"])

skills = predicted.loc[predicted["Predicted label"] == 1, "Noun"]
print(skills)

skills = skills.to_list() + all_nouns_labeled.loc[all_nouns_labeled["labeled_skill"] == 1, "Noun"].to_list()
##skills = skills.dropna()

print(skills)

pd.DataFrame(skills).to_csv("skills_2021-10-10.csv")

4                     artificial intelligence
23              machine learning applications
31                                 automation
32                            computer vision
34                          generative models
                         ...                 
48741                  disparate data systems
48743                    strong data analysis
48744              scientific thinking skills
48745    data exploration visualization tools
48746                    warehousing concepts
Name: Noun, Length: 9533, dtype: object


In [None]:
# model 0 - before the nouns for prediction were added to embedding - works better

all_nouns = np.asarray(all_nouns_labeled.iloc[0:2000,4])
y = all_nouns_labeled.iloc[0:2000,6] # labels

# to start with, I will only use 1s and 0s
y = y.replace({2:1, 3:1})
print(y.value_counts())
y = y.tolist()

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(all_nouns)
vocab_size = len(t.word_index)+1
print(vocab_size)

# integer encode the documents
encoded_all_nouns = t.texts_to_sequences(all_nouns)
print(len(encoded_all_nouns))

list_len = [len(i) for i in encoded_all_nouns]
print(max(list_len))
#print(np.argmax(np.array(list_len)))

# pad documents to a max length
max_length = max(list_len)
padded_all_nouns = pad_sequences(encoded_all_nouns, maxlen=max_length, padding = 'post')

X_train, X_test, y_train, y_test = train_test_split(padded_all_nouns[0:len(y_onehot)], y_onehot, test_size=0.33)


0    1683
1     317
Name: labeled_skill, dtype: int64
1629
2000
11


In [None]:
# create a weight matrix for words in training docs
embedding_matrix_training_set = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix_training_set[i] = embedding_vector

In [None]:
glove_layer_training = Embedding(vocab_size, 100, weights=[embedding_matrix_training_set], input_length=max_length, trainable=False)

In [None]:
model0 = Sequential()
model0.add(glove_layer_training)
#model0.add(SpatialDropout1D(0.2))
model0.add(LSTM(256))
model0.add(Dense(128, activation='relu'))
model0.add(Dense(64, activation='relu'))
model0.add(Dense(32, activation='relu'))
model0.add(Dense(2, activation='softplus'))

optimizer = keras.optimizers.Adam(lr=0.0001)

model0.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
print(model0.summary())

# fit the model
model0.fit(X_train, y_train, epochs=50, verbose=0)

# evaluate the model
loss, accuracy = model0.evaluate(X_train, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

test_loss, test_accuracy = model0.evaluate(X_test, y_test)
print('Test accuracy: %f' % (test_accuracy*100))

# Accuracy: 94.571429
# Test accuracy: 81.442463
# but bear in mind that by random it would be 0.83 and 0.76 respectively

In [None]:
#predicted_y = model0.predict(testing_X)

predicted_class=np.argmax(predicted_y,axis=1)

print(predicted_y.shape)
print(np.asarray(X[700:len(y)]).shape)
print(predicted_y)
print(np.asarray(X[700:len(y)]))
print(predicted_class.shape)
print(predicted_class)

test_nouns_with_labels = np.vstack((np.asarray(X[700:len(y)]), predicted_class, testing_y))

(1234, 2)
(1234,)
[[9.0182149e-01 1.1743873e-01]
 [1.5652179e+00 5.2744355e-03]
 [2.9110932e+00 2.5139281e-05]
 ...
 [2.6329508e+00 6.3010040e-05]
 [3.1241448e+00 8.5519760e-06]
 [2.9847383e+00 1.6128277e-05]]
['dependencies' 'common elements' 'principal responsibilities' ...
 'sponsorship' 'tier 2 general visas' 'only candidates']
(1234,)
[0 0 0 ... 0 0 0]


In [None]:
predicted_vs_labeled = pd.DataFrame(data=test_nouns_with_labels.transpose(),
                  columns=["Noun", "Predicted", "Labeled manually"])

predicted_vs_labeled[30:40]



```
# This is formatted as code
```

# Make a dataframe of Job postings & skills

['Job_Title', 'Link', 'Queried_Salary', 'Job_Type', 'Skill', 'No_of_Skills', 'Company', No_of_Reviews', 'No_of_Stars', 'Date_Since_Posted', 'Description', 'Location', ‘Company_Revenue', 'Company_Employees', 'Company_Industry']

In [None]:
df_uk = pd.read_csv("df_UK_20211004_combined.csv")

df_uk = df_uk.rename({'Job_title': 'Job_Title', 
                      'Company_Rating': 'No_of_Stars', 
                      'Number_of_Reviews_of_the_Company': 'No_of_Reviews',
                      'Region': 'Location',
                      'Company_Size': 'Company_Employees',
                      'Industry': 'Company_Industry',
                      'Job_URL': 'Link',
                      'Salary': 'Queried_Salary'}, axis='columns')

df_uk.head(10)

In [None]:
# read in skills file
skills = pd.read_csv("skills_2021-10-10-2.csv")
skills.head(10)

unique_skills = skills.iloc[:,1].unique()
# read in all nouns file
unique_skills[0:20]

array(['artificial intelligence', 'machine learning applications',
       'automation', 'computer vision', 'generative models',
       'data collection', 'latest research data techniques',
       'good mathematics skills', 'linear algebra',
       'numerical optimization', 'machine learning tools', 'c++',
       'python', 'cig', 'analytics', 'data science solutions',
       'returners', 'mentorship', 'presentations', 'desk'], dtype=object)

In [None]:
for i in range(0,df_uk.shape[0]):


  # for the number of job postings in the UK

  nouns_from_job_description = all_nouns_uk.loc[all_nouns_uk["Number"] == i, "Noun"]
  extracted_skills = nouns_from_job_description.loc[nouns_from_job_description.isin(skills).values == True]
  extracted_skills_unique = extracted_skills.unique()
  #print(extracted_skills_unique)
  df_uk["Skill"][i] = extracted_skills_unique


In [None]:
df_uk.tail(10)

In [None]:
df_uk.to_csv("df_UK_2021-10-04_with_skills.csv")

In [None]:
df_us = pd.read_csv("df_CA_20211004_combined.csv")

df_us = df_us.rename({'Job_title': 'Job_Title', 
                      'Company_Rating': 'No_of_Stars', 
                      'Number_of_Reviews_of_the_Company': 'No_of_Reviews',
                      'Region': 'Location',
                      'Company_Size': 'Company_Employees',
                      'Industry': 'Company_Industry',
                      'Job_URL': 'Link',
                      'Salary': 'Queried_Salary'}, axis='columns')

df_us.head(10)

In [None]:
all_nouns_us.head(10)

In [None]:
df_us["Skill"] = 0
df_us.head(10)

In [None]:
for i in range(0,df_us.shape[0]):

  # for the number of job postings in California

  nouns_from_job_description = all_nouns_us.loc[all_nouns_us["Number"] == i, "Noun"]
  #print(nouns_from_job_description)
  extracted_skills = nouns_from_job_description.loc[nouns_from_job_description.isin(unique_skills).values == True]
  extracted_skills_unique = extracted_skills.unique()
  #print(extracted_skills_unique)
  df_us["Skill"][i] = extracted_skills_unique

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
df_us.to_csv("df_CA_2021-10-04_with_skills.csv")

In [None]:
df_us.head(10)

Unnamed: 0,Region,Job_title,Company,Company_Rating,Number_of_Reviews_of_the_Company,Company_Founded_Year,Company_Size,Company_Revenue,Industry,Location,Salary,Contract_Type,Description,Job_URL,Company_URL,Skill
0,California,"Data Scientist, Product Analytics - Notificati...",Facebook,4.1,632.0,2004.0,"more than 10,000",more than $10B (USD),Information Technology,"Sunnyvale, CA 94089",,,"Facebook's Growth team is a fast-paced, analyt...",https://www.indeed.com/rc/clk?jk=ba17f201153c5...,https://www.indeed.com/cmp/Facebook?campaignid...,"[quantitative analysis, data mining, presentat..."
1,California,Data Engineer,Visa,3.9,1005.0,1958.0,"more than 10,000",more than $10B (USD),Financial Services,"Foster City, CA",,,Company Description\n \nAs the world's leader ...,https://www.indeed.com/rc/clk?jk=18931c98b78d4...,https://www.indeed.com/cmp/Visa?campaignid=mob...,"[advanced global processing network, network, ..."
2,California,Senior Machine Learning Scientist,Atomwise,,,,,,,"San Francisco, CA",,,\n\n \nAtomwise is the leading artificial inte...,https://www.indeed.com/rc/clk?jk=da81deb6ca6fe...,,"[ phd, computational chemistry, generative mod..."
3,California,"Data Engineer/Scientist, Autonomous Systems",Apple,4.2,10451.0,1976.0,"more than 10,000",more than $10B (USD),Manufacturing,"Santa Clara Valley, CA 95014",,,"Summary\n \nPosted: \n Sep 23, 2021\n \nRole N...",https://www.indeed.com/rc/clk?jk=c39d2bae2d0ec...,https://www.indeed.com/cmp/Apple?campaignid=mo...,"[summary, software engineering, python, data s..."
4,California,Manager Data Analytics and Management,Amgen,4.0,1613.0,1980.0,"more than 10,000",more than $10B (USD),Health Care,"Thousand Oaks, CA",,,Career Category\n\n Supply Chain\n\n Job Descr...,https://www.indeed.com/rc/clk?jk=c6a808e406d55...,https://www.indeed.com/cmp/Amgen?campaignid=mo...,"[machine learning models, large datasets, anal..."
5,California,"Analyst/Data Scientist, Monetization Strategy",Salesforce,4.3,780.0,1999.0,"more than 10,000",more than $10B (USD),Information Technology,"San Francisco, CA 94105",,,"To get the best candidate experience, please c...",https://www.indeed.com/rc/clk?jk=2e98f5123c37e...,https://www.indeed.com/cmp/Salesforce?campaign...,"[product marketing, quantitative models, model..."
6,California,Principal Data Scientist,Northrop Grumman,4.0,6070.0,1939.0,"more than 10,000",more than $10B (USD),Aerospace & Defense,"Palmdale, CA 93550","$111,400 - $167,000 a year",Full-time,US CITIZENSHIP REQUIRED FOR THIS POSITION: Yes...,https://www.indeed.com/rc/clk?jk=fd2d686f645d8...,https://www.indeed.com/cmp/Northrop-Grumman?ca...,"[models, large data sets, data collection, cre..."
7,California,Sr. Decision Scientist,Sony Interactive Entertainment PlayStation,3.7,133.0,1994.0,"5,001 to 10,000",,Information Technology,"San Diego, CA",,,PlayStation isn't just the Best Place to Play ...,https://www.indeed.com/rc/clk?jk=06a362bcae0bb...,https://www.indeed.com/cmp/Playstation?campaig...,"[data science, data engineering, data driven s..."
8,California,Data Scientist (Marketing),Mattel,4.0,832.0,1945.0,"5,001 to 10,000",$1B to $5B (USD),Manufacturing,"El Segundo, CA 90245",,,CREATIVITY IS OUR SUPERPOWER.\n It’s our heri...,https://www.indeed.com/rc/clk?jk=7b7effd3e21f3...,"https://www.indeed.com/cmp/Mattel,-Inc.?campai...","[statistical analysis, catalog, statistical mo..."
9,California,"Senior Full Stack Engineer, Machine Learning P...",Albertsons Companies,3.4,136.0,,,,Retail & Wholesale,"Pleasanton, CA 94588",,,Albertsons Companies is one of the largest foo...,https://www.indeed.com/rc/clk?jk=5c43ad365da7a...,https://www.indeed.com/cmp/Albertsons-Companie...,"[art machine learning models, merchandising, m..."
