# Experiments

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
import re

def clean_text(
    string: str,
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=('the', 'a', 'and', 'is', 'be', 'will')) -> str:
    """
    Normalise a raw text for tokenisation.

    Steps, in order: strip URLs, strip HTML-like tags, delete punctuation
    characters, lower-case, drop stop words, collapse whitespace.

    Parameters
    ----------
    string : str
        The text to clean.
    punctuations : str
        Characters to delete. They are removed outright (not replaced by a
        space), so "dev-ops" becomes "devops".
    stop_words : collection of str
        Lower-case words to drop after lower-casing. The default is an
        immutable tuple so it can never be mutated between calls.

    Returns
    -------
    str
        The cleaned text: single-space separated, no leading/trailing space.
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations in one pass (instead of one str.replace per
    # punctuation occurrence, which rescans the whole text every time).
    string = string.translate(str.maketrans('', '', punctuations))

    # Converting the text to lower
    string = string.lower()

    # Removing stop words. split() with no argument splits on any run of
    # whitespace and drops leading/trailing whitespace, so joining the kept
    # words with a single space also normalises the whitespace.
    return ' '.join(word for word in string.split() if word not in stop_words)

In [3]:
def create_embedding_matrix(tokenizer, max_features, vector_dimension, model_embed, word2vec):
    """
    Build the initial weight matrix for an embedding layer.

    Parameters
    ----------
    tokenizer :
        Object exposing ``word_index``, a mapping of word -> integer index.
    max_features : int
        Highest word index to include. The matrix has ``max_features + 1``
        rows; words with a larger index are skipped.
    vector_dimension : int
        Length of each embedding vector (number of matrix columns).
    model_embed :
        Source of the vectors. It is subscripted by word, either directly or
        through its ``.wv`` attribute (see ``word2vec``).
    word2vec : bool
        If truthy, vectors are read from ``model_embed.wv[word]``; otherwise
        from ``model_embed[word]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_features + 1, vector_dimension)``. Rows for
        words missing from the embedding source (and any index that no word
        maps to) stay all-zero.

    Raises
    ------
    Any error other than ``KeyError`` raised by the lookup or the row
    assignment (e.g. a vector of the wrong length, or a model object that
    does not support subscripting) is propagated instead of being silently
    turned into zero rows.
    """
    embedding_matrix = np.zeros((max_features + 1, vector_dimension))

    # Resolve the lookup object once instead of branching for every word.
    vectors = model_embed.wv if word2vec else model_embed

    for word, index in tokenizer.word_index.items():
        # Skip (rather than stop at) out-of-range indices so the result does
        # not depend on word_index being ordered by index.
        if index > max_features:
            continue
        try:
            embedding_matrix[index] = vectors[word]
        except KeyError:
            # Word has no pretrained vector: leave its row as zeros.
            continue
    return embedding_matrix

In [4]:
def string_to_tensor(string_list: list, tokenizer, max_len) -> list:
    """
    Encode texts as padded integer sequences for a deep learning model.

    Each text in ``string_list`` is converted with
    ``tokenizer.texts_to_sequences`` and the resulting sequences are passed to
    ``pad_sequences`` with ``maxlen=max_len``. The value returned is whatever
    ``pad_sequences`` produces.
    """
    token_id_sequences = tokenizer.texts_to_sequences(string_list)
    return pad_sequences(token_id_sequences, maxlen=max_len)

In [5]:
# Load the job-description data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
desc = pd.read_pickle("data/jobDescription4.pkl")

In [6]:
# Binary target: 1 when the posting's board is "web", 0 otherwise.
desc["labels"] = np.where(desc["board"] == "web", 1, 0)

# Shuffle and keep 500 rows. The shuffle is seeded explicitly because the
# global NumPy seed is only set in a later cell; without random_state this
# subset would differ on every run. .copy() makes the subset an independent
# frame so the column assignment below cannot trigger SettingWithCopyWarning.
desc = (
    desc[["Description_processed", "labels"]]
    .sample(frac=1, random_state=7)
    .head(500)
    .copy()
)

# Normalise the text column with the shared cleaning routine.
desc["Description_processed"] = desc["Description_processed"].apply(clean_text)

desc.to_pickle("experiment/exp.pkl")

In [7]:
from statistics import mean
# Dataset used by the cross-validation loop further down.
# NOTE(review): the previous cell writes experiment/exp.pkl, but this cell
# reads experiment/train.pkl; confirm train.pkl is produced elsewhere and that
# it is the intended input.
data = pd.read_pickle('experiment/train.pkl')
# Seeds NumPy's global random generator.
# NOTE(review): Keras/TensorFlow weight initialisation presumably uses its own
# generator and would need separate seeding for repeatable training; confirm.
np.random.seed(7)

In [26]:
from sklearn.model_selection import KFold
# 3-fold cross-validation splitter; no shuffle or random_state is passed.
# NOTE(review): StratifiedKFold is imported at the top of the notebook but not
# used; confirm whether stratifying on `labels` was intended.
kf = KFold(n_splits = 3)

In [44]:
import gensim

# Other embedding models, currently disabled (custom and pre-trained).
#c_word2vec = gensim.models.Word2Vec.load("models/custom/Word2Vec.model")
#c_fasttext = gensim.models.Word2Vec.load("models/custom/FastText.model")
#c_glove = gensim.models.KeyedVectors.load_word2vec_format("models/custom/GloVe.txt")
#glove = gensim.models.KeyedVectors.load_word2vec_format("models/pre-trained/wvglove.6B.100d.txt")


# Models whose file names indicate 300 dimensions; only the two custom gensim
# models below are actually loaded.
#glove300 = gensim.models.KeyedVectors.load_word2vec_format("models/pre-trained/wvglove.42B.300d.txt")
#c_glove300 = gensim.models.KeyedVectors.load_word2vec_format("models/custom/GloVe_300d.txt")
c_word2vec300 = gensim.models.Word2Vec.load("models/custom/Word2Vec_300d.model")
# NOTE(review): the FastText model is loaded through Word2Vec.load; confirm
# this yields the expected model type with the installed gensim version.
c_fasttext300 = gensim.models.Word2Vec.load("models/custom/FastText_300d.model")

# Embedding width handed to create_embedding_matrix and the model builders.
# Must equal the vector size of the loaded model (presumably 300, per the
# file names; TODO confirm).
emb_dim = 300

In [45]:
# Sanity check of the custom Word2Vec vectors: nearest neighbours of "python".
c_word2vec300.wv.most_similar("python")

[('pandas', 0.6743684411048889),
 ('pyspark', 0.6333119869232178),
 ('scala', 0.6253077983856201),
 ('flask', 0.6234632730484009),
 ('sqlalchemy', 0.6211127042770386),
 ('numpy', 0.6177347302436829),
 ('tensorflow', 0.61696457862854),
 ('keras', 0.6124801635742188),
 ('ggplot', 0.605608344078064),
 ('matlab', 0.5970077514648438)]

In [51]:
# Deep learning: 
from keras.models import Input, Model
from keras.layers import LSTM, Dense, Embedding, Dropout, GlobalAveragePooling1D, Bidirectional, InputLayer
from tensorflow import keras

def RnnModel(embedding_matrix, embedding_dim, max_len):
    """
    Build and compile a stacked bidirectional-LSTM binary classifier.

    The network takes integer sequences of length ``max_len``, embeds them
    with an Embedding layer initialised from ``embedding_matrix`` (one row per
    vocabulary index, ``embedding_dim`` columns), runs two bidirectional LSTM
    layers (256 units returning the full sequence, then 150 units), and ends
    with Dense(128, relu) -> Dropout(0.1) -> Dense(64, relu) ->
    Dense(1, sigmoid).

    The returned model is compiled with binary cross-entropy loss and the
    Adam optimizer; no metrics are registered.
    """
    vocab_rows = embedding_matrix.shape[0]

    token_ids = Input(shape=(max_len,))
    embedded = Embedding(vocab_rows, embedding_dim, weights=[embedding_matrix])(token_ids)

    # Recurrent encoder: the first layer keeps every timestep so the second
    # can consume the sequence and reduce it to a single vector.
    sequence_states = Bidirectional(LSTM(256, return_sequences=True))(embedded)
    encoded = Bidirectional(LSTM(150))(sequence_states)

    # Fully connected head ending in a single sigmoid unit.
    hidden = Dense(128, activation="relu")(encoded)
    hidden = Dropout(0.1)(hidden)
    hidden = Dense(64, activation="relu")(hidden)
    probability = Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam')
    return classifier

def SimpleModel(embedding_matrix, embedding_dim, max_len):
    """
    Build and compile a small averaged-embedding binary classifier.

    Layers, in order: an Embedding initialised from ``embedding_matrix``
    (``embedding_matrix.shape[0]`` rows, ``embedding_dim`` columns), global
    average pooling over the sequence axis, Dense(16, relu) and
    Dense(1, sigmoid).

    ``max_len`` is accepted so the signature matches ``RnnModel`` but is not
    used here. The returned model is compiled with binary cross-entropy loss
    and the Adam optimizer; no metrics are registered.
    """
    vocab_rows = embedding_matrix.shape[0]

    # Dropping `weights=` from the Embedding would train it from scratch
    # instead of starting from the supplied matrix.
    layer_stack = [
        keras.layers.Embedding(vocab_rows, embedding_dim, weights=[embedding_matrix]),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(1, activation="sigmoid"),
    ]

    classifier = keras.Sequential()
    for layer in layer_stack:
        classifier.add(layer)

    classifier.compile(loss='binary_crossentropy', optimizer='adam')
    return classifier

In [53]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score

# Per-fold accuracies. Only the disabled training/evaluation code in the
# string literal at the bottom of this cell appends to this list.
accuracies = []
for train_index, val_index in kf.split(data):
    training_data = data.iloc[train_index]
    validation_data = data.iloc[val_index]

    X_train = training_data['Description_processed'].tolist()
    Y_train = np.asarray(training_data['labels'].tolist())

    # Fit the vocabulary on the training fold only, so nothing from the
    # validation fold leaks into the tokenizer.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    # c_fasttext300 is a full gensim model (loaded with Word2Vec.load), so its
    # vectors are reached through `.wv`; pass word2vec=True. Subscripting the
    # model object itself is not supported by newer gensim releases, and that
    # failure would be invisible if the helper swallows lookup errors.
    emb_matrix = create_embedding_matrix(tokenizer, len(tokenizer.word_counts), emb_dim, c_fasttext300, True)
    print(emb_matrix.shape)

    # Pad every text to the longest training text, measured in
    # whitespace-separated words.
    max_len = np.max([len(text.split()) for text in X_train])
    X_train = string_to_tensor(X_train, tokenizer, max_len)

    model = SimpleModel(emb_matrix, emb_dim, max_len)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

# NOTE: the training/evaluation step below is disabled by wrapping it in a
# string literal. It is the last expression in the cell, so Jupyter echoes
# the string as the cell's output value.
"""
    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)

    batch_size = 32
    epochs = 15

    model.fit(
        X_train,
        Y_train, 
        batch_size=batch_size, 
        epochs=epochs,
        callbacks=[callback]
    )

    X_test = validation_data['Description_processed'].tolist()
    Y_test = np.asarray(validation_data['labels'].tolist())
    
    X_test = [clean_text(text) for text in X_test]
    X_test = string_to_tensor(X_test, tokenizer, max_len)
    yhat = [x[0] for x in model.predict(X_test).tolist()]

    acc = accuracy_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
    accuracies.append(acc)
    keras.backend.clear_session()

print(accuracies)
mean(accuracies)
"""

(32470, 300)
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 300)         9741000   
_________________________________________________________________
global_average_pooling1d_5 ( (None, 300)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                4816      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 17        
Total params: 9,745,833
Trainable params: 9,745,833
Non-trainable params: 0
_________________________________________________________________
None
(34152, 300)
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 300)         10

"\n    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)\n\n    batch_size = 32\n    epochs = 15\n\n    model.fit(\n        X_train,\n        Y_train, \n        batch_size=batch_size, \n        epochs=epochs,\n        callbacks=[callback]\n    )\n\n    X_test = validation_data['Description_processed'].tolist()\n    Y_test = np.asarray(validation_data['labels'].tolist())\n    \n    X_test = [clean_text(text) for text in X_test]\n    X_test = string_to_tensor(X_test, tokenizer, max_len)\n    yhat = [x[0] for x in model.predict(X_test).tolist()]\n\n    acc = accuracy_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])\n    accuracies.append(acc)\n    keras.backend.clear_session()\n\nprint(accuracies)\nmean(accuracies)\n"

In [54]:
# Summary of the model built in the last cross-validation fold; `model` is
# the variable left over from that loop.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 300)         10319100  
_________________________________________________________________
global_average_pooling1d_7 ( (None, 300)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 16)                4816      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 17        
Total params: 10,323,933
Trainable params: 10,323,933
Non-trainable params: 0
_________________________________________________________________
