In [1]:
from google.colab import files
uploaded = files.upload()

Saving prj3_cleaned_data.csv to prj3_cleaned_data.csv


In [2]:
import numpy as np
import pandas as pd

data = pd.read_csv("prj3_cleaned_data.csv")
data.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 ct bauer college of business graduate mag...,houston texas,85,
1,2,native english teacher at epik english program...,kanada,500+,
2,3,aspiring human resources professional,raleigh-durham north carolina area,44,
3,4,people development coordinator at ryan,denton texas,500+,
4,5,advisory board member at celal bayar university,i̇zmir türkiye,500+,


# [Bag-of-Words Model](https://spotintelligence.com/2022/12/20/bag-of-words-python/)


### Extract words

In [3]:
# joining all excisting columns
data_concat = pd.DataFrame()
data_concat["data_concat"] = data[["job_title", "location", "connection"]].apply(" ".join, axis=1)

data_concat.head()

Unnamed: 0,data_concat
0,2019 ct bauer college of business graduate mag...
1,native english teacher at epik english program...
2,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...
4,advisory board member at celal bayar universit...


In [4]:
# making dataframe into a list of srings
data_concat_list = data_concat["data_concat"]
data_concat_list = data_concat_list.tolist()
print(data_concat_list)

['2019 ct bauer college of business graduate magna cum laude and aspiring human resources professional houston texas 85', 'native english teacher at epik english program in korea kanada 500+ ', 'aspiring human resources professional raleigh-durham north carolina area 44', 'people development coordinator at ryan denton texas 500+ ', 'advisory board member at celal bayar university i̇zmir türkiye 500+ ', 'aspiring human resources specialist greater new york city area 1', 'student at humber college and aspiring human resources generalist kanada 61', 'hr senior specialist san francisco bay area 500+ ', 'student at humber college and aspiring human resources generalist kanada 61', 'seeking human resources hris and generalist positions greater philadelphia area 500+ ', 'student at chapman university lake forest california 2', 'svp chro marketing & communications csr officer  engie  houston  the woodlands  energy  gphr  sphr houston texas area 500+ ', 'human resources coordinator at intercont

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_1 = CountVectorizer(ngram_range=(1,3), max_features=200, stop_words='english')
vectorizer_1.fit(data_concat_list)

print(sorted(vectorizer_1.vocabulary_))

['2019', '2019 ct', '2019 ct bauer', '44', '500', '61', '85', 'area', 'area 44', 'area 500', 'aspiring', 'aspiring human', 'aspiring human resources', 'atlanta', 'bauer', 'bauer college', 'bauer college business', 'bay', 'bay area', 'bay area 500', 'business', 'business graduate', 'business graduate magna', 'california', 'carolina', 'carolina area', 'carolina area 44', 'city', 'city area', 'college', 'college aspiring', 'college aspiring human', 'college business', 'college business graduate', 'communications', 'communications csr', 'communications csr officer', 'coordinator', 'coordinator intercontinental', 'coordinator intercontinental buckhead', 'coordinator ryan', 'coordinator ryan denton', 'ct', 'ct bauer', 'ct bauer college', 'cum', 'cum laude', 'cum laude aspiring', 'denton', 'denton texas', 'denton texas 500', 'development', 'development coordinator', 'development coordinator ryan', 'durham', 'durham north', 'durham north carolina', 'energy gphr', 'energy gphr sphr', 'engie', '

### Score words and create vector

In [6]:
vector_1 = vectorizer_1.transform(data_concat_list)
vector_bow = vector_1.toarray()
print(vector_bow.shape)
print(vector_bow)

(104, 200)
[[1 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


###Labeling Data

In [7]:
# list to store indices of matching job titles
matching_indices = []

# patterns to search for
patterns = ["aspiring human resources", "seeking human resources"]

# iterate through each job title
for idx, title in enumerate(data_concat_list):
    if any(pattern in title.lower() for pattern in patterns):
        matching_indices.append(idx)

# Get the rows with matching job titles
matching_profiles = data.iloc[matching_indices]

print(matching_profiles)

     id                                          job_title  \
0     1  2019 ct bauer college of business graduate mag...   
2     3              aspiring human resources professional   
5     6                aspiring human resources specialist   
6     7  student at humber college and aspiring human r...   
8     9  student at humber college and aspiring human r...   
9    10  seeking human resources hris and generalist po...   
13   14  2019 ct bauer college of business graduate mag...   
14   15  2019 ct bauer college of business graduate mag...   
16   17              aspiring human resources professional   
18   19  2019 ct bauer college of business graduate mag...   
20   21              aspiring human resources professional   
23   24                aspiring human resources specialist   
24   25  student at humber college and aspiring human r...   
26   27  aspiring human resources management student se...   
27   28              seeking human resources opportunities   
28   29 

In [8]:
print(matching_profiles.shape)

(43, 5)


In [9]:
data.iloc[[matching_indices],[4]] = 1
data['fit'] = data['fit'].fillna(0)
print(data['fit'])
print(data['fit'].value_counts())

0      1.0
1      0.0
2      1.0
3      0.0
4      0.0
      ... 
99     1.0
100    0.0
101    0.0
102    0.0
103    0.0
Name: fit, Length: 104, dtype: float64
0.0    61
1.0    43
Name: fit, dtype: int64


In [10]:
# transforming target data to array
target = data['fit']
target.to_numpy()

target_array = target.values.reshape(-1, 1)

In [11]:
# putting the two arrays together
complete_array_bow = np.append(vector_bow, target_array, axis=1)

print(complete_array_bow)
print(complete_array_bow.shape)

[[1. 1. 1. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(104, 201)


###Splitting the Data

In [12]:
from sklearn.model_selection import train_test_split
X = np.delete(complete_array_bow, [200], axis=1)
y = complete_array_bow[:, [200]]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=0
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=0
)

In [None]:
print("y_val: ",y_val)
print("y_test: ",y_test)

##Classification Models

###Logistic Regression Model

####Model Training

In [14]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr_model = LogisticRegression(random_state=0)
cv_scores_lr = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='accuracy')
lr_model.fit(X_train, y_train)

print("Cross Validation Scores: ", cv_scores_lr)

Cross Validation Scores:  [0.93333333 1.         1.         0.85714286 1.        ]


####Hyperparameter Tuning

In [15]:
from sklearn.model_selection import GridSearchCV

solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1', 'l2', 'elasticnet']
penalty.append(None)
C = [10, 1, 0.1, 0.01]
random_state = [0]

param_grid_lr = {'solver': solver,
                 'penalty': penalty,
                 'C': C,
                 'random_state': random_state}
base_model_lr = LogisticRegression()
grid_search_lr = GridSearchCV(estimator=base_model_lr, param_grid=param_grid_lr, cv=5, scoring='accuracy')

grid_search_lr.fit(X_train, y_train)
best_params_lr = grid_search_lr.best_params_
print("best parameters: ",best_params_lr)

best parameters:  {'C': 1, 'penalty': 'l1', 'random_state': 0, 'solver': 'liblinear'}


####Evaluating Model on Validation Set

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


tuned_model_lr = LogisticRegression(**best_params_lr)
cv_scores_tuned_lr = cross_val_score(tuned_model_lr, X_train, y_train, cv=5, scoring='accuracy')
tuned_model_lr.fit(X_train, y_train)
y_pred_lr_val = tuned_model_lr.predict(X_val)

accuracy_lr_val = accuracy_score(y_val, y_pred_lr_val)
precision_lr_val = precision_score(y_val, y_pred_lr_val)
recall_lr_val = recall_score(y_val, y_pred_lr_val)

F1_lr_val = 2 * (precision_lr_val * recall_lr_val) / (precision_lr_val + recall_lr_val)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_lr)
print("F1: ",F1_lr_val)
print("Accuracy: ",accuracy_lr_val)

Cross Validation Scores (for tuned model):  [0.93333333 1.         1.         1.         1.        ]
F1:  1.0
Accuracy:  1.0


####Model Evaluation on Test Set

In [17]:
y_pred_lr_test = tuned_model_lr.predict(X_test)

accuracy_lr_test = accuracy_score(y_test, y_pred_lr_test)
precision_lr_test = precision_score(y_test, y_pred_lr_test)
recall_lr_test = recall_score(y_test, y_pred_lr_test)

F1_lr_test = 2 * (precision_lr_test * recall_lr_test) / (precision_lr_test + recall_lr_test)
print("F1: ",F1_lr_test)
print("Accuracy: ",accuracy_lr_test)

F1:  1.0
Accuracy:  1.0


###SVM Model

####Model Training

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

svm_model = SVC(random_state=0)
cv_scores_svm = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='accuracy')
svm_model.fit(X_train, y_train)

print("Cross Validation Scores:", cv_scores_svm)

Cross Validation Scores: [0.93333333 1.         1.         0.85714286 1.        ]


####Hyperparameter Tuning

In [19]:
from sklearn.model_selection import GridSearchCV

C = [0.1, 1, 10]
gamma = ['scale', 'auto', 1, 0.1, 0.01]
kernel = ['rbf', 'poly', 'sigmoid']
random_state = [0]

param_grid_svm = {'C': C,
                  'gamma': gamma,
                  'kernel': kernel,
                  'random_state': random_state}
base_model_svm = SVC()
grid_search_svm = GridSearchCV(estimator=base_model_svm, param_grid=param_grid_svm, cv=5, scoring='accuracy')

grid_search_svm.fit(X_train, y_train)
best_params_svm = grid_search_svm.best_params_
print("best parameters: ",best_params_svm)

best parameters:  {'C': 10, 'gamma': 0.1, 'kernel': 'sigmoid', 'random_state': 0}


####Model Evaluation on Validation Set

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_svm = SVC(**best_params_svm)
cv_scores_tuned_svm = cross_val_score(tuned_model_svm, X_train, y_train, cv=5, scoring='accuracy')
tuned_model_svm.fit(X_train, y_train)
y_pred_svm_val = tuned_model_svm.predict(X_val)

accuracy_svm_val = accuracy_score(y_val, y_pred_svm_val)
precision_svm_val = precision_score(y_val, y_pred_svm_val)
recall_svm_val = recall_score(y_val, y_pred_svm_val)

F1_svm_val = 2 * (precision_svm_val * recall_svm_val) / (precision_svm_val + recall_svm_val)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_svm)
print("F1: ",F1_svm_val)
print("Accuracy: ",accuracy_svm_val)

Cross Validation Scores (for tuned model):  [0.93333333 1.         1.         1.         1.        ]
F1:  1.0
Accuracy:  1.0


####Model Evaluation on Test Set

In [21]:
y_pred_svm_test = tuned_model_svm.predict(X_test)

accuracy_svm_test = accuracy_score(y_test, y_pred_svm_test)
precision_svm_test = precision_score(y_test, y_pred_svm_test)
recall_svm_test = recall_score(y_test, y_pred_svm_test)

F1_svm_test = 2 * (precision_svm_test * recall_svm_test) / (precision_svm_test + recall_svm_test)
print("F1: ",F1_svm_test)
print("Accuracy: ",accuracy_svm_test)

F1:  1.0
Accuracy:  1.0


###Naive Bayes Model

####Model Training

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

nb_model = GaussianNB()
cv_scores_nb = cross_val_score(nb_model, X_train, y_train, cv=5, scoring='accuracy')
nb_model.fit(X_train, y_train)

print("Cross Validation Scores:", cv_scores_nb)

Cross Validation Scores: [0.86666667 1.         0.78571429 0.85714286 0.78571429]


####Hyperparameter Tuning

In [23]:
from sklearn.model_selection import GridSearchCV

var_smoothing = np.logspace(0,-9, num=100)

param_grid_nb = {'var_smoothing': var_smoothing}
base_model_nb = GaussianNB()
grid_search_nb = GridSearchCV(estimator=base_model_nb, param_grid=param_grid_nb, cv=5, scoring='accuracy')

grid_search_nb.fit(X_train, y_train)
best_params_nb = grid_search_nb.best_params_
print("best parameters: ",best_params_nb)

best parameters:  {'var_smoothing': 0.8111308307896871}


####Evaluating Model on Validation Set

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_nb = GaussianNB(**best_params_nb)
cv_scores_tuned_nb = cross_val_score(tuned_model_nb, X_train, y_train, cv=5, scoring='accuracy')
tuned_model_nb.fit(X_train, y_train)
y_pred_nb_val = tuned_model_nb.predict(X_val)

accuracy_nb_val = accuracy_score(y_val, y_pred_nb_val)
precision_nb_val = precision_score(y_val, y_pred_nb_val)
recall_nb_val = recall_score(y_val, y_pred_nb_val)

F1_nb_val = 2 * (precision_nb_val * recall_nb_val) / (precision_nb_val + recall_nb_val)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_nb)
print("F1: ",F1_nb_val)
print("Accuracy: ",accuracy_nb_val)

Cross Validation Scores (for tuned model):  [0.93333333 1.         1.         0.85714286 0.78571429]
F1:  0.9411764705882353
Accuracy:  0.9375


####Evaluating Model on Test Set

In [25]:
y_pred_nb_test = tuned_model_nb.predict(X_test)

accuracy_nb_test = accuracy_score(y_test, y_pred_nb_test)
precision_nb_test = precision_score(y_test, y_pred_nb_test)
recall_nb_test = recall_score(y_test, y_pred_nb_test)

F1_nb_test = 2 * (precision_nb_test * recall_nb_test) / (precision_nb_test + recall_nb_test)
print("F1: ",F1_nb_test)
print("Accuracy: ",accuracy_nb_test)

F1:  1.0
Accuracy:  1.0


#[TF IDF Model](https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/)

###Get TF IDF Values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

# get tf-idf values using string created for b.o.w.
tfidf_value = tfidf.fit_transform(data_concat_list)

terms = tfidf.get_feature_names_out()
idf_scores = tfidf.idf_
terms_idf_scores = list(zip(terms, idf_scores))
terms_idf_scores.sort(key=lambda x: x[1], reverse=True)

# get idf values
print('\nidf values:')
for ele1, ele2 in terms_idf_scores:
    print(ele1, ':', ele2)

# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)

# display tf-idf values
print('\ntf-idf value:')
print(tfidf_value)

In [None]:
# TF IDF values in descending order
tfidf_scores = tfidf_value.max(0).toarray()[0]
tfidf_dict = dict(zip(terms, tfidf_scores))

sorted_tfidf = sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)
for term, score in sorted_tfidf:
    print(f'Term: {term}, TF-IDF Score: {score}')

In [28]:
# TF IDF values matrix
tfidf_matrix = tfidf_value.toarray()
print('\ntf-idf values in matrix form:')
print(tfidf_matrix)
print(tfidf_matrix.shape)


tf-idf values in matrix form:
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.27717586 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(104, 271)


###Labeling Data

In [29]:
# adding target array to TF IDF matrix
complete_array_tfidf = np.append(tfidf_matrix, target_array, axis=1)

print(complete_array_tfidf)
print(complete_array_tfidf.shape)

[[0.         0.         0.         ... 0.         0.         1.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]
 ...
 [0.         0.         0.         ... 0.27717586 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(104, 272)


###Splitting the Data

In [30]:
from sklearn.model_selection import train_test_split
X_tfidf = np.delete(complete_array_tfidf, [271], axis=1)
y_tfidf = complete_array_tfidf[:, [271]]

X_train_tfidf, X_temp_tfidf, y_train_tfidf, y_temp_tfidf = train_test_split(
    X_tfidf, y_tfidf, test_size=0.30, random_state=0
)
X_val_tfidf, X_test_tfidf, y_val_tfidf, y_test_tfidf = train_test_split(
    X_temp_tfidf, y_temp_tfidf, test_size=0.5, random_state=0
)

In [None]:
print("y_val: ",y_val_tfidf)
print("y_test: ",y_test_tfidf)

##Classification Models

###Logistic Regression

####Model Training

In [32]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr_model = LogisticRegression(random_state=0)
cv_scores_lr_tfidf = cross_val_score(lr_model, X_train_tfidf, y_train_tfidf, cv=5, scoring='accuracy')
lr_model.fit(X_train_tfidf, y_train_tfidf)

print("Cross Validation Scores: ", cv_scores_lr_tfidf)

Cross Validation Scores:  [0.86666667 0.8        0.78571429 0.85714286 0.71428571]


####Hyperparameter Tuning

In [33]:
from sklearn.model_selection import GridSearchCV

solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1', 'l2', 'elasticnet']
penalty.append(None)
C = [10, 1, 0.1, 0.01]
random_state = [0]

param_grid_lr = {'solver': solver,
                 'penalty': penalty,
                 'C': C,
                 'random_state': random_state}
base_model_lr = LogisticRegression()
grid_search_lr = GridSearchCV(estimator=base_model_lr, param_grid=param_grid_lr, cv=5, scoring='accuracy')

grid_search_lr.fit(X_train_tfidf, y_train_tfidf)
best_params_lr_tfidf = grid_search_lr.best_params_
print("best parameters: ",best_params_lr_tfidf)

best parameters:  {'C': 10, 'penalty': 'l1', 'random_state': 0, 'solver': 'liblinear'}


####Evaluating Model on Validation Set

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_lr_tfidf = LogisticRegression(**best_params_lr_tfidf)
cv_scores_tuned_lr_tfidf = cross_val_score(tuned_model_lr_tfidf, X_train_tfidf, y_train_tfidf, cv=5, scoring='accuracy')
tuned_model_lr_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_lr_val_tfidf = tuned_model_lr_tfidf.predict(X_val_tfidf)

accuracy_lr_val_tfidf = accuracy_score(y_val_tfidf, y_pred_lr_val_tfidf)
precision_lr_val_tfidf = precision_score(y_val_tfidf, y_pred_lr_val_tfidf)
recall_lr_val_tfidf = recall_score(y_val_tfidf, y_pred_lr_val_tfidf)

F1_lr_val_tfidf = 2 * (precision_lr_val_tfidf * recall_lr_val_tfidf) / (precision_lr_val_tfidf + recall_lr_val_tfidf)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_lr_tfidf)
print("F1: ",F1_lr_val_tfidf)
print("Accuracy: ",accuracy_lr_val_tfidf)

Cross Validation Scores (for tuned model):  [1.         1.         1.         0.92857143 1.        ]
F1:  0.9411764705882353
Accuracy:  0.9375


####Evaluation Model on Test Set

In [35]:
y_pred_lr_test_tfidf = tuned_model_lr_tfidf.predict(X_test_tfidf)

accuracy_lr_test_tfidf = accuracy_score(y_test_tfidf, y_pred_lr_test_tfidf)
precision_lr_test_tfidf = precision_score(y_test_tfidf, y_pred_lr_test_tfidf)
recall_lr_test_tfidf = recall_score(y_test_tfidf, y_pred_lr_test_tfidf)

F1_lr_test_tfidf = 2 * (precision_lr_test_tfidf * recall_lr_test_tfidf) / (precision_lr_test_tfidf + recall_lr_test_tfidf)
print("F1: ",F1_lr_test_tfidf)
print("Accuracy: ",accuracy_lr_test_tfidf)

F1:  1.0
Accuracy:  1.0


###SVM Model

####Model Training

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

svm_model = SVC(random_state=0)
cv_scores_svm_tfidf = cross_val_score(svm_model, X_train_tfidf, y_train_tfidf, cv=5, scoring='accuracy')
svm_model.fit(X_train_tfidf, y_train_tfidf)

print("Cross Validation Scores:", cv_scores_svm_tfidf)

Cross Validation Scores: [0.93333333 0.8        0.92857143 0.92857143 0.64285714]


####Hyperparameter Tuning

In [37]:
from sklearn.model_selection import GridSearchCV

C = [0.1, 1, 10]
gamma = ['scale', 'auto', 1, 0.1, 0.01]
kernel = ['rbf', 'poly', 'sigmoid']
random_state = [0]

param_grid_svm = {'C': C,
                  'gamma': gamma,
                  'kernel': kernel,
                  'random_state': random_state}
base_model_svm = SVC()
grid_search_svm = GridSearchCV(estimator=base_model_svm, param_grid=param_grid_svm, cv=5, scoring='accuracy')

grid_search_svm.fit(X_train_tfidf, y_train_tfidf)
best_params_svm_tfidf = grid_search_svm.best_params_
print("best parameters: ",best_params_svm_tfidf)

best parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'poly', 'random_state': 0}


####Evaluating Model on Validation Set

In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_svm_tfidf = SVC(**best_params_svm_tfidf)
cv_scores_tuned_svm_tfidf = cross_val_score(tuned_model_svm_tfidf, X_train_tfidf, y_train_tfidf, cv=5, scoring='accuracy')
tuned_model_svm_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_svm_val_tfidf = tuned_model_svm_tfidf.predict(X_val_tfidf)

accuracy_svm_val_tfidf = accuracy_score(y_val_tfidf, y_pred_svm_val_tfidf)
precision_svm_val_tfidf = precision_score(y_val_tfidf, y_pred_svm_val_tfidf)
recall_svm_val_tfidf = recall_score(y_val_tfidf, y_pred_svm_val_tfidf)

F1_svm_val_tfidf = 2 * (precision_svm_val_tfidf * recall_svm_val_tfidf) / (precision_svm_val_tfidf + recall_svm_val_tfidf)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_svm_tfidf)
print("F1: ",F1_svm_val_tfidf)
print("Accuracy: ",accuracy_svm_val_tfidf)

Cross Validation Scores (for tuned model):  [0.93333333 1.         0.92857143 0.92857143 0.78571429]
F1:  0.8750000000000001
Accuracy:  0.875


####Evaluating Model on Test Set

In [39]:
y_pred_svm_test_tfidf = tuned_model_svm_tfidf.predict(X_test_tfidf)

accuracy_svm_test_tfidf = accuracy_score(y_test_tfidf, y_pred_svm_test_tfidf)
precision_svm_test_tfidf = precision_score(y_test_tfidf, y_pred_svm_test_tfidf)
recall_svm_test_tfidf = recall_score(y_test_tfidf, y_pred_svm_test_tfidf)

F1_svm_test_tfidf = 2 * (precision_svm_test_tfidf * recall_svm_test_tfidf) / (precision_svm_test_tfidf + recall_svm_test_tfidf)
print("F1: ",F1_svm_test_tfidf)
print("Accuracy: ",accuracy_svm_test_tfidf)

F1:  0.8750000000000001
Accuracy:  0.875


###Naive Bayes

####Model Training

In [40]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

nb_model = GaussianNB()
cv_scores_nb_tfidf = cross_val_score(nb_model, X_train_tfidf, y_train_tfidf, cv=5, scoring='accuracy')
nb_model.fit(X_train_tfidf, y_train_tfidf)

print("Cross Validation Scores:", cv_scores_nb_tfidf)

Cross Validation Scores: [0.86666667 0.93333333 0.71428571 0.92857143 0.64285714]


####Hyperparameter Tuning

In [41]:
from sklearn.model_selection import GridSearchCV

var_smoothing = np.logspace(0,-9, num=100)

param_grid_nb = {'var_smoothing': var_smoothing}
base_model_nb = GaussianNB()
grid_search_nb = GridSearchCV(estimator=base_model_nb, param_grid=param_grid_nb, cv=5, scoring='accuracy')

grid_search_nb.fit(X_train_tfidf, y_train_tfidf)
best_params_nb_tfidf = grid_search_nb.best_params_
print("best parameters: ",best_params_nb_tfidf)

best parameters:  {'var_smoothing': 0.005336699231206307}


####Evaluating Model on Validation Set

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_nb_tfidf = GaussianNB(**best_params_nb_tfidf)
cv_scores_tuned_nb_tfidf = cross_val_score(tuned_model_nb_tfidf, X_train_tfidf, y_train_tfidf, cv=5, scoring='accuracy')
tuned_model_nb_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_nb_val_tfidf = tuned_model_nb_tfidf.predict(X_val_tfidf)

accuracy_nb_val_tfidf = accuracy_score(y_val_tfidf, y_pred_nb_val_tfidf)
precision_nb_val_tfidf = precision_score(y_val_tfidf, y_pred_nb_val_tfidf)
recall_nb_val_tfidf = recall_score(y_val_tfidf, y_pred_nb_val_tfidf)

F1_nb_val_tfidf = 2 * (precision_nb_val_tfidf * recall_nb_val_tfidf) / (precision_nb_val_tfidf + recall_nb_val_tfidf)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_nb_tfidf)
print("F1: ",F1_nb_val_tfidf)
print("Accuracy: ",accuracy_nb_val_tfidf)

Cross Validation Scores (for tuned model):  [0.86666667 0.93333333 0.71428571 0.92857143 0.64285714]
F1:  0.8421052631578948
Accuracy:  0.8125


####Evaluating Model on Test Set

In [43]:
y_pred_nb_test_tfidf = tuned_model_nb_tfidf.predict(X_test_tfidf)

accuracy_nb_test_tfidf = accuracy_score(y_test_tfidf, y_pred_nb_test_tfidf)
precision_nb_test_tfidf = precision_score(y_test_tfidf, y_pred_nb_test_tfidf)
recall_nb_test_tfidf = recall_score(y_test_tfidf, y_pred_nb_test_tfidf)

F1_nb_test_tfidf = 2 * (precision_nb_test_tfidf * recall_nb_test_tfidf) / (precision_nb_test_tfidf + recall_nb_test_tfidf)
print("F1: ",F1_nb_test_tfidf)
print("Accuracy: ",accuracy_nb_test_tfidf)

F1:  0.8421052631578948
Accuracy:  0.8125


# [Word2Vec Model](https://medium.com/@dilip.voleti/classification-using-word2vec-b1d79d375381)

###Preprocessing for Word2Vec

In [None]:
import nltk
nltk.download()

In [66]:
data_concat_target = pd.DataFrame()
data_concat_target["text"] = data_concat["data_concat"]
data_concat_target["target"] = target

data_concat_target.head()

Unnamed: 0,text,target
0,2019 ct bauer college of business graduate mag...,1.0
1,native english teacher at epik english program...,0.0
2,aspiring human resources professional raleigh-...,1.0
3,people development coordinator at ryan denton ...,0.0
4,advisory board member at celal bayar universit...,0.0


In [67]:
# remove all punctuation, remove stop words and tokenize using built in data cleaner in gensim
import gensim

data_concat_target['clean_text'] = data_concat_target['text'].apply(lambda x: gensim.parsing.preprocessing.remove_stopwords(x),
                                                                    lambda x: gensim.parsing.preprocessing.remove_strip_punctuation(x))

data_concat_target.head()

Unnamed: 0,text,target,clean_text
0,2019 ct bauer college of business graduate mag...,1.0,2019 ct bauer college business graduate magna ...
1,native english teacher at epik english program...,0.0,native english teacher epik english program ko...
2,aspiring human resources professional raleigh-...,1.0,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...,0.0,people development coordinator ryan denton tex...
4,advisory board member at celal bayar universit...,0.0,advisory board member celal bayar university i...


In [68]:
from gensim.parsing.preprocessing import preprocess_string

data_concat_target['processed_text'] = [preprocess_string(sentence, [lambda x: x.lower()]) for sentence in data_concat_target['clean_text']]
data_concat_target.head()

Unnamed: 0,text,target,clean_text,processed_text
0,2019 ct bauer college of business graduate mag...,1.0,2019 ct bauer college business graduate magna ...,"[2019, ct, bauer, college, business, graduate,..."
1,native english teacher at epik english program...,0.0,native english teacher epik english program ko...,"[native, english, teacher, epik, english, prog..."
2,aspiring human resources professional raleigh-...,1.0,aspiring human resources professional raleigh-...,"[aspiring, human, resources, professional, ral..."
3,people development coordinator at ryan denton ...,0.0,people development coordinator ryan denton tex...,"[people, development, coordinator, ryan, dento..."
4,advisory board member at celal bayar universit...,0.0,advisory board member celal bayar university i...,"[advisory, board, member, celal, bayar, univer..."


###Splitting the Data

In [69]:
from sklearn.model_selection import train_test_split

X_w2v = data_concat_target['processed_text']
y_w2v = data_concat_target['target']

X_train_w2v, X_temp_w2v, y_train_w2v, y_temp_w2v = train_test_split(
    X_w2v, y_w2v, test_size=0.20, random_state=0
)
X_val_w2v, X_test_w2v, y_val_w2v, y_test_w2v = train_test_split(
    X_temp_w2v, y_temp_w2v, test_size=0.5, random_state=0
)

In [70]:
from gensim.models import Word2Vec

# train word2vec model
w2v_model = Word2Vec(X_train_w2v, min_count=2)

vocab_w2v = w2v_model.wv.key_to_index
print(vocab_w2v)

{'human': 0, 'resources': 1, 'area': 2, '500+': 3, 'aspiring': 4, 'texas': 5, 'greater': 6, 'houston': 7, 'seeking': 8, 'student': 9, 'professional': 10, 'english': 11, 'generalist': 12, 'university': 13, 'atlanta': 14, 'coordinator': 15, 'kanada': 16, 'specialist': 17, 'california': 18, 'york': 19, 'new': 20, 'college': 21, '&': 22, 'business': 23, 'north': 24, 'city': 25, 'carolina': 26, 'manager': 27, 'epik': 28, 'raleigh-durham': 29, 'people': 30, 'development': 31, 'ryan': 32, 'management': 33, 'denton': 34, 'san': 35, 'teacher': 36, 'hr': 37, 'senior': 38, 'korea': 39, 'native': 40, 'program': 41, 'energy': 42, 'officer': 43, 'engie': 44, 'gphr': 45, 'sphr': 46, 'csr': 47, 'svp': 48, 'communications': 49, 'marketing': 50, 'chro': 51, 'woodlands': 52, 'opportunities': 53, 'beach': 54, 'philadelphia': 55, '61': 56, 'humber': 57, '1': 58, 'francisco': 59, 'bay': 60, 'intercontinental': 61, 'buckhead': 62, 'georgia': 63, 'chapman': 64, 'director': 65, 'illinois': 66, 'cum': 67, 'posi

###Aggregated Sentence Vectors

In [50]:
import warnings
warnings.filterwarnings("ignore")

words = set(w2v_model.wv.index_to_key )
X_train_w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train_w2v])
X_val_w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_val_w2v])
X_test_w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test_w2v])

In [None]:
for i, v in enumerate(X_train_w2v_vect):
    print(len(X_train_w2v.iloc[i]), len(v))

In [52]:
X_train_w2v_vect_avg = []
for v in X_train_w2v_vect:
    if v.size:
        X_train_w2v_vect_avg.append(v.mean(axis=0))
    else:
        X_train_w2v_vect_avg.append(np.zeros(100, dtype=float))

X_val_w2v_vect_avg = []
for v in X_val_w2v_vect:
    if v.size:
        X_val_w2v_vect_avg.append(v.mean(axis=0))
    else:
        X_val_w2v_vect_avg.append(np.zeros(100, dtype=float))

X_test_w2v_vect_avg = []
for v in X_test_w2v_vect:
    if v.size:
        X_test_w2v_vect_avg.append(v.mean(axis=0))
    else:
        X_test_w2v_vect_avg.append(np.zeros(100, dtype=float))

In [None]:
for i, v in enumerate(X_train_w2v_vect_avg):
    print(len(X_train_w2v.iloc[i]), len(v))

##Classification Models

###Logistic Regression

####Model Training

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr_model = LogisticRegression(random_state=0)
cv_scores_lr_w2v = cross_val_score(lr_model, X_train_w2v_vect_avg, y_train_w2v, cv=5, scoring='accuracy')
lr_model.fit(X_train_w2v_vect_avg, y_train_w2v)

print("Cross Validation Scores: ", cv_scores_lr_w2v)

Cross Validation Scores:  [0.64705882 0.64705882 0.64705882 0.6875     0.6875    ]


####Hyperparameter Tuning

In [55]:
from sklearn.model_selection import GridSearchCV

solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1', 'l2', 'elasticnet']
penalty.append(None)
C = [10, 1, 0.1, 0.01]
random_state = [0]

param_grid_lr = {'solver': solver,
                 'penalty': penalty,
                 'C': C,
                 'random_state': random_state}
base_model_lr = LogisticRegression()
grid_search_lr = GridSearchCV(estimator=base_model_lr, param_grid=param_grid_lr, cv=5, scoring='accuracy')

grid_search_lr.fit(X_train_w2v_vect_avg, y_train_w2v)
best_params_lr_w2v = grid_search_lr.best_params_
print("best parameters: ",best_params_lr_w2v)

best parameters:  {'C': 10, 'penalty': None, 'random_state': 0, 'solver': 'newton-cg'}


####Evaluating Model on Validation Set

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_lr_w2v = LogisticRegression(**best_params_lr_w2v)
cv_scores_tuned_lr_w2v = cross_val_score(tuned_model_lr_w2v, X_train_w2v_vect_avg, y_train_w2v, cv=5, scoring='accuracy')

tuned_model_lr_w2v.fit(X_train_w2v_vect_avg, y_train_w2v)
y_pred_lr_val_w2v = tuned_model_lr_w2v.predict(X_val_w2v_vect_avg)

accuracy_lr_val_w2v = accuracy_score(y_val_w2v, y_pred_lr_val_w2v)
precision_lr_val_w2v = precision_score(y_val_w2v, y_pred_lr_val_w2v)
recall_lr_val_w2v = recall_score(y_val_w2v, y_pred_lr_val_w2v)

F1_lr_val_w2v = 2 * (precision_lr_val_w2v * recall_lr_val_w2v) / (precision_lr_val_w2v + recall_lr_val_w2v)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_lr_w2v)
print("F1: ",F1_lr_val_w2v)
print("Accuracy: ",accuracy_lr_val_w2v)

Cross Validation Scores (for tuned model):  [0.70588235 0.94117647 0.94117647 0.9375     0.875     ]
F1:  1.0
Accuracy:  1.0


####Evaluating Model on Test Set

In [57]:
y_pred_lr_test_w2v = tuned_model_lr_w2v.predict(X_test_w2v_vect_avg)

accuracy_lr_test_w2v = accuracy_score(y_test_w2v, y_pred_lr_test_w2v)
precision_lr_test_w2v = precision_score(y_test_w2v, y_pred_lr_test_w2v)
recall_lr_test_w2v = recall_score(y_test_w2v, y_pred_lr_test_w2v)

F1_lr_test_w2v = 2 * (precision_lr_test_w2v * recall_lr_test_w2v) / (precision_lr_test_w2v + recall_lr_test_w2v)
print("F1: ",F1_lr_test_w2v)
print("Accuracy: ",accuracy_lr_test_w2v)

F1:  1.0
Accuracy:  1.0


###SVM Model

####Model Training

In [58]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

svm_model = SVC(random_state=0)
cv_scores_svm_w2v = cross_val_score(svm_model, X_train_w2v_vect_avg, y_train_w2v, cv=5, scoring='accuracy')
svm_model.fit(X_train_w2v_vect_avg, y_train_w2v)

print("Cross Validation Scores:", cv_scores_svm_w2v)

Cross Validation Scores: [0.88235294 1.         1.         0.9375     0.75      ]


####Hyperparameter Tuning

In [59]:
from sklearn.model_selection import GridSearchCV

C = [0.1, 1, 10]
gamma = ['scale', 'auto', 1, 0.1, 0.01]
kernel = ['rbf', 'poly', 'sigmoid']
random_state = [0]

param_grid_svm = {'C': C,
                  'gamma': gamma,
                  'kernel': kernel,
                  'random_state': random_state}
base_model_svm = SVC()
grid_search_svm = GridSearchCV(estimator=base_model_svm, param_grid=param_grid_svm, cv=5, scoring='accuracy')

grid_search_svm.fit(X_train_w2v_vect_avg, y_train_w2v)
best_params_svm_w2v = grid_search_svm.best_params_
print("best parameters: ",best_params_svm_w2v)

best parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'random_state': 0}


####Evaluating Model on Validation Set

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_svm_w2v = SVC(**best_params_svm_w2v)
cv_scores_tuned_svm_w2v = cross_val_score(tuned_model_svm_w2v, X_train_w2v_vect_avg, y_train_w2v, cv=5, scoring='accuracy')
tuned_model_svm_w2v.fit(X_train_w2v_vect_avg, y_train_w2v)
y_pred_svm_val_w2v = tuned_model_svm_w2v.predict(X_val_w2v_vect_avg)

accuracy_svm_val_w2v = accuracy_score(y_val_w2v, y_pred_svm_val_w2v)
precision_svm_val_w2v = precision_score(y_val_w2v, y_pred_svm_val_w2v)
recall_svm_val_w2v = recall_score(y_val_w2v, y_pred_svm_val_w2v)

F1_svm_val_w2v = 2 * (precision_svm_val_w2v * recall_svm_val_w2v) / (precision_svm_val_w2v + recall_svm_val_w2v)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_svm_w2v)
print("F1: ",F1_svm_val_w2v)
print("Accuracy: ",accuracy_svm_val_w2v)

Cross Validation Scores (for tuned model):  [0.88235294 1.         1.         0.9375     0.75      ]
F1:  1.0
Accuracy:  1.0


####Evaluating Model on Test Set

In [61]:
y_pred_svm_test_w2v = tuned_model_svm_w2v.predict(X_test_w2v_vect_avg)

accuracy_svm_test_w2v = accuracy_score(y_test_w2v, y_pred_svm_test_w2v)
precision_svm_test_w2v = precision_score(y_test_w2v, y_pred_svm_test_w2v)
recall_svm_test_w2v = recall_score(y_test_w2v, y_pred_svm_test_w2v)

F1_svm_test_w2v = 2 * (precision_svm_test_w2v * recall_svm_test_w2v) / (precision_svm_test_w2v + recall_svm_test_w2v)
print("F1: ",F1_svm_test_w2v)
print("Accuracy: ",accuracy_svm_test_w2v)

F1:  1.0
Accuracy:  1.0


###Naive Bayes

####Model Training

In [62]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

nb_model = GaussianNB()
cv_scores_nb_w2v = cross_val_score(nb_model, X_train_w2v_vect_avg, y_train_w2v, cv=5, scoring='accuracy')
nb_model.fit(X_train_w2v_vect_avg, y_train_w2v)

print("Cross Validation Scores:", cv_scores_nb_w2v)

Cross Validation Scores: [0.82352941 1.         0.94117647 0.8125     0.75      ]


####Hyperparameter Tuning

In [63]:
from sklearn.model_selection import GridSearchCV

var_smoothing = np.logspace(0,-9, num=100)

param_grid_nb = {'var_smoothing': var_smoothing}
base_model_nb = GaussianNB()
grid_search_nb = GridSearchCV(estimator=base_model_nb, param_grid=param_grid_nb, cv=5, scoring='accuracy')

grid_search_nb.fit(X_train_w2v_vect_avg, y_train_w2v)
best_params_nb_w2v = grid_search_nb.best_params_
print("best parameters: ",best_params_nb_w2v)

best parameters:  {'var_smoothing': 0.01873817422860384}


####Evaluating Model on Validation Set

In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

tuned_model_nb_w2v = GaussianNB(**best_params_nb_w2v)
cv_scores_tuned_nb_w2v = cross_val_score(tuned_model_nb_w2v, X_train_w2v_vect_avg, y_train_w2v, cv=5, scoring='accuracy')
tuned_model_nb_w2v.fit(X_train_w2v_vect_avg, y_train_w2v)
y_pred_nb_val_w2v = tuned_model_nb_w2v.predict(X_val_w2v_vect_avg)

accuracy_nb_val_w2v = accuracy_score(y_val_w2v, y_pred_nb_val_w2v)
precision_nb_val_w2v = precision_score(y_val_w2v, y_pred_nb_val_w2v)
recall_nb_val_w2v = recall_score(y_val_w2v, y_pred_nb_val_w2v)

F1_nb_val_w2v = 2 * (precision_nb_val_w2v * recall_nb_val_w2v) / (precision_nb_val_w2v + recall_nb_val_w2v)

print("Cross Validation Scores (for tuned model): ",cv_scores_tuned_nb_w2v)
print("F1: ",F1_nb_val_w2v)
print("Accuracy: ",accuracy_nb_val_w2v)

Cross Validation Scores (for tuned model):  [0.76470588 1.         0.94117647 0.875      0.75      ]
F1:  1.0
Accuracy:  1.0


####Evaluating Model on Test Set

In [65]:
y_pred_nb_test_w2v = tuned_model_nb_w2v.predict(X_test_w2v_vect_avg)

accuracy_nb_test_w2v = accuracy_score(y_test_w2v, y_pred_nb_test_w2v)
precision_nb_test_w2v = precision_score(y_test_w2v, y_pred_nb_test_w2v)
recall_nb_test_w2v = recall_score(y_test_w2v, y_pred_nb_test_w2v)

F1_nb_test_w2v = 2 * (precision_nb_test_w2v * recall_nb_test_w2v) / (precision_nb_test_w2v + recall_nb_test_w2v)
print("F1: ",F1_nb_test_w2v)
print("Accuracy: ",accuracy_nb_test_w2v)

F1:  1.0
Accuracy:  1.0
