In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# Load the data
# Load the data
df = pd.read_json("output.json")
df.dropna(inplace = True)
df.head()


Unnamed: 0,interview_title,date,offer_acceptance,experience_rating,interview_rating,interview_process
30,Customer Success Manager Interview,2021-09-10,No Offer,Positive Experience,Average Interview,"First interview completed so far, haven't been..."
31,Sr. Software engineering Manager Interview,2023-04-26,Accepted Offer,Positive Experience,Average Interview,"- 30 min chat with recruiter, both decided to ..."
32,Software Engineer Interview,2023-04-25,No Offer,Positive Experience,Average Interview,Great process - I had a coding interview and a...
33,Product Manager Interview,2023-04-25,No Offer,Positive Experience,Average Interview,Fair and reasonable. Focus on accessibility. D...
34,Executive Assistant Interview,2023-04-25,Accepted Offer,Neutral Experience,Average Interview,Interview process was long but worth it. I bel...


In [5]:
# Create a new column called 'label' based on the 'received_offer' column
df['label'] = df['offer_acceptance'].map({'Accepted Offer': 1, 'No Offer': 0, 'Declined Offer': 0})

# Remove rows with missing values
df.dropna(inplace=True)

# Preview the preprocessed data
df.head()


Unnamed: 0,interview_title,date,offer_acceptance,experience_rating,interview_rating,interview_process,label
30,Customer Success Manager Interview,2021-09-10,No Offer,Positive Experience,Average Interview,"First interview completed so far, haven't been...",0
31,Sr. Software engineering Manager Interview,2023-04-26,Accepted Offer,Positive Experience,Average Interview,"- 30 min chat with recruiter, both decided to ...",1
32,Software Engineer Interview,2023-04-25,No Offer,Positive Experience,Average Interview,Great process - I had a coding interview and a...,0
33,Product Manager Interview,2023-04-25,No Offer,Positive Experience,Average Interview,Fair and reasonable. Focus on accessibility. D...,0
34,Executive Assistant Interview,2023-04-25,Accepted Offer,Neutral Experience,Average Interview,Interview process was long but worth it. I bel...,1


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[['interview_process', 'experience_rating', 'interview_rating']], df['label'], test_size=0.3, random_state=42)


In [6]:
# Define the models to try
models = [
    ('Logistic Regression', Pipeline([('vectorizer', CountVectorizer()), ('classifier', LogisticRegression())])),
    ('Naive Bayes', Pipeline([('vectorizer', CountVectorizer()), ('classifier', MultinomialNB())])),
    ('Linear SVM', Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', LinearSVC())])),
    ('Random Forest', Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', RandomForestClassifier())]))
]

# Evaluate each model
for name, model in models:
    print(name)
    model.fit(X_train['interview_process'], y_train)
    y_pred = model.predict(X_test['interview_process'])
    print(classification_report(y_test, y_pred))


Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.76      0.79      0.77      1930
           1       0.54      0.50      0.52       961

    accuracy                           0.69      2891
   macro avg       0.65      0.64      0.65      2891
weighted avg       0.69      0.69      0.69      2891

Naive Bayes
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      1930
           1       0.53      0.57      0.55       961

    accuracy                           0.69      2891
   macro avg       0.65      0.66      0.66      2891
weighted avg       0.70      0.69      0.69      2891

Linear SVM
              precision    recall  f1-score   support

           0       0.76      0.80      0.78      1930
           1       0.55      0.49      0.52       961

    accuracy                           0.70      2891
   macro avg       0.66      0.65      0.65      2891
weighted avg       0.69      0.70      0.70      28

In [7]:
# Define the parameter grid to search over
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__C': [0.1, 1, 10]
}

# Perform a grid search over the parameter grid
grid_search = GridSearchCV(models[2][1], param_grid=param_grid, cv=5)
grid_search.fit(X_train['interview_process'], y_train)

# Print the best parameters and score
print(grid_search.best_params_)
print(grid_search.best_score_)

# Test the final model on the testing set
y_pred = grid_search.predict(X_test['interview_process'])
print(classification_report(y_test, y_pred))



{'classifier__C': 0.1, 'vectorizer__ngram_range': (1, 1)}
0.6956829215178624
              precision    recall  f1-score   support

           0       0.74      0.88      0.80      1930
           1       0.61      0.37      0.46       961

    accuracy                           0.71      2891
   macro avg       0.67      0.63      0.63      2891
weighted avg       0.69      0.71      0.69      2891

