In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

df = pd.read_json("output.json")
df.dropna(inplace = True)
df.head()



Unnamed: 0,interview_title,date,offer_acceptance,experience_rating,interview_rating,interview_process
30,Customer Success Manager Interview,2021-09-10,No Offer,Positive Experience,Average Interview,"First interview completed so far, haven't been..."
31,Sr. Software engineering Manager Interview,2023-04-26,Accepted Offer,Positive Experience,Average Interview,"- 30 min chat with recruiter, both decided to ..."
32,Software Engineer Interview,2023-04-25,No Offer,Positive Experience,Average Interview,Great process - I had a coding interview and a...
33,Product Manager Interview,2023-04-25,No Offer,Positive Experience,Average Interview,Fair and reasonable. Focus on accessibility. D...
34,Executive Assistant Interview,2023-04-25,Accepted Offer,Neutral Experience,Average Interview,Interview process was long but worth it. I bel...


In [3]:
df['offer_acceptance'].value_counts()

No Offer          5598
Accepted Offer    3302
Declined Offer     768
Name: offer_acceptance, dtype: int64

In [4]:
df['experience_rating'].value_counts()

Positive Experience    6424
Neutral Experience     1819
Negative Experience    1385
Average Interview        18
Difficult Interview      12
Easy Interview           10
Name: experience_rating, dtype: int64

In [5]:
df['interview_rating'].value_counts()

Average Interview      5669
Difficult Interview    2369
Easy Interview         1630
Name: interview_rating, dtype: int64

In [None]:
I have a dataset named df.csv that has has following columns:
    
1. offer_acceptance: it has three values: No Offer, Accepted Offer, Declined Offer
2. experience_rating: it has 6 values: Positive Experience, Neutral Experience, Negative Experience, Average Interview, Difficult Interview, Easy Interview
3. interview_rating: It has 3 values: Average Interview, Difficult Interview, Easy Interview
4. interview_title: It has interview title in text
5. interview_process: It has interview process details descriped in text
    
    

Use Python to train following classifiers:
    - Logistic Regression
    - Decision Tree classifier
    - Gradient boosting classifier
    - XGBoost classifier
    - RandomForest classifier
    - SVM classifier
    - Naive Bayes classifier

Make sure to try both CountVectorizer, TfidfVectorizer separately.
Make sure to Use all columns  
Make sure to use gridsearch and do hypertuning with exactly 3 parameters with 3-5 values.

Evaluate the models on 70% training and 30% test.

Print accuracy of all models together and plot them and compare them.


In [2]:
X = df['interview_title']
y = df['offer_acceptance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [3]:
def train_model(vectorizer, classifier):
    # Vectorize the text data
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train the model
    classifier.fit(X_train_vec, y_train)

    # Make predictions on the testing data
    y_pred = classifier.predict(X_test_vec)

    # Calculate the accuracy score
    acc_score = accuracy_score(y_test, y_pred)

    return acc_score



In [4]:
# Multinomial Naive Bayes
nb = MultinomialNB()
nb_params = {'alpha': [0.1, 0.5, 1]}
nb_grid = GridSearchCV(nb, nb_params, cv=5)
nb_acc_score = train_model(CountVectorizer(), nb_grid)
print("Multinomial Naive Bayes Accuracy:", nb_acc_score)



Multinomial Naive Bayes Accuracy: 0.5811138014527845


In [5]:
# Logistic Regression
lr = LogisticRegression(max_iter=10000)
lr_params = {'C': [0.1, 1, 10]}
lr_grid = GridSearchCV(lr, lr_params, cv=5)
lr_acc_score = train_model(CountVectorizer(), lr_grid)
print("Logistic Regression Accuracy:", lr_acc_score)



Logistic Regression Accuracy: 0.5828433068142511


In [10]:
# Support Vector Machines (SVM)
svm = SVC()
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_grid = GridSearchCV(svm, svm_params, cv=5)
svm_acc_score = train_model(TfidfVectorizer(), svm_grid)
print("SVM Accuracy:", svm_acc_score)



SVM Accuracy: 0.5873400207540643


In [11]:
# Random Forest
rf = RandomForestClassifier()
rf_params = {'n_estimators': [100, 200, 300]}
rf_grid = GridSearchCV(rf, rf_params, cv=5)
rf_acc_score = train_model(CountVectorizer(), rf_grid)
print("Random Forest Accuracy:", rf_acc_score)



Random Forest Accuracy: 0.5728121757177447


In [12]:
# Train the final model
final_vectorizer = TfidfVectorizer()
final_classifier = SVC(C=10, kernel='linear')
X_vec = final_vectorizer.fit_transform(X)
final_classifier.fit(X_vec, y)



SVC(C=10, kernel='linear')

In [13]:
# Make predictions on new data
new_data = ['I had a great interview and received an offer!', 'Unfortunately, I did not receive an offer after my interview.']
new_data_vec = final_vectorizer.transform(new_data)
new_data_pred = final_classifier.predict(new_data_vec)
print(new_data_pred)

['No Offer' 'No Offer']
