In [1]:
import pandas as pd
# Read the labelled training data from the CSV next to this notebook.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# CSV carries a saved row index; `index_col=0` would drop it — confirm.
df = pd.read_csv('sofmattress_train.csv')

# Model input is the 'sentence' column; the prediction target is 'label'.
X, y = df['sentence'], df['label']

# Show the first rows as this cell's output.
df.head()

Unnamed: 0,sentence,label
0,You guys provide EMI option?,EMI
1,Do you offer Zero Percent EMI payment options?,EMI
2,0% EMI.,EMI
3,EMI,EMI
4,I want in installment,EMI


In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition identical between runs.
# NOTE(review): the split is not stratified by label — consider `stratify=y`
# if every class has enough examples; verify against the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features with the vectorizer's default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights from the training sentences only, then
# reuse that fitted state to encode the held-out sentences.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Persist the fitted vectorizer so the same transformation can be reloaded
# alongside the saved model.
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Base estimator for the search. Both solvers in the grid ('liblinear' and
# 'saga') shuffle the data, so without a fixed random_state the selected
# parameters, CV score and saved model can change from run to run. The seed
# matches the one used for the train/test split.
log_reg = LogisticRegression(random_state=42)

# Hyperparameter grid: every combination is evaluated
# (2 penalties x 3 C values x 2 solvers x 2 iteration caps = 24 candidates).
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 50],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200]
}

# Exhaustive search with 5-fold cross-validation per candidate; no `scoring`
# is passed, so the estimator's default score is used.
grid = GridSearchCV(log_reg, param_grid=param_grid, cv=5)

In [5]:
# Silence warning categories that would otherwise clutter the notebook output.
# NOTE(review): ignoring ConvergenceWarning presumably also hides fits that
# stop at max_iter without converging — confirm that is acceptable here.
import warnings
from sklearn.exceptions import ConvergenceWarning

for noisy_category in (FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

In [6]:
# Run the cross-validated grid search on the vectorized training data.
grid.fit(X_train_vec, y_train)

# Report the winning parameter combination and its cross-validation score.
for caption, result in (
    ("Best params:", grid.best_params_),
    ("Best score (CV):", grid.best_score_),
):
    print(caption, result)



Best params: {'C': 50, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Best score (CV): 0.8015965166908563


In [7]:
from sklearn.metrics import accuracy_score
# Predict the held-out sentences with the fitted search object.
y_pred = grid.predict(X_test_vec)

# Fraction of held-out labels predicted exactly.
test_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", test_score)

Test Accuracy: 0.8181818181818182


In [8]:
# Save only the best estimator found by the search (not the whole
# GridSearchCV object) for later reuse.
best_model = grid.best_estimator_
with open("logistic_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)