In [12]:
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tqdm import tqdm
# Inputs: corpus.txt supplies one text per line, labels.txt one label per line.
corpus = []
labels = []


def _append_stripped_lines(path, target):
    """Append every line of the text file at `path` to `target`, stripped of
    leading/trailing whitespace.

    Problems are reported on stdout rather than raised; anything appended
    before a failure stays in `target`.
    """
    try:
        with open(path, 'r') as handle:
            for raw_line in handle:
                target.append(raw_line.strip())
    except FileNotFoundError:
        print(f"File '{path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


file_path = 'corpus.txt'
_append_stripped_lines(file_path, corpus)

# file_path is rebound here, so after this cell it names the labels file.
file_path = 'labels.txt'
_append_stripped_lines(file_path, labels)

# Fit the encoder on the label strings and get one integer code per entry.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Label -> code lookup, built by pairing each label with its own code
# (repeated labels simply overwrite themselves with the same value).
label_encoding_dict = {
    label: code for label, code in zip(labels, encoded_labels)
}

# Show the mapping under its heading
print("Label Encoding Dictionary:", label_encoding_dict, sep="\n")
# Text features: TF-IDF with default settings
tfidf_vectorizer = TfidfVectorizer()

# SVM classifier; its kernel and hyper-parameters are picked by the search below
svc = SVC()

# Chain vectorizer and classifier. make_pipeline names each step after its
# lowercased class name, which is where the 'svc__' prefix below comes from.
pipeline = make_pipeline(tfidf_vectorizer, svc)

# Candidate hyper-parameters; the search evaluates their full Cartesian product.
# NOTE: gamma is not used by the 'linear' kernel and degree only by 'poly', so
# some candidates are equivalent. The grid is kept as one dict because the
# following cell reads all four keys from best_params_.
param_grid = dict(
    svc__C=[0.001, 0.01, 0.1, 1, 10, 100],
    svc__gamma=[0.0001, 0.001, 0.01, 0.1, 1, 10],
    svc__kernel=['rbf', 'linear', 'poly'],
    svc__degree=[2, 3, 4],
)

# 5-fold cross-validated search scored on accuracy, run on all CPU cores;
# refit=True retrains the best configuration on the whole corpus afterwards.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    refit=True,
)
grid_search.fit(corpus, labels)

# Report the winning configuration and its mean cross-validated accuracy
print("Best parameters found: ", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

Label Encoding Dictionary:
{'sadness': 4, 'joy': 2, 'love': 3, 'anger': 0, 'fear': 1, 'surprise': 5}
Best parameters found:  {'svc__C': 10, 'svc__degree': 2, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
Best cross-validation score: 0.75


In [17]:
# Rebuild the final model outside the pipeline so that the vectorizer and the
# classifier exist as separate objects that can be stored side by side.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Configure the SVM from the grid-search winner: take every 'svc__<param>'
# entry of best_params_ and strip the pipeline step prefix. This keeps working
# if parameters are added to or removed from the grid.
best_svc_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
    if name.startswith('svc__')
}
svc = SVC(**best_svc_params)

# Train on the full corpus. fit() returns the estimator itself, so
# X_tfidf_svc is just another reference to `svc` (name kept for later cells).
X_tfidf_svc = svc.fit(X_tfidf, labels)

# Persist the label encoder, the fitted vectorizer and the fitted classifier
# as one tuple. `pickle` comes from the notebook's import cell.
# NOTE(review): the classifier is trained on the raw `labels` strings, not on
# `encoded_labels`, so its predictions are label strings already; confirm that
# consumers of this file do not run them through label_encoder.inverse_transform.
with open('tfidf_svc_model.pkl', 'wb') as file:
    pickle.dump((label_encoder, tfidf_vectorizer, svc), file)

In [16]:
# Bare expression: the notebook renders the GridSearchCV object as this cell's output.
grid_search