In [1]:
import string

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Load the small English spaCy model used for tokenizing and lemmatizing.
# Only lemmas are read from the parsed documents in this notebook, so the
# dependency parser and named-entity recognizer are switched off: they are
# the expensive components and the tokenizer runs once per document for every
# grid-search fit.
# NOTE(review): re-enable them if another cell ever needs token.dep_ / doc.ents.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Start from spaCy's built-in English stop words but keep "not", so that it
# survives stop-word removal in the tokenizer.
stopwords = list(STOP_WORDS)
stopwords.remove('not')


In [4]:
# Read the raw CSV and drop the two columns that are not used as features or label.
data = (
    pd.read_csv('sentimentdataset (Project 1).csv')
    .drop(columns=['ID', 'Source'])
)

In [5]:
def text_data_cleaning(sentence):
    """Tokenize ``sentence`` with spaCy and return its cleaned, lemmatized tokens.

    Each token is replaced by its lower-cased, whitespace-stripped lemma. When
    the lemma is the literal ``"-PRON-"`` (presumably the pronoun placeholder
    emitted by older spaCy versions -- TODO confirm), the lower-cased surface
    form is used instead. Tokens found in the module-level ``stopwords`` list
    or in ``string.punctuation`` are dropped.

    Parameters
    ----------
    sentence : str
        Raw text passed to the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        The remaining normalized tokens, in document order.
    """
    normalized = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in nlp(sentence)
    ]

    # NOTE(review): `in string.punctuation` is a substring test against one
    # long string, so the empty string is dropped too, while multi-character
    # runs such as "..." are kept -- confirm this is the intended filter.
    return [
        word
        for word in normalized
        if not (word in stopwords or word in string.punctuation)
    ]

In [6]:
# Split the frame into the text feature column and the label column.
X, y = data['Message'], data['Target']

In [7]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Two-step model: TF-IDF features built from the spaCy-based tokenizer, fed
# into a linear SVM classifier.
pipeline = Pipeline([
    # token_pattern=None: the regex pattern is unused once a custom tokenizer is
    # supplied, and leaving the default in place makes scikit-learn emit a
    # warning on every fit.
    ('tfidf', TfidfVectorizer(tokenizer=text_data_cleaning, token_pattern=None)),
    # Seeded so repeated runs of the notebook train the same classifier,
    # consistent with the seeded train/test split.
    ('clf', LinearSVC(random_state=0)),
])

In [9]:
# Hyper-parameter grid for the search. Keys follow the Pipeline convention
# "<step name>__<parameter name>".
param_grid = dict(
    tfidf__max_df=[0.5, 0.75, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__C=[0.1, 1, 10],
)

In [11]:
# Search every combination in param_grid with cv=5 on the training split only;
# the held-out test split is not touched here.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)




In [12]:
# Show the parameter combination the grid search selected.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'clf__C': 1, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


In [13]:
# Predict labels for the held-out test split using the fitted grid search.
y_pred = grid_search.predict(X_test)

In [14]:
# Evaluate on the held-out split: confusion matrix, per-class report, then
# overall accuracy.
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

[[220  45]
 [ 42 242]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       265
           1       0.84      0.85      0.85       284

    accuracy                           0.84       549
   macro avg       0.84      0.84      0.84       549
weighted avg       0.84      0.84      0.84       549

Accuracy: 0.8415300546448088


In [16]:
# Persist the best pipeline found by the grid search (vectorizer + classifier).
# NOTE(review): the pipeline references the notebook-defined
# `text_data_cleaning` tokenizer; presumably that function (and the `nlp` /
# `stopwords` objects it uses) must be defined again in any session that loads
# this file -- confirm before shipping the artifact elsewhere.
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'linear_svm_best_model.joblib')

['linear_svm_best_model.joblib']

In [17]:
# Reload the saved pipeline from disk to check that it round-trips.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_model = joblib.load('linear_svm_best_model.joblib')

# Smoke-test the reloaded model on a few hand-written sentences.
new_data = [
    "it is very good",
    "it is bad",
    "awesome",
    "I am not comfortable with that",
]
predictions = loaded_model.predict(new_data)
print(predictions)

[1 0 1 0]
