In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string

In [4]:
from google.colab import files
upload=files.upload()

Saving sentimentdataset (Project 1).csv to sentimentdataset (Project 1).csv


In [5]:
nlp = spacy.load('en_core_web_sm')
stopwords = list(STOP_WORDS)
stopwords.remove('not')


In [6]:
data = pd.read_csv('sentimentdataset (Project 1).csv')
data = data.drop(columns=['ID', 'Source'])

In [7]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)

    tokens = []  # list of tokens
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in string.punctuation:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [20]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in string.punctuation:
            cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)  # Join tokens into a single string


In [21]:
X = data['Message']
y = data['Target']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=text_data_cleaning)),
    ('clf', LinearSVC()),
])

In [24]:
param_grid = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1, 10],
}

In [25]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

grid_search.fit(X_train, y_train)




In [26]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'clf__C': 1, 'tfidf__max_df': 1.0, 'tfidf__ngram_range': (1, 2)}


In [27]:
y_pred = grid_search.predict(X_test)

In [28]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

[[194  71]
 [ 65 219]]
              precision    recall  f1-score   support

           0       0.75      0.73      0.74       265
           1       0.76      0.77      0.76       284

    accuracy                           0.75       549
   macro avg       0.75      0.75      0.75       549
weighted avg       0.75      0.75      0.75       549

Accuracy: 0.7522768670309654


In [29]:
import joblib
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'linear_svm_best_model.joblib')

['linear_svm_best_model.joblib']

In [30]:
loaded_model = joblib.load('linear_svm_best_model.joblib')

new_data = ["it is very good", 'it is bad', 'awesome', 'I am not comfortable with that']
predictions = loaded_model.predict(new_data)

print(predictions)

[1 0 1 1]


In [32]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
# Your previous code for data loading, text cleaning function, and train/test splitting...

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=text_data_cleaning)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create an MLPClassifier model
clf = MLPClassifier(hidden_layer_sizes=(100,), learning_rate_init=0.001, batch_size=32, max_iter=200, random_state=42)

# Train the MLPClassifier model using the transformed training data
clf.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_tfidf)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.65

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.61      0.63       265
           1       0.66      0.69      0.67       284

    accuracy                           0.65       549
   macro avg       0.65      0.65      0.65       549
weighted avg       0.65      0.65      0.65       549





In [34]:

# Fit the GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Obtain the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model's performance
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Save the best LinearSVC model
joblib.dump(best_model, 'linear_svm_best_model.joblib')

# Load the saved LinearSVC model
loaded_model = joblib.load('linear_svm_best_model.joblib')

# New unlabeled data for prediction
new_data = ["it is very good", 'it is bad', 'awesome', 'I am not comfortable with that']

# Preprocess the new data similarly as before
new_data_processed = [text_data_cleaning(sentence) for sentence in new_data]

# Use the loaded model for predictions on new data
predictions = loaded_model.predict(new_data_processed)
print("Predictions on New Data:", predictions)




Best Parameters: {'clf__C': 1, 'tfidf__max_df': 1.0, 'tfidf__ngram_range': (1, 2)}
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.73      0.74       265
           1       0.76      0.77      0.76       284

    accuracy                           0.75       549
   macro avg       0.75      0.75      0.75       549
weighted avg       0.75      0.75      0.75       549

Accuracy: 0.7522768670309654
Predictions on New Data: [1 0 1 1]
