# Data Collection and Preprocessing

In [8]:
import re
from functools import lru_cache

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Read the data, decoding the file as latin1
# (replace with the correct encoding if the source file differs).
data = pd.read_csv('sentiment_dataset.csv', encoding='latin1')

# Replace missing texts with '' and coerce every entry to str in one pass.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# newer pandas versions warn about and which does not modify the DataFrame
# under Copy-on-Write -- NaN would then be stringified as 'nan' by astype(str).
data['text'] = data['text'].fillna('').astype(str)

# Text Cleaning Function
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loaded once.

    The list is fetched on first use and cached, so repeated calls to
    ``preprocess_text`` share one set and membership tests are O(1).
    """
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one raw text string for bag-of-words vectorization.

    Steps, in order: remove URLs, drop every character that is not an ASCII
    letter or whitespace, lowercase, then remove English stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase

    # Fetch the stopword set once per call instead of once per word.
    stop_words = _english_stopwords()
    # NOTE(review): apostrophes were stripped above, so a contraction such as
    # "don't" is now "dont" and will not match a stopword entry that still
    # contains its apostrophe -- confirm this is intended.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the normalised text in its own column; the raw 'text' column is kept.
data['cleaned_text'] = data['text'].map(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

#


# Feature Extraction

Bag-of-Words


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary (capped at 5000 features) on the training split only,
# then encode both splits with it and expand them to dense count arrays.
vectorizer = CountVectorizer(max_features=5000)
train_counts = vectorizer.fit_transform(X_train)
test_counts = vectorizer.transform(X_test)
X_train_bow, X_test_bow = train_counts.toarray(), test_counts.toarray()


# Model Training

Bag-of-Words with Logistic Regression

In [14]:
# Ensure there are no NaNs in the split data
X_train = X_train.fillna('')
X_test = X_test.fillna('')

# Missing labels are replaced with the most frequent sentiment, computed once.
# NOTE(review): the mode is taken over the full dataset and is also written
# into the test labels; consider dropping unlabeled rows instead so the
# evaluation is not scored against imputed labels.
most_common_sentiment = data['sentiment'].mode()[0]
y_train = y_train.fillna(most_common_sentiment)
y_test = y_test.fillna(most_common_sentiment)

# Vectorizer (Bag-of-Words): vocabulary is fit on the training split only and
# reused for the test split. The resulting arrays hold integer term counts,
# so no NaN scrubbing of the feature matrices is required.
vectorizer = CountVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_test_bow = vectorizer.transform(X_test).toarray()

# Model (Logistic Regression)
model_bow = LogisticRegression()
model_bow.fit(X_train_bow, y_train)

# Predictions on the held-out split, used by the evaluation cell
y_pred_bow = model_bow.predict(X_test_bow)


In [19]:
# Score the held-out predictions: overall accuracy first, then the per-class
# precision / recall / F1 table.
bow_accuracy = accuracy_score(y_test, y_pred_bow)
bow_report = classification_report(y_test, y_pred_bow)
print("Bag-of-Words Accuracy:", bow_accuracy)
print("Classification Report:\n", bow_report)

Bag-of-Words Accuracy: 0.6998961578400831
Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.37      0.50       226
     neutral       0.69      0.89      0.78       526
    positive       0.70      0.58      0.63       211

    accuracy                           0.70       963
   macro avg       0.72      0.61      0.64       963
weighted avg       0.71      0.70      0.68       963



Use the Model to Predict Sentiment of New Examples

In [22]:
# Turn one raw text into the feature row the classifier expects
def preprocess_and_vectorize(text, vectorizer):
    """Clean a single raw text and encode it with an already-fitted vectorizer.

    Parameters
    ----------
    text : str
        Raw text; it is passed through ``preprocess_text`` first.
    vectorizer
        Object whose ``transform`` is called with a one-element list holding
        the cleaned text.

    Returns
    -------
    The value of ``vectorizer.transform([...]).toarray()`` for that single
    cleaned text.
    """
    return vectorizer.transform([preprocess_text(text)]).toarray()

# Hand-written example texts to analyze sentiment
example_texts = [
    "I love this product! It works really well.",
    "This is the worst service I have ever received.",
    "The product is okay, nothing special."
]

# Predict the sentiment of the example texts
for text in example_texts:
    vectorized_text = preprocess_and_vectorize(text, vectorizer)
    sentiment = model_bow.predict(vectorized_text)
    print(f"Text: {text}\nPredicted Sentiment: {sentiment[0]}\n")

# Predict the sentiment of the first five raw texts from the dataset.
# head(5) yields fewer rows on a shorter dataset instead of raising
# IndexError, and the shared helper keeps cleaning/encoding identical to the
# loop above.
for text in data['text'].head(5):
    vectorized_text = preprocess_and_vectorize(text, vectorizer)
    sentiment = model_bow.predict(vectorized_text)
    print(f"Text: {text}\nPredicted Sentiment: {sentiment[0]}\n")


Text: I love this product! It works really well.
Predicted Sentiment: positive

Text: This is the worst service I have ever received.
Predicted Sentiment: neutral

Text: The product is okay, nothing special.
Predicted Sentiment: neutral

Text: Last session of the day  http://twitpic.com/67ezh
Predicted Sentiment: neutral

Text:  Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China:  (SH)  (BJ).
Predicted Sentiment: positive

Text: Recession hit Veronique Branquinho, she has to quit her company, such a shame!
Predicted Sentiment: negative

Text:  happy bday!
Predicted Sentiment: positive

Text:  http://twitpic.com/4w75p - I like it!!
Predicted Sentiment: neutral

