# Loading some libraries

In [1]:
import numpy as np
import pandas as pd

# Reading the dataset

In [3]:
dt=pd.read_csv('labeledTrainData.tsv',header=0, delimiter="\t", quoting=3)

In [4]:
dt.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
dt.describe

<bound method NDFrame.describe of               id  sentiment                                             review
0       "5814_8"          1  "With all this stuff going down at the moment ...
1       "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2       "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3       "3630_4"          0  "It must be assumed that those who praised thi...
4       "9495_8"          1  "Superbly trashy and wondrously unpretentious ...
...          ...        ...                                                ...
24995   "3453_3"          0  "It seems like more consideration has gone int...
24996   "5064_1"          0  "I don't believe they made this film. Complete...
24997  "10905_3"          0  "Guy is a loser. Can't get girls, needs to bui...
24998  "10194_3"          0  "This 30 minute documentary Buñuel made in the...
24999   "8478_8"          1  "I saw this movie as a child and it broke my h...

[25000 rows x 3 c

In [7]:
reviews = dt['review']
sentiments = dt['sentiment']


# **Loading Libraries for Preprocessing**


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Data preprocessing
### Data preprocessing involves cleaning and transforming the raw text data into a format suitable for analysis. We apply text normalization techniques such as converting text to lowercase, removing HTML tags, special characters, and punctuation, tokenizing the text, removing stopwords, and performing stemming. Create a function preprocess_text that will be applied to each review text. This function will convert text to lowercase, remove HTML tags, special characters, and punctuation, tokenize the text, remove stopwords, and perform stemming.



In [15]:
# Data Preprocessing
def preprocess_text(text):

    # Convert to lowercase

    text = text.lower()

    # Remove HTML tags if any

    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and punctuation

    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization

    words = nltk.word_tokenize(text)

    # Stopword Removal

    stop_words = set(stopwords.words('english'))

    filtered_words = [word for word in words if word not in stop_words]

    # Stemming

    stemmer = PorterStemmer()

    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    return ' '.join(stemmed_words)

reviews = reviews.apply(preprocess_text)

# Splitting dataset to training and testing

In [16]:
X_train, X_test, y_train, y_test = train_test_split(reviews, sentiments, test_size=0.2, random_state=42)

# Feature Extraction
Feature extraction involves converting the preprocessed text into a numerical representation that machine learning algorithms can understand. Here, we use the Bag of Words (BoW) approach to create a matrix where each row represents a document (review) and each column represents a word, with values indicating word frequencies.

In [17]:
vectorizer = CountVectorizer(max_features=5000)  # Limit vocabulary to top 5000 words
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Model Building
In this step, we build a sentiment analysis model using Logistic Regression, a popular classification algorithm. We fit the model to the training data, allowing it to learn the relationships between the input features (BoW representation of reviews) and the target variable (sentiments).

In [18]:
model = LogisticRegression()
model.fit(X_train_bow, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Prediction
After training the model, we use it to make predictions on the test data. We calculate the accuracy of the model by comparing its predictions to the true sentiments. Additionally, we generate a classification report that provides precision, recall, and F1-score for each class (positive and negative sentiment) and overall.

In [19]:
y_pred = model.predict(X_test_bow)

#  Model Evaluation

In [20]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [23]:
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8518
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      2481
           1       0.85      0.86      0.85      2519

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000





*    **Accuracy**:-The accuracy is 0.8518, which means that the model correctly classified approximately 85.18% of the total instances.
*   **Precision**:-For class 0 (label 0), the precision is 0.85, and for class 1 (label 1), it's also 0.85. This indicates that when the model predicts class 0, it's correct 85% of the time, and when it predicts class 1, it's correct 85% of the time.
*   **Recall**:-It's a measure of how well the model is capturing the positive instances. Here for class 0, the recall is 0.85, and for class 1, it's 0.86. This means that the model is able to identify 85% of actual class 0 instances and 86% of actual class 1 instances.
*   **F1-Score**:The F1-score is the weighted average of precision and recall.
The F1-score is approximately 0.85 for both classes.
*   **Macro Average**: The macro average calculates the average performance metric (precision, recall, F1-score) across all classes without considering class imbalance. Here the macro average for precision, recall, and F1-score is approximately 0.85.
*   Weighted average consider class imbalance too













# Example Predictions

In [24]:
print("\nExample Predictions:")
for i in range(5):
    review = X_test.iloc[i]  # Get the review text
    true_sentiment = y_test.iloc[i]
    predicted_sentiment = "Positive" if y_pred[i] == 1 else "Negative"
    print(f"Review: {review}")
    print(f"True Sentiment: {'Positive' if true_sentiment == 1 else 'Negative'}")
    print(f"Predicted Sentiment: {predicted_sentiment}\n")


Example Predictions:
Review: read there girl soup came peter sellerss low period watch movi im surpris almost noth happen movi seemingli presenc seller goldi hawn help movi doesnt whole movi seem like randomli film whatev happen without script anyth mayb havent seen everi movi middleag elderli peopl tri hippi one give movi pretti bad nameal seller hawn star much better movi dont wast time pretti worthless
True Sentiment: Negative
Predicted Sentiment: Negative

Review: film pull getgo grab attent acknowledg yeah stori open clich funeralin hand judi wouldnt given materi done great reunion famou pick one pleas team armi platoon theatr group singer bandbut movi never stoop cheap sentiment think go swoop anoth direct case point flower sent admir judith band member interest group ride clich one jail one found religion one alki one sunk dementia joie de vivr rediscov judi ignit granddaught interest carri us along make us overlook sometim simplist natur plotth cast who talent lesli caron inco