In [None]:
# Libraries
import pandas as pd  
import numpy as np  
import os, re
import string
import nltk
from nltk.corpus import stopwords  
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS  
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB  # Import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline  


# Download NLTK stopwords
# Fetches the NLTK "stopwords" corpus; `stopwords.words("english")` is read
# from it later in this notebook (inside clean_text).
nltk.download('stopwords')  

In [None]:
# Show the directory that the relative data paths below are resolved against
cwd = os.getcwd()
print('Current working directory: ', cwd)

In [None]:
# Read the tab-separated training and test files, then preview each frame
train = pd.read_csv('labeledTrainData.tsv', sep='\t')
test = pd.read_csv('testData.tsv', sep='\t')
for frame in (train, test):
    print(frame.head())

In [None]:
# Class balance: number of rows per sentiment label, followed by the
# per-label descriptive statistics (transposed so each label is a column)
rows_label_1 = train.loc[train['sentiment'] == 1]
rows_label_0 = train.loc[train['sentiment'] == 0]
print("Number of rows for sentiment 1: {}".format(rows_label_1.shape[0]))
print("Number of rows for sentiment 0: {}".format(rows_label_0.shape[0]))
per_label_summary = train.groupby('sentiment').describe().T
print(per_label_summary)

In [None]:
# Add a 'length' column: len() of each raw review string
review_lengths = train['review'].map(len)
train['length'] = review_lengths
print(train.head())

In [None]:
# Distribution of raw review lengths, 100 bins
ax = train['length'].plot.hist(bins=100)
ax.set_title('Histogram of Review Length')
ax.set_xlabel('Length')
plt.show()

In [None]:
# Check specific review lengths
# Print the first review whose 'length' equals `target_length`. If no review
# has that length (e.g. a different data file), say so instead of letting
# `.iloc[0]` raise IndexError on an empty selection.
target_length = 13708
matching_reviews = train.loc[train['length'] == target_length, 'review']
if matching_reviews.empty:
    print(f"No review has length {target_length}")
else:
    print(matching_reviews.iloc[0])

In [10]:
# Text preprocessing
def clean_text(raw_text):
    """Turn one raw review into a list of lowercase, stopword-free tokens.

    Steps, in order: strip HTML markup with BeautifulSoup, replace every
    character that is not an ASCII letter with a space, lowercase, split on
    whitespace, and drop NLTK English stopwords.

    Args:
        raw_text: The review text to clean (may contain HTML).

    Returns:
        list: The remaining tokens in their original order; repeated words
        are kept.
    """
    # The stopword set never changes between calls, and this function is
    # invoked once per document, so load it from the NLTK corpus a single
    # time and keep it on the function object.
    stops = getattr(clean_text, "_stops", None)
    if stops is None:
        stops = clean_text._stops = frozenset(stopwords.words("english"))

    # 1. Remove HTML tags
    text = BeautifulSoup(raw_text, "html.parser").get_text()

    # 2. Replace everything that is not an ASCII letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # 3. Lowercase and split on whitespace, 4. drop stopwords
    return [w for w in letters_only.lower().split() if w not in stops]


In [None]:
# Tokenise every review with clean_text, then store the token list and the
# number of tokens that survived cleaning as two new columns
cleaned_reviews = train['review'].map(clean_text)
train['clean_review'] = cleaned_reviews
train['length_clean_review'] = cleaned_reviews.map(len)
print(train.head())

In [None]:
# Plot word cloud
# Build the cloud from the cleaned tokens in train['clean_review'] rather than
# from the raw text: clean_text strips HTML markup and non-letters, so tag
# fragments left in the raw reviews are not counted as words.
cloud_text = ' '.join(' '.join(tokens) for tokens in train['clean_review'])
word_cloud = WordCloud(width=1000, height=500, stopwords=STOPWORDS, background_color='blue').generate(cloud_text)
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(word_cloud)
ax.axis('off')
plt.show()


In [None]:
# Bag of words over the raw reviews; clean_text is passed as the analyzer,
# so the vectorizer counts the tokens that function returns
raw_reviews = train['review']
bow_transform = CountVectorizer(analyzer=clean_text)
X_train_counts = bow_transform.fit_transform(raw_reviews)

In [None]:
# Print details about the Bag of Words
# Vocabulary size, plus the words stored at two sample positions. The feature
# name array is built once, and each position is bounds-checked so a smaller
# vocabulary produces a message instead of an IndexError.
feature_names = bow_transform.get_feature_names_out()
print(f"Total number of vocab words: {len(bow_transform.vocabulary_)}")
for position in (71821, 72911):
    if position < len(feature_names):
        print(feature_names[position])
    else:
        print(f"No vocabulary entry at index {position} (vocabulary has {len(feature_names)} words)")

In [None]:
# Learn IDF weights from the count matrix, then re-weight that same matrix
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print(X_train_tfidf)

In [16]:
# Example for TF-IDF with vocabulary
# Three-sentence toy corpus. Its matrices get their own names so they do not
# overwrite X_train_counts / X_train_tfidf computed from the review corpus.
# NOTE: `count_vectorizer` and `tfidf_transformer` are (re)bound here to the
# toy-corpus objects; the cells that follow read them under these names.
texts = ["This is a sample document about war.", "This document is about a book.", "Another document discussing war and book."]
count_vectorizer = CountVectorizer()
example_counts = count_vectorizer.fit_transform(texts)
tfidf_transformer = TfidfTransformer()
example_tfidf = tfidf_transformer.fit_transform(example_counts)

In [17]:
# Look up the column index of 'war' and 'book' in the fitted vocabulary;
# .get() yields None for a word the vectorizer never saw
vocabulary = count_vectorizer.vocabulary_
word_index_war, word_index_book = (vocabulary.get(term) for term in ('war', 'book'))

In [None]:
# Print the learned IDF weight of each word, provided both were found
if word_index_war is None or word_index_book is None:
    print("One or both words are not in the vocabulary")
else:
    idf_values = tfidf_transformer.idf_
    print(idf_values[word_index_war])
    print(idf_values[word_index_book])

In [None]:
# Push two unseen sentences through the already-fitted vectorizer and
# TF-IDF transformer (transform only, no refitting) and report the shape
new_reviews = [
    "New review about war",
    "Another book review",
]
new_reviews_bow = count_vectorizer.transform(new_reviews)
new_reviews_tfidf = tfidf_transformer.transform(new_reviews_bow)
n_docs, n_terms = new_reviews_tfidf.shape
print((n_docs, n_terms))

In [20]:
# Hold out 22% of the labelled reviews for evaluation; fixed seed for a
# repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    train['review'],
    train['sentiment'],
    test_size=0.22,
    random_state=101,
)

In [21]:
# Evaluation helper: confusion matrix, headline rates and classification report
def pred(predicted, compare, labels=(0, 1)):
    """Print a confusion matrix and summary metrics for a binary classifier.

    Args:
        predicted: Predicted class labels, one per sample.
        compare: True class labels, in the same order as ``predicted``.
        labels: The (negative, positive) label pair. Both labels always get a
            row and a column in the confusion matrix, even when one of them
            never occurs in the data.

    Returns:
        None. The confusion matrix, accuracy, false negative rate, false
        positive rate (all in percent) and the classification report are
        printed.
    """
    # Compare by position: plain arrays stop pandas from aligning the two
    # inputs on their indexes when both happen to be Series.
    actual = np.asarray(compare)
    guessed = np.asarray(predicted)

    cm = pd.crosstab(actual, guessed, rownames=['Actual'], colnames=['Predicted'])
    # crosstab only creates rows/columns for labels that actually occur, so a
    # model that predicts a single class would break the positional lookups
    # below. Force the full 2x2 layout, filling absent cells with 0.
    cm = cm.reindex(
        index=pd.Index(labels, name='Actual'),
        columns=pd.Index(labels, name='Predicted'),
        fill_value=0,
    )
    # Row-major order of the 2x2 table: [[TN, FP], [FN, TP]]
    TN, FP, FN, TP = (int(cell) for cell in cm.to_numpy().ravel())
    undefined = float('nan')  # reported when a rate's denominator is zero

    print("CONFUSION MATRIX ------- >> ")
    print(cm)
    print()

    # Check accuracy of model
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total * 100 if total else undefined
    print('Accuracy: ', round(accuracy, 2))
    print()

    # False Negative Rate: share of actual positives predicted as negative
    actual_positives = FN + TP
    false_negative_rate = (FN * 100) / actual_positives if actual_positives else undefined
    print('False Negative Rate: ', round(false_negative_rate, 2))
    print()

    # False Positive Rate: share of actual negatives predicted as positive
    actual_negatives = FP + TN
    false_positive_rate = (FP * 100) / actual_negatives if actual_negatives else undefined
    print('False Positive Rate: ', round(false_positive_rate, 2))
    print()

    # Print classification report
    print('Classification Report: ')
    print(classification_report(actual, guessed))

In [None]:
# Training models
# Logistic regression on TF-IDF weighted bag-of-words features; fitted on the
# training split and first evaluated on that same split
logit_steps = [
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(random_state=101)),
]
pipeline_logit = Pipeline(steps=logit_steps)
predictions = pipeline_logit.fit(X_train, y_train).predict(X_train)
pred(predictions, y_train)

In [None]:
# Logistic regression scored on the held-out split
predictions = pipeline_logit.predict(X_test)
pred(predicted=predictions, compare=y_test)


In [None]:
# Multinomial naive Bayes on the same bag-of-words -> TF-IDF features;
# fitted on the training split and first evaluated on that same split
nb_steps = [
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
pipeline_nb = Pipeline(steps=nb_steps)
predictions = pipeline_nb.fit(X_train, y_train).predict(X_train)
pred(predictions, y_train)

In [None]:
# Naive Bayes scored on the held-out split
predictions = pipeline_nb.predict(X_test)
pred(predicted=predictions, compare=y_test)

In [None]:
# Random Forest Model
# 500-tree random forest on the same bag-of-words -> TF-IDF features; fitted
# on the training split and first evaluated on that same split.
# random_state is fixed so repeated runs build the same forest; 101 matches
# the seed already used for the train/test split and the logistic regression.
pipeline_rf = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(n_estimators=500, random_state=101))
])
pipeline_rf.fit(X_train, y_train)
predictions = pipeline_rf.predict(X_train)
pred(predictions, y_train)

In [None]:
# Random forest scored on the held-out split
predictions = pipeline_rf.predict(X_test)
pred(predicted=predictions, compare=y_test)

In [None]:
# Final Model: Logistic Regression
# Refit the logistic regression pipeline on every labelled review, predict a
# sentiment for each test review, and write the id/sentiment pairs to disk.
pipeline_logit_final = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(random_state=101))
])
pipeline_logit_final.fit(train['review'], train['sentiment'])
test['sentiment'] = pipeline_logit_final.predict(test['review'])

# Only the id and the predicted label go into the output file.
output = test[['id', 'sentiment']]
print(output)
# quoting=3 is csv.QUOTE_NONE: fields are written without surrounding quotes.
output.to_csv("output.csv", index=False, quoting=3)