In [None]:
import pandas as pd

# Relative path to the raw reviews CSV
file_path = 'Reviews.csv'

# Read the whole CSV into a DataFrame
df = pd.read_csv(file_path)

# Write the freshly loaded, unmodified DataFrame to a new CSV file (index column omitted)
df.to_csv('Reviews_full.csv', index=False)



In [None]:
print(df.head(5))  # Display the first 5 rows of the DataFrame

In [None]:
print(df.columns)  # Display the column labels of the DataFrame

In [None]:
# Drop identifier / metadata columns that the later preprocessing and modelling cells do not read.
columns_to_drop = ['Id', 'ProductId','UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator','Time']
# errors='ignore' skips labels that are already gone. This cell rebinds `df`, so
# without it a second execution of the cell would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

print(df.columns)  # Display the column labels that remain after the drop

In [None]:

""" 
Looping over rows to modify them one at a time is inefficient in pandas,
because pandas is built around vectorized, whole-column operations.
Accessing and modifying a single row inside a loop is therefore slow.
Instead, we can call the .str.lower() method directly on the column,
and that will convert every row in the column at once.
"""
# Lowercase both free-text columns through the vectorized .str accessor
for text_column in ('Summary', 'Text'):
    df[text_column] = df[text_column].str.lower()

# Preview the first 5 rows now that the text has been lowercased
print(df.head(5))

In [None]:
import re
import string
"""
Got a TypeError: 'Summary' and 'Text' are pandas columns (Series), not plain strings,
so we need to convert them to strings before applying a regex,
or use the .str.replace accessor, which applies the regex to every row of the column.
"""

# Character class matching any single character listed in string.punctuation
punctuation_pattern = f"[{re.escape(string.punctuation)}]"

# Delete those characters from each free-text column
for text_column in ('Summary', 'Text'):
    df[text_column] = df[text_column].str.replace(punctuation_pattern, "", regex=True)

print(df.head(5))  # Preview the first 5 rows after punctuation removal



In [None]:
# Combine 'Summary' and 'Text' into a single 'FullReview' column.
# Missing values are replaced with '' first: astype(str) on its own turns NaN into
# the literal word "nan", which would then be treated as part of the review text.
summary_text = df['Summary'].fillna('').astype(str)
body_text = df['Text'].fillna('').astype(str)
df['FullReview'] = summary_text + ' ' + body_text

# Collapse every run of whitespace into a single space, then trim both ends
# (an empty Summary or Text would otherwise leave a dangling space).
df['FullReview'] = df['FullReview'].str.replace(r'\s+', ' ', regex=True).str.strip()

# Drop the Summary and Text columns now that their content lives in FullReview
df = df.drop(columns=['Summary', 'Text'])

print(df['FullReview'].head(5))  # Preview the combined text
print(df.columns)  # Confirm which columns remain

In [None]:
"""
Stop words are very common words such as
a, the, and, that.
They add little meaning to the text, so removing them reduces noise.
"""
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  # Download the stopwords corpus if not already downloaded
from nltk.tokenize import word_tokenize  # NOTE(review): not referenced in the visible cells

# Build a set of English stop words; remove_stopwords tests membership against it.
# NOTE(review): this rebinds the name `stopwords` from the imported corpus object to a
# plain set. Re-running this cell still works because the import above runs first.
# NOTE(review): punctuation was stripped from the text in an earlier cell, so stop words
# that contain an apostrophe presumably can no longer match -- TODO confirm.
stopwords = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with every stop word removed.

    The text is split on whitespace, tokens found in the module-level
    `stopwords` collection are discarded, and the surviving tokens are
    re-joined with single spaces. Matching is exact (case-sensitive).
    """
    kept_tokens = []
    for token in text.split():
        if token in stopwords:
            continue  # stop word: leave it out of the result
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Apply remove_stopwords to every entry of the 'FullReview' column and store the result back
df['FullReview'] = df['FullReview'].apply(remove_stopwords)
print(df['FullReview'].head(5))  # Display the first 5 reviews after removing stopwords

In [None]:
"""
Normally we would need to tokenize the text ourselves,
but TfidfVectorizer works on raw text and tokenizes it automatically.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
#limit the vocabulary to at most 80000 terms (max_features); max_df=0.8 means ignore words that appear in more than 80% of the documents
tfidf = TfidfVectorizer(max_features=80000,max_df=0.8)
#fit on the FullReview text and return a sparse matrix of TF-IDF features

X = tfidf.fit_transform(df['FullReview'])
#print shape of matrix: (number of reviews, number of terms kept)
print(X.shape)  

In [None]:
# Check for null values in the DataFrame and delete the rows that contain them.
# `X` was built from `df` in an earlier cell (one matrix row per DataFrame row), so
# the same rows must also be removed from `X`; otherwise the features and the labels
# no longer line up and the train/test split fails on mismatched lengths.
features_match_rows = X.shape[0] == len(df)
complete_rows = df.notnull().all(axis=1).to_numpy()  # evaluated before any row is dropped

for col in df.columns:
    if df[col].isnull().any():
        print(f"Column '{col}' has null values. Dropping rows with null values.")
        df = df.dropna(subset=[col])

if features_match_rows and not complete_rows.all():
    # Select the surviving rows by integer position so X stays aligned with df.
    X = X[complete_rows.nonzero()[0]]
elif not features_match_rows:
    # X cannot be fixed up safely here; it has to be rebuilt from the current df.
    print("Warning: X and df have different numbers of rows; re-run the TF-IDF cell before training.")


In [None]:

def label_sentiment(score):
    """Map a review score to a sentiment label.

    A score of 4 or 5 gives 'Positive', a score of 3 gives 'Neutral',
    and every other value gives 'Negative'.
    """
    if score == 3:
        return 'Neutral'
    return 'Positive' if score in (4, 5) else 'Negative'

# Derive a 'Sentiment' label for each row by passing its 'Score' through label_sentiment
df['Sentiment'] = df['Score'].apply(label_sentiment)
# Show how many rows received each label
print(df['Sentiment'].value_counts())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,log_loss
import numpy as np
import joblib

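# Split the data into training and testing sets, then fit and evaluate the classifier
"""Using Logistic Regression (LogisticRegression is the estimator built in run_model),
    since it assumes a relationship between the features and the target variable.
    We are trying to predict the sentiment class derived from a review's rating,
    based on the text in the review.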
"""
#df['Sentiment'] = df['Score'].apply(lambda x: 1 if x >= 3 else 0)
  
def run_model(df, X_tfidf):
    """Train, evaluate and persist a logistic-regression sentiment classifier.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'Sentiment' column; it is used as the target.
    X_tfidf : matrix
        Feature matrix with one row per row of ``df`` (the TF-IDF scores).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, model)``.

    Side effects
    ------------
    Prints the evaluation metrics, appends them to ``results.txt`` and writes
    the fitted model to ``sentiment_model.pkl``.
    """
    # The TF-IDF scores are the features; the sentiment label is the target.
    # NOTE(review): if X_tfidf was fitted on all rows before this split (as the
    # notebook's TF-IDF cell does), the test rows influenced the vocabulary and weights.
    X = X_tfidf
    y = df['Sentiment']

    # 80/20 split with a fixed seed so repeated runs produce the same partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # `multi_class` is intentionally not passed: the lbfgs solver already fits a
    # multinomial model for multi-class targets, and the explicit argument is
    # deprecated in recent scikit-learn releases.
    # class_weight gives the 'Neutral' and 'Negative' classes more weight than 'Positive'.
    model = LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        class_weight={'Positive': 1, 'Neutral': 5, 'Negative': 3},
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Compute every metric once and reuse it for both the console and the results file.
    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    # labels=model.classes_ ties the probability columns to the classes the model was
    # trained on, so the loss is still computable if a class is absent from y_test.
    loss = log_loss(y_test, y_pred_proba, labels=model.classes_)

    print("Confusion Matrix:\n", conf_matrix, "\n")
    print("Accuracy Score:", accuracy, "\n")
    print("Classification Report:\n", report)
    print("Log Loss:", loss, "\n")

    # Append so that earlier runs stay in the log; the `with` block closes the file.
    with open("results.txt", "a") as f:
        # NOTE(review): this header is a fixed string, not the accuracy measured above.
        f.write("Final model run 88%\n")
        f.write("Confusion Matrix:\n" + str(conf_matrix) + "\n")
        f.write("Accuracy Score: \n" + str(accuracy) + "\n")
        f.write("Classification Report:\n" + report + "\n")
        f.write("Log Loss: " + str(loss) + "\n")

    # Save the model to a file
    joblib.dump(model, 'sentiment_model.pkl')
    return X_train, X_test, y_train, y_test, model
    
    
print("Starting Testing")
# Call the function to perform the train-test split and model training.
# The returned tuple is unpacked so that the splits and the fitted model stay available
# to later cells, and so the notebook does not echo the whole tuple as the cell output.
X_train, X_test, y_train, y_test, model = run_model(df, X)