Assignment 1
==============

Goal: Using the Yelp dataset, build a model that classifies the sentiment of a user's review.

Feature: text
Label: stars


Interpretation of stars
 5,4,3 -> Happy
 2,1 -> Sad

In [24]:
# !pip install -q streamlit

# !npm install localtunnel

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords for NLTK
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oysterable/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Read the Yelp reviews CSV, then summarise its columns, dtypes and non-null counts.
yelp_csv = './Datasets/yelp.csv'
df = pd.read_csv(yelp_csv)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64 
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64 
 8   useful       10000 non-null  int64 
 9   funny        10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [27]:
# Keep only the star rating and the review text.
sub_df = df.loc[:, ['stars', 'text']]
sub_df

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...
...,...,...
9995,3,First visit...Had lunch here today - used my G...
9996,4,Should be called house of deliciousness!\n\nI ...
9997,4,I recently visited Olive and Ivy for business ...
9998,2,My nephew just moved to Scottsdale recently so...


In [28]:
# Number of reviews for each star rating.
df['stars'].value_counts()

stars
4    3526
5    3337
3    1461
2     927
1     749
Name: count, dtype: int64

In [29]:
# Interpretation of stars
#  5,4,3 -> Happy
#  2,1 -> Sad

def star_to_sentiment(star):
    """Return 'happy' for a rating above 2 stars, otherwise 'sad'."""
    if int(star) > 2:
        return 'happy'
    return 'sad'

labels = df['stars'].apply(star_to_sentiment).reset_index(drop=True)
labels

0       happy
1       happy
2       happy
3       happy
4       happy
        ...  
9995      sad
9996    happy
9997    happy
9998      sad
9999    happy
Name: stars, Length: 10000, dtype: object

Check class balance (label bias)

In [30]:
# Count the reviews in each sentiment class to see how imbalanced the labels are.
labels.value_counts()

stars
happy    6863
sad      3137
Name: count, dtype: int64

In [31]:
# The review text column is the only input feature.
features = df.loc[:, 'text']
features

0       My wife took me here on my birthday for breakf...
1       I have no idea why some people give bad review...
2       love the gyro plate. Rice is so good and I als...
3       Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4       General Manager Scott Petello is a good egg!!!...
                              ...                        
9995    First visit...Had lunch here today - used my G...
9996    Should be called house of deliciousness!\n\nI ...
9997    I recently visited Olive and Ivy for business ...
9998    My nephew just moved to Scottsdale recently so...
9999    4-5 locations.. all 4.5 star average.. I think...
Name: text, Length: 10000, dtype: object

In [32]:
#Preprocess the text data

# Built once when this cell runs instead of on every call: the function is
# applied to every review, and the stop-word set is the same for all of them.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a review into lowercase, space-separated content words.

    Steps, in order: lowercase; replace every non-word character with a
    space; collapse whitespace runs; delete digits; drop NLTK English stop
    words.

    Args:
        text: The raw review text (a str).

    Returns:
        The surviving words joined by single spaces, or an empty string when
        nothing survives the filtering.
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    # Digits are deleted rather than replaced by a space, so "4th" becomes "th".
    text = re.sub(r'\d', '', text)  # Remove numbers
    return ' '.join(word for word in text.split() if word not in STOP_WORDS)

In [33]:
# Apply preprocessing to features
features = features.apply(preprocess_text)

# Encode the sentiment labels as integers (happy -> 1, sad -> 0).
# Series.map turns every value that is missing from the dict into NaN, so
# mapping an already-encoded Series a second time would wipe out all labels.
# Skip the encoding when the labels are already numeric so that this cell
# can safely be re-run.
if not pd.api.types.is_numeric_dtype(labels):
    labels = labels.map({'happy': 1, 'sad': 0})

In [34]:
#Split dataset into training and testing sets
split_options = {'test_size': 0.2, 'random_state': 42}  # 20% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(features, labels, **split_options)

In [35]:
#Vectorize the text using TF-IDF (Term Frequency-Inverse Document Frequency)
max_vocabulary_size = 3000  # upper bound on the number of terms kept
vectorizer = TfidfVectorizer(max_features=max_vocabulary_size)
vectorizer

In [36]:

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [37]:
#Train the Logistic Regression model
# No arguments are passed, so every hyperparameter keeps its library default.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [38]:
#Check Quality of the Model

# Score on the held-out split first, then on the data the model was fitted on.
test_score = model.score(X_test_tfidf, y_test)
train_score = model.score(X_train_tfidf, y_train)
print(f"Testing score is {test_score} and TrainingScore is {train_score} ")

Testing score is 0.825 and TrainingScore is 0.883125 


In [39]:
#Classification report
from sklearn.metrics import classification_report

# Compare the test-split predictions with the true labels.
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.57      0.66       597
           1       0.84      0.94      0.88      1403

    accuracy                           0.82      2000
   macro avg       0.81      0.75      0.77      2000
weighted avg       0.82      0.82      0.82      2000



In [40]:
# Ask for a review, run it through the same cleaning and TF-IDF steps as the
# training data, and report the predicted sentiment.
user_input = input("Enter a bussiness Review text to classify: ")
processed_input = preprocess_text(user_input)  # same cleaning as the training text
input_tfidf = vectorizer.transform([processed_input])  # reuse the fitted vectorizer
prediction = model.predict(input_tfidf)

# Label 1 means happy; anything else is reported as sad.
messages = {
    True: "This is a HAPPY customer.",
    False: "This is a SAD customer.",
}
print(messages[bool(prediction[0] == 1)])


This is a SAD customer.


In [41]:
# Raw model output for the review entered in the previous cell (1 = happy, 0 = sad).
prediction

array([0])

## Deploy Model

In [42]:
import joblib
# Persist the fitted classifier and TF-IDF vectorizer. app.py loads both
# files by these exact names, so keep the filenames in sync.
joblib.dump(model, 'review_classifier_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

Create Streamlit App

In [43]:
%%writefile app.py

import streamlit as st
import joblib
import re
import nltk
from nltk.corpus import stopwords

# Streamlit re-executes this script on every widget interaction. Wrapping the
# stop-word download and the two joblib loads in a cached function means they
# run once per server process instead of on every rerun.
# (st.cache_resource requires Streamlit >= 1.18.)
@st.cache_resource
def load_artifacts():
    """Ensure the NLTK stop words are available and load the saved artifacts.

    Returns:
        A (model, vectorizer) tuple read from 'review_classifier_model.pkl'
        and 'tfidf_vectorizer.pkl' in the working directory.
    """
    # Download stopwords if needed
    nltk.download('stopwords')
    # NOTE: joblib.load unpickles arbitrary objects; only load files you created.
    return (
        joblib.load('review_classifier_model.pkl'),
        joblib.load('tfidf_vectorizer.pkl'),
    )

# Load the pre-trained model and vectorizer
model, vectorizer = load_artifacts()

# Clean the text the same way the training data was cleaned
def preprocess_text(text):
    """Lowercase the text, strip punctuation and digits, and drop English stop words."""
    lowered = text.lower()
    no_punct = re.sub(r'\W', ' ', lowered)  # punctuation and special characters -> space
    single_spaced = re.sub(r'\s+', ' ', no_punct)  # collapse whitespace runs
    no_digits = re.sub(r'\d', '', single_spaced)  # delete digits
    english_stop_words = set(stopwords.words('english'))
    kept = filter(lambda token: token not in english_stop_words, no_digits.split())
    return ' '.join(kept)

# Streamlit App UI
st.title("Customer Review Classifier")

# Text input
user_input = st.text_input("Enter a business review:")

if st.button("Classify"):
    # Treat whitespace-only input the same as no input at all.
    if user_input.strip():
        # Preprocess user input
        processed_input = preprocess_text(user_input)

        if not processed_input:
            # Only punctuation, digits or stop words were entered. The TF-IDF
            # vector would be all zeros, so a prediction would not reflect
            # anything the user wrote.
            st.write("Please enter a review with more descriptive words.")
        else:
            # Vectorize the input text
            input_tfidf = vectorizer.transform([processed_input])

            # Get the prediction
            prediction = model.predict(input_tfidf)

            # Output result (label 1 = happy, anything else = sad)
            if prediction[0] == 1:
                st.write("This is a **HAPPY** review.")
            else:
                st.write("This is a **SAD** review.")
    else:
        st.write("Please enter a review to classify.")


Overwriting app.py


Run a local server and expose it at a public URL using LocalTunnel

In [44]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

# Look up this machine's public IPv4 address. The response is closed by the
# `with` block, and the timeout stops the cell from hanging if the service
# is unreachable.
with urllib.request.urlopen('https://ipv4.icanhazip.com', timeout=10) as response:
    public_ip = response.read().decode('utf8').strip("\n")

print("Password/Enpoint IP for localtunnel is:", public_ip)

Password/Enpoint IP for localtunnel is: 172.56.234.148


Run the app and expose it through the localtunnel server

In [45]:
# Starts Streamlit in the background with its output redirected to
# ./content/logs.txt, then opens a localtunnel to port 8501.
# Requires Node.js: `npx` must be on PATH, otherwise the shell reports
# "command not found: npx".
# NOTE(review): the ./content directory presumably has to exist already for
# the redirect to succeed; confirm before running.
!streamlit run app.py &>./content/logs.txt & npx localtunnel --port 8501

zsh:1: command not found: npx


In [23]:
## Run this command in the terminal...
# streamlit run app.py &>./content/logs.txt & npx localtunnel --port 8501

# you will get a url like this:
# https://icy-cloths-appear.loca.lt