# Introduction
The goal of this project is to gather business data using APIs, preprocess it, build machine learning models, optimize hyperparameters, and deploy the best model using Streamlit.


# Business Problem
Businesses must analyze customer reviews to improve services and attract more customers. By leveraging machine learning, businesses can predict performance based on reviews and location insights, enabling data-driven decision-making.

# Problem Statement
The aim is to develop a machine learning model that predicts business performance using customer reviews. This model will help businesses identify key success factors and make informed decisions. To achieve this, we will create predictive models using supervised machine learning algorithms.


In [831]:
import requests
import re
import nltk
import sklearn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from optuna.samplers import GridSampler,RandomSampler
import optuna

## API Request Code
This Python script sends a GET request to the local-business-data.p.rapidapi.com API using the requests library to search for pizza places within a specified location.

In [834]:
import requests

url = "https://local-business-data.p.rapidapi.com/search-in-area"

querystring = {"query":"pizza","lat":"37.359428","lng":"-121.925337","zoom":"13","limit":"20","language":"en","region":"us","extract_emails_and_contacts":"false"}

headers = {
	"x-rapidapi-key": "8c3d25328fmshd598205a362f267p1ba927jsnf0ec88b37ae4",
	"x-rapidapi-host": "local-business-data.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)


In [9]:
response.json().keys()

dict_keys(['status', 'request_id', 'parameters', 'data'])

In [15]:
data = pd.json_normalize(response.json(),record_path='data')

In [32]:
data.to_csv(r"C:\Users\HP\Videos\New folder\output.csv", index=False)

In [57]:
data =pd.read_csv(r"C:\Users\HP\Videos\New folder\output.csv")

In [61]:
url = "https://local-business-data.p.rapidapi.com/business-reviews"

headers = {
    "x-rapidapi-key": "8c3d25328fmshd598205a362f267p1ba927jsnf0ec88b37ae4",
    "x-rapidapi-host": "local-business-data.p.rapidapi.com"
}

all_reviews = []  

for business_id in data["business_id"]: 
    querystring = {
        "business_id": business_id,
        "limit": "20",
        "sort_by": "newest",
        "region": "us",
        "language": "en"
    }

    response = requests.get(url, headers=headers, params=querystring)

    if response.status_code == 200 and "data" in response.json():
        reviews = pd.json_normalize(response.json(), record_path="data")
        all_reviews.append(reviews)
    else:
        print(f"No data found for {business_id}")


final_reviews = pd.concat(all_reviews, ignore_index=True)


  final_reviews = pd.concat(all_reviews, ignore_index=True)


In [63]:
final_reviews.to_csv(r"C:\Users\HP\Videos\New folder\reviews.csv", index=False)


In [85]:
data2 = pd.read_csv(r"C:\Users\HP\Videos\New folder\reviews.csv")

## Data containing information on review from each business ID

In [87]:
fd = data2[['review_text','rating']]

In [89]:
final_data1= fd[~fd['review_text'].isnull()]

## Performing Simple EDA

The SEDA function analyzes a given text column in a DataFrame and detects various text patterns such as uppercase/lowercase mix, HTML tags, URLs, email addresses, mentions, special characters, and emojis.

In [99]:
def SEDA(data,column_name):
    import emoji
    
    lower_upper=data[column_name].apply(lambda x:False if (x.islower()) or(x.isupper()) else True).sum()
    
    tags=data[column_name].apply(lambda x:True if re.search("<.+?>",x) else False).sum()
    
    urls = data[column_name].apply(lambda x: True if re.search("http[s]://\\S+", x) else False).sum()
    
    mail_id = data[column_name].apply(lambda x: True if re.search("\\S+@\\S+", x) else False).sum()
    
    mentions = data[column_name].apply(lambda x: True if re.search("\\B[@#]\\S+", x) else False).sum()
    
    special_chara = data[column_name].apply(lambda x: True if re.search("[0-9%*&'\\?\\[\\]]", x) else False).sum()
    
    emoji1=data[column_name].apply(lambda x:True if emoji.emoji_count(x)>0 else False).sum()

 
    
    if lower_upper>0:
        print("Not in lower or upper case")
    if tags>0:
        print("tags present")
    if urls>0:
        print("urls present")
    if mail_id>0:
        print("mail_ids present")
    if mentions>0:
        print("mentions present")
    if special_chara>0:
        print("special_chara present")
    if emoji1>0:
        print("emoji present")

In [101]:
SEDA(final_data1,"review_text")

Not in lower or upper case
mail_ids present
special_chara present
emoji present


## Text Preprocessing Function

The pre_processing function is a text-cleaning utility that removes unwanted elements such as HTML tags, URLs, mentions, emails, punctuation, digits, emojis, and stopwords while also offering stemming and lemmatization options.

In [103]:
def pre_processing(data,column_name,lowercase=None,tags=True,urls=True,mentions=True,mail_ids=True,punctuation=True,digits=True,emojis=True,stopwordss=True,stem_lem=True,stemm=True):
    
    import emoji
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer,WordNetLemmatizer
    
    ps=PorterStemmer()
    wl=WordNetLemmatizer()

    stp=stopwords.words("english")
    stp.remove("not")
    
    # lower or upper
    if lowercase==True:
        data[column_name]=data[column_name].str.lower()
    elif lowercase==False:
        data[column_name]=data[column_name].str.upper()
    else:
        pass
    
    # tags removal
    if tags==True:
        data[column_name]=data[column_name].apply(lambda x:re.sub("<.+?>","",x))
    else:
        pass
    
    # urls removal
    if urls==True:
        data[column_name] = data[column_name].apply(lambda x: re.sub("http[s]://\\S+", "", x))
        
    else:
        pass
    
    # mentions removal
    if mentions==True:
        data[column_name] = data[column_name].apply(lambda x: re.sub("\\B[@#]\\S+", "", x))
    else:
        pass

    # mail-ids
    if mail_ids==True:
        data[column_name] = data[column_name].apply(lambda x: re.sub("\\S+@\\S+", "", x))
        
    else:
        pass
    
    # punctuation
    if punctuation==True:
        data[column_name] = data[column_name].apply(lambda x: re.sub("[!\"#$%&'()*+,-./:;<=>?@\\^_`{|}~]", "", x))
    else:
        pass
    
    #digits
    
    if digits==True:
        data[column_name]=data[column_name].apply(lambda x:re.sub("[0-9]","",x))
    else:
        pass
    
    #emojis
    if emojis==True:
        data[column_name]=data[column_name].apply(lambda x:emoji.demojize(x,delimiters=('', '')))
    else:
        pass
    
    if stopwordss==True:
        d=[]
        for sent in data[column_name]:
            l=[]
            for word in sent.split():
                if word not in stp:
                    l.append(word)
            d.append([" ".join(l)])
            
        data[column_name]=pd.DataFrame(d)
    else:
        pass
    
    
    if stem_lem==True:
        d=[]
        for sent in data[column_name]:
            l=[]
            if stemm==True:
                for word in sent.split():
                        l.append(ps.stem(word))
            else:
                for word in sent.split():
                        l.append(wl.lemmatize(word))
            d.append([" ".join(l)])
        data[column_name]=pd.DataFrame(d)
    else:
        pass
        
            
    
    return data
   

In [None]:
final = pre_processing(final_data1,"review_text",lowercase=True,tags=False,urls=False,mentions=False,mail_ids=False,punctuation=True,digits=True,emojis=True,stopwordss=False,stem_lem=False,stemm=False)

In [108]:
final.to_csv(r"C:\Users\HP\Videos\New folder\final.csv", index=False)


In [617]:
final= pd.read_csv(r"C:\Users\HP\Videos\New folder\final.csv")
final.head()

Unnamed: 0,review_text,rating
0,delicious,4
1,the workers are sweet and kind food amazing,5
2,preet was very helpful and cordial he keeps th...,5
3,good service very tasty food,5
4,very good and tasty place,5


## The CountVectorizer from sklearn.

feature_extraction.text converts a collection of text documents into a bag-of-words (BoW) representation. It tokenizes text, removes punctuation, and creates a sparse matrix where each row corresponds to a document and each column represents a word's frequency.

In [732]:
tf = TfidfVectorizer()
final_data = tf.fit_transform(final["review_text"])

In [734]:
vector_data =pd.DataFrame(final_data.toarray())

In [736]:
vector_data.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [738]:
final["rating"].index= range(0,254)

In [740]:
vector_data["rating"]= final["rating"]


In [742]:
fv = vector_data.iloc[:, :-1]

cv = vector_data["rating"]   


In [744]:
x_train, x_test, y_train, y_test = train_test_split(fv, cv, test_size=0.2, stratify=cv, random_state=42)


##  Standardization using `StandardScaler`

`StandardScaler` from `sklearn.preprocessing` to normalize the feature values in `x_train` and `x_test`. Standardization ensures that the data has a mean of **0** and a standard deviation of **1**.


In [746]:
std = StandardScaler(with_mean=False)
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)


In [748]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


(203, 2001)
(51, 2001)
(203,)
(51,)


## KNN

In [751]:
def objective(trial):
    neighbors = trial.suggest_int("neighbors",1,5)
    weight = trial.suggest_categorical("weight",["uniform","distance"])
    algorithm= trial.suggest_categorical("algorithm",["ball_tree", "kd_tree", "brute"])
    p= trial.suggest_int("p",1,2)

    knn= KNeighborsClassifier(n_neighbors=neighbors, weights=weight, algorithm=algorithm,p=p)

    cv_acc = cross_validate(knn, x_train, y_train, cv=5, scoring="accuracy")["test_score"].mean()



    return cv_acc

In [753]:
#create a study
study = optuna.create_study(direction="maximize", sampler=RandomSampler())

[I 2025-03-17 23:29:14,850] A new study created in memory with name: no-name-cb8d9337-a428-4cd6-af9b-d24f34e4e239


In [755]:
study.optimize(objective, n_trials=50)

[I 2025-03-17 23:29:18,145] Trial 0 finished with value: 0.21146341463414636 and parameters: {'neighbors': 3, 'weight': 'distance', 'algorithm': 'brute', 'p': 1}. Best is trial 0 with value: 0.21146341463414636.
[I 2025-03-17 23:29:18,447] Trial 1 finished with value: 0.20658536585365855 and parameters: {'neighbors': 3, 'weight': 'uniform', 'algorithm': 'ball_tree', 'p': 1}. Best is trial 0 with value: 0.21146341463414636.
[I 2025-03-17 23:29:18,806] Trial 2 finished with value: 0.4801219512195122 and parameters: {'neighbors': 5, 'weight': 'distance', 'algorithm': 'kd_tree', 'p': 1}. Best is trial 2 with value: 0.4801219512195122.
[I 2025-03-17 23:29:18,977] Trial 3 finished with value: 0.21146341463414636 and parameters: {'neighbors': 3, 'weight': 'distance', 'algorithm': 'brute', 'p': 1}. Best is trial 2 with value: 0.4801219512195122.
[I 2025-03-17 23:29:19,055] Trial 4 finished with value: 0.5571951219512196 and parameters: {'neighbors': 2, 'weight': 'distance', 'algorithm': 'brute

In [757]:
study.best_params

{'neighbors': 2, 'weight': 'distance', 'algorithm': 'brute', 'p': 2}

In [759]:
study.best_value

0.5571951219512196

In [722]:
knn_best = KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='brute', p=2)
knn_best.fit(x_train, y_train)

# Predict on test data
y_pred = knn_best.predict(x_test)


In [724]:
acc = classification_report(y_test, y_pred)
print(acc)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.14      1.00      0.24         6
           5       1.00      0.23      0.37        35

    accuracy                           0.27        51
   macro avg       0.23      0.25      0.12        51
weighted avg       0.70      0.27      0.28        51



### Model Evaluation:

Comparing predictions with actual ratings to assess performance.


In [829]:
# Predict on new review(s)
def predict_rating(new_reviews):
    new_reviews = tf.transform(new_reviews)  # Transform new reviews
    new_reviews_s = std.transform(new_reviews)  # Scale new data
    predictions = knn_best.predict(new_reviews_s)  # Predict ratings
    return predictions


In [850]:
# Demonstration
new_reviews = ["The food was amazing!", "Service was Horrible."]
predicted_ratings = predict_rating(new_reviews)
print(predicted_ratings)

[5 4]


In [823]:
# Demonstration
new_reviews = ["The food was worst", "Service terrible."]
predicted_ratings = predict_rating(new_reviews)
print(predicted_ratings)

[4 4]


## Logistic Regression

In [827]:
 
def objective(trial):
    combination = trial.suggest_categorical("combination", [
        ("l2", "lbfgs"), ("l2", "liblinear"), 
        ("l1", "liblinear"), ("elasticnet", "saga")
    ])
    
    C = trial.suggest_categorical("C", [0.001, 0.01, 0.1, 1, 10, 100])
    
    if combination[0] == "elasticnet":
        l1_ratio = trial.suggest_categorical("l1_ratio", [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
        model1 = LogisticRegression(
            fit_intercept=False, penalty=combination[0], 
            C=C, solver=combination[1], l1_ratio=l1_ratio
        )
    else:
        model1 = LogisticRegression(
            fit_intercept=False, penalty=combination[0], 
            C=C, solver=combination[1]
        )

    cv_acc = cross_validate(model, x_train, y_train, cv=5, scoring="accuracy")["test_score"].mean()
    
    return cv_acc


In [770]:
study = optuna.create_study(direction="maximize")

[I 2025-03-17 23:37:23,529] A new study created in memory with name: no-name-fb47e0f5-6b96-430a-b1f5-5be74ed31d17


In [776]:
study.optimize(objective, n_trials=50)

[I 2025-03-17 23:42:26,907] Trial 0 finished with value: 0.675 and parameters: {'combination': ('l2', 'lbfgs'), 'C': 0.001}. Best is trial 0 with value: 0.675.
[I 2025-03-17 23:42:27,396] Trial 1 finished with value: 0.675 and parameters: {'combination': ('l1', 'liblinear'), 'C': 100}. Best is trial 0 with value: 0.675.
[I 2025-03-17 23:42:27,855] Trial 2 finished with value: 0.675 and parameters: {'combination': ('l2', 'liblinear'), 'C': 100}. Best is trial 0 with value: 0.675.
[I 2025-03-17 23:42:28,378] Trial 3 finished with value: 0.675 and parameters: {'combination': ('l1', 'liblinear'), 'C': 0.01}. Best is trial 0 with value: 0.675.
[I 2025-03-17 23:42:28,934] Trial 4 finished with value: 0.675 and parameters: {'combination': ('l2', 'lbfgs'), 'C': 0.01}. Best is trial 0 with value: 0.675.
[I 2025-03-17 23:42:29,565] Trial 5 finished with value: 0.675 and parameters: {'combination': ('l2', 'liblinear'), 'C': 0.01}. Best is trial 0 with value: 0.675.
[I 2025-03-17 23:42:30,096] Tri

In [784]:
 best_params= study.best_params

In [786]:
 study.best_value

0.675

In [788]:
model1= LogisticRegression(penalty=best_params['combination'][0], solver=best_params['combination'][1],C=best_params['C'],fit_intercept=False)


### Model Evaluation:

Comparing predictions with actual ratings to assess performance.


In [790]:
# Train the model
model1.fit(x_train, y_train)

# Predict on test data
y_pred = model1.predict(x_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.67


In [796]:
# Predict on new review(s)
def predict_ratings(new_reviews):
    new_reviews = tf.transform(new_reviews)  # Transform new reviews
    new_reviews_s = std.transform(new_reviews)  # Scale new data
    predictions = model1.predict(new_reviews_s)  # Predict ratings
    return predictions

In [809]:
# Demonstration
new_reviews = ["The food was worst", "Service terrible."]
predicted_ratings = predict_ratings(new_reviews)
print(predicted_ratings)

[1 1]


In [846]:
# Demonstration
new_reviews = ["The food was amazing!", "Service was Horrible "]
predicted_ratings = predict_ratings(new_reviews)
print(predicted_ratings)

[5 2]


## Decision Tree

In [860]:
# Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
dt_model.fit(x_train, y_train)

# Predict on test data
y_pred = dt_model.predict(x_test)



In [862]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.71
              precision    recall  f1-score   support

           1       0.50      0.14      0.22         7
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.40      0.33      0.36         6
           5       0.77      0.94      0.85        35

    accuracy                           0.71        51
   macro avg       0.33      0.28      0.29        51
weighted avg       0.64      0.71      0.65        51



In [864]:
# Predict on new review(s)
def predict_rat(new_reviews):
    new_reviews = tf.transform(new_reviews)  # Transform new reviews
    new_reviews_s = std.transform(new_reviews)  # Scale new data
    predictions = dt_model.predict(new_reviews_s)  # Predict ratings
    return predictions

In [858]:
# Demonstration
new_reviews = ["The food was worst", "Service terrible."]
predicted_ratings = predict_rat(new_reviews)
print(predicted_ratings)

[5 5]
