<a href="https://colab.research.google.com/github/mubarak6969/bussinessWebsite/blob/main/EcoCheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
pip install pandas numpy scikit-learn tensorflow flask simplejson


Collecting simplejson
  Downloading simplejson-3.19.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Downloading simplejson-3.19.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.9/137.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simplejson
Successfully installed simplejson-3.19.2


In [5]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Model, Input

# Load the business listings data
file_path = 'yelp_academic_dataset_business.json'

# Read the JSON file line by line
data = []
with open(file_path, 'r') as file:
    for line in file:
        try:
            record = json.loads(line)
            data.append(record)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON at line: {e}")
            continue

# Convert to DataFrame
data = pd.DataFrame(data)
print(data.head())

# Check for empty data
if data.empty:
    raise ValueError("The dataset is empty. Please check the file content.")


Error decoding JSON at line: Unterminated string starting at: line 1 column 601 (char 600)
              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0    

In [6]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and strip
    return text

# Apply preprocessing to relevant columns
data['name'] = data['name'].apply(preprocess_text)
data['address'] = data['address'].apply(preprocess_text)
data['city'] = data['city'].apply(preprocess_text)
data['categories'] = data['categories'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else '')

# Combine relevant fields into a single string for comparison
data['combined'] = data['name'] + ' ' + data['address'] + ' ' + data['city'] + ' ' + data['categories']

# Display the preprocessed data
print(data[['combined']].head())


                                            combined
0  abby rappoport lac cmq 1616 chapala st ste 2 s...
1  the ups store 87 grasso plaza shopping center ...
2  target 5255 e broadway blvd tucson department ...
3  st honore pastries 935 race st philadelphia re...
4  perkiomen valley brewery 101 walnut st green l...


In [8]:
# Create pairs of business listings
def create_pairs(data):
    pairs = []
    labels = []
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            pair = [data['combined'].iloc[i], data['combined'].iloc[j]]
            # Label as 1 if they are duplicates, 0 otherwise (for this example, we assume random pairing)
            label = 1 if data['business_id'].iloc[i] == data['business_id'].iloc[j] else 0
            pairs.append(pair)
            labels.append(label)
    return np.array(pairs), np.array(labels)

# Create pairs and labels
pairs, labels = create_pairs(data)

# Split the data into training and test sets
pairs_train, pairs_test, labels_train, labels_test = train_test_split(pairs, labels, test_size=0.2, random_state=42)

print(f"Number of training pairs: {len(pairs_train)}")
print(f"Number of test pairs: {len(pairs_test)}")


KeyboardInterrupt: 

In [43]:
def create_sample_pairs(data, n_samples=1000):
    pairs = []
    labels = []
    for _ in range(n_samples):
        idx_a, idx_b = np.random.choice(len(data), 2, replace=False)
        pair = [data['combined'].iloc[idx_a], data['combined'].iloc[idx_b]]
        label = 1 if data['business_id'].iloc[idx_a] == data['business_id'].iloc[idx_b] else 0
        if label == 0 and np.random.rand() > 0.3:  # Lower the threshold for skipping non-duplicate pairs
            continue
        pairs.append(pair)
        labels.append(label)
    return np.array(pairs), np.array(labels)



pairs, labels = create_sample_pairs(data)

# Split the data into training and test sets
pairs_train, pairs_test, labels_train, labels_test = train_test_split(pairs, labels, test_size=0.2, random_state=42)



In [52]:
# Define a simple base network
from tensorflow.keras.regularizers import l2

def create_base_network(input_shape):
    input = Input(shape=input_shape)
    x = layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01))(input)  # Add L2 regularization
    x = layers.Dropout(0.7)(x)  # Increased dropout rate
    x = layers.Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)  # Add L2 regularization
    x = layers.Dropout(0.7)(x)  # Increased dropout rate
    return Model(input, x)


input_shape = (300,)
base_network = create_base_network(input_shape)

input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = layers.Lambda(lambda x: tf.keras.backend.abs(x[0] - x[1]))([processed_a, processed_b])
output = layers.Dense(1, activation='sigmoid')(distance)

model = Model([input_a, input_b], output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert the text pairs to numeric form using TF-IDF
vectorizer = TfidfVectorizer(max_features=300)
X_train_a = vectorizer.fit_transform(pairs_train[:, 0]).toarray()
X_train_b = vectorizer.transform(pairs_train[:, 1]).toarray()
X_test_a = vectorizer.transform(pairs_test[:, 0]).toarray()
X_test_b = vectorizer.transform(pairs_test[:, 1]).toarray()

# Train the model
history = model.fit([X_train_a, X_train_b], labels_train,
                    validation_data=([X_test_a, X_test_b], labels_test),
                    batch_size=64, epochs=6)  # Reduced number of epochs




Epoch 1/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 82ms/step - accuracy: 0.8348 - loss: 2.0672 - val_accuracy: 1.0000 - val_loss: 2.0427
Epoch 2/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9037 - loss: 1.9474 - val_accuracy: 1.0000 - val_loss: 1.9428
Epoch 3/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9344 - loss: 1.8388 - val_accuracy: 1.0000 - val_loss: 1.8510
Epoch 4/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9657 - loss: 1.7030 - val_accuracy: 1.0000 - val_loss: 1.7670
Epoch 5/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9931 - loss: 1.5960 - val_accuracy: 1.0000 - val_loss: 1.6897
Epoch 6/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9957 - loss: 1.4763 - val_accuracy: 1.0000 - val_loss: 1.6182


In [48]:

# Evaluate the model
loss, accuracy = model.evaluate([X_test_a, X_test_b], labels_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 1.6077 
Test Accuracy: 100.00%


In [53]:
import pandas as pd

# Load the dataset
data = pd.read_json('yelp_academic_dataset_business.json', lines=True)

# Combine relevant columns for text-based comparison
data['combined'] = data['name'] + ' ' + data['address'] + ' ' + data['categories'].fillna('') + ' ' + data['city']


In [54]:
# Example: Select two businesses by their index or other unique identifier
business_a = data.iloc[10]  # 10th business in the dataset
business_b = data.iloc[20]  # 20th business in the dataset

# Extract the combined text for each business
input_a = business_a['combined']
input_b = business_b['combined']

print(f"Business A: {input_a}")
print(f"Business B: {input_b}")


Business A: Marshalls 21705 Village Lakes Sc Dr Department Stores, Shopping, Fashion Land O' Lakes
Business B: Roast Coffeehouse and Wine Bar 10359 104 Street NW Coffee & Tea, Food, Cafes, Bars, Wine Bars, Restaurants, Nightlife Edmonton


In [55]:
# Transform the inputs into TF-IDF vectors using the vectorizer fitted during training
input_a_vec = vectorizer.transform([input_a]).toarray()
input_b_vec = vectorizer.transform([input_b]).toarray()


In [56]:
# Make a prediction using the trained model
prediction = model.predict([input_a_vec, input_b_vec])

# Convert the prediction to a binary label
predicted_label = 1 if prediction >= 0.5 else 0

# Output the result
if predicted_label == 1:
    print("The businesses are likely duplicates.")
else:
    print("The businesses are not duplicates.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step
The businesses are not duplicates.
