In [None]:
from pymongo import MongoClient
import pandas as pd

# Connect to the local MongoDB instance and select the scratch collection.
client = MongoClient('localhost', 27017)
db = client['test']
collection = db['test']

# Insert the scraped records. `scraped_data` must already be defined (it is
# not created in this cell); only a non-empty list is inserted.
# NOTE(review): PyMongo's insert_many reportedly adds an `_id` key to each
# inserted dict in place, so re-running this cell with the same objects may
# fail with duplicate-key errors -- confirm before re-running.
if isinstance(scraped_data, list) and scraped_data:
    result = collection.insert_many(scraped_data)
else:
    print("scraped_data is not a list or is empty.")

# Fetch every document whose cuisine is "Asian".
data = list(collection.find({"cuisine": "Asian"}))

# Create DataFrame
df = pd.DataFrame(data)

# The statistics below need these columns. When the query matches nothing
# (or no matching document carries a field), the DataFrame lacks the column
# and indexing it would raise a bare KeyError, so check up front.
required_columns = ['cuisine', 'num_ratings', 'rating']
missing_columns = [name for name in required_columns if name not in df.columns]

if missing_columns:
    print("Cannot compute statistics; missing columns:", missing_columns)
    statistics = pd.DataFrame()
else:
    # Convert ratings and number of ratings to numeric values; anything that
    # cannot be parsed becomes NaN (errors='coerce').
    df['num_ratings'] = pd.to_numeric(df['num_ratings'], errors='coerce')
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

    # Per-cuisine statistics. 'quantile' with no argument is the median, and
    # the single lambda in each list is labelled '<lambda_0>' by pandas; both
    # labels are renamed to readable percentile names.
    statistics = df.groupby('cuisine').agg({
        'num_ratings': ['count', 'quantile', lambda x: x.quantile(0.75)],
        'rating': ['mean', 'quantile', lambda x: x.quantile(0.75)]
    }).rename(columns={'<lambda_0>': '75th percentile', 'quantile': '50th percentile'})

    # Output results
    print(statistics)


In [None]:

# Data-quality report for the Grubhub restaurant URL collection: loads the
# collection into a DataFrame and computes distribution and missing-data
# summaries, which are displayed by the final expression of the cell.

# Connect to MongoDB and access the database and collection
client = MongoClient('mongodb://localhost:27017/')
db = client['tdr_grubhub']
collection = db['grubhub_restaurant_urls']

# Export data from MongoDB to a Pandas DataFrame
df = pd.DataFrame(list(collection.find()))

# Pandas Functions for Cleaning and Pre-processing Data
# 1. df.dropna() - Remove missing values.
# 2. df.fillna() - Fill missing values.
# 3. df.astype() - Cast a pandas object to a specified dtype.
# 4. df.replace() - Replace values given in 'to_replace' with 'value'.
# 5. pd.to_datetime() - Convert argument to datetime.
# 6. df.drop_duplicates() - Remove duplicate rows.

# Clean and preprocess the data
# Convert 'impression_rank' and 'rating' to numeric; errors='coerce' turns
# non-convertible values into NaN.
df['impression_rank'] = pd.to_numeric(df['impression_rank'], errors='coerce')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# (0) Distribution of Impression Rank
impression_rank_percentiles = df['impression_rank'].dropna().quantile([0.25, 0.5, 0.75])

# (1) Distribution of Ratings
ratings_percentiles = df['rating'].dropna().quantile([0.25, 0.5, 0.75])

# (2) Total number of rows with scrape attempted is true vs. false.
# The final display references this name, so it must always be defined; fall
# back to an empty Series when the collection has no 'scrape_attempted' field.
# If the field is stored as strings rather than booleans, the index of the
# result will hold those strings (e.g. "True") instead of True/False.
if 'scrape_attempted' in df.columns:
    scrape_attempted_counts = df['scrape_attempted'].value_counts()
else:
    scrape_attempted_counts = pd.Series(dtype='int64')


def _count_not_found(column):
    """Return how many string values in *column* contain 'not found' (any case)."""
    is_text_column = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not is_text_column:
        return 0
    # Object columns can hold non-string values (ids, booleans, NaN); keep
    # only real strings so the .str accessor never sees anything else.
    text_values = column[column.map(lambda value: isinstance(value, str))]
    return text_values.str.contains('not found', case=False, na=False).sum()


# (3) Number of instances with "not found" in each column
not_found_counts = df.apply(_count_not_found).sort_values(ascending=False)

# (4) Number of instances with no value in each column
no_value_counts = df.isna().sum().sort_values(ascending=False)

# (5) Number of malformed rows (more than 3 fields missing)
missing_per_row = df.isna().sum(axis=1)
malformed_rows_count = (missing_per_row > 3).sum()

# (6) Number of rows in each category
category_counts = df['category'].value_counts()

# (7) Number of instances where the URL does not say Grubhub.
# Missing or non-string URLs cannot mention Grubhub, so they are counted too
# instead of crashing on `.lower()`.
non_grubhub_urls_count = df['url'].map(
    lambda url: not (isinstance(url, str) and 'grubhub' in url.lower())
).sum()

# Display the results
impression_rank_percentiles, ratings_percentiles, scrape_attempted_counts, not_found_counts, no_value_counts, malformed_rows_count, category_counts, non_grubhub_urls_count
