In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import chi2_contingency


# Load the data


In [None]:
# Read the raw product records with the json module. The file is opened
# explicitly as UTF-8 so decoding does not depend on the platform's default
# locale encoding.
with open('products.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the parsed JSON into a DataFrame and preview the first row
df = pd.json_normalize(data)
print(df.head(1))

# Clean the data


In [None]:
# Report how many null entries each column contains
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Parse 'created_date' as a count of seconds and store it back as datetime
created_at = pd.to_datetime(df['created_date'], unit='s')
df['created_date'] = created_at


# Average rating is incorrect
The stored `average_rating` value doesn't match the mean of the ratings in each product's reviews, so it is recomputed from the reviews below.

In [None]:
# Show the stored average rating next to each product id
stored_ratings = df[['product_id', 'average_rating']]
print(stored_ratings)

In [None]:
# Flatten every review into its own row, carrying the parent product_id along
reviews_df = pd.json_normalize(data, record_path=['reviews'], meta=['product_id'])

# Mean review rating per product. The column is renamed explicitly because
# merge() only applies suffixes to column names present in BOTH frames:
# relying on a '_calculated' suffix raises KeyError whenever df has no
# 'rating' column of its own.
average_ratings = (
    reviews_df.groupby('product_id')['rating']
    .mean()
    .reset_index()
    .rename(columns={'rating': 'rating_calculated'})
)

# Overwrite the stored 'average_rating' with the recomputed value, then drop
# the helper column.
# NOTE: merge() defaults to an inner join, so products that have no reviews
# are not present in df after this cell.
df = (
    df.merge(average_ratings, on='product_id')
    .assign(average_rating=lambda merged: merged['rating_calculated'])
    .drop(columns=['rating_calculated'])
)

print(df[['product_id', 'average_rating']])

# Exploratory Data Analysis
- Basic analysis
- What items have the most reviews?
- Correlation between available color and ratings?


In [None]:
# Basic statistical summary of the DataFrame
print(df.describe())

# Data types and non-null counts. DataFrame.info() writes its report straight
# to stdout and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

In [None]:
# Best score present in the data, and every product that reaches it
highest_rating = df['average_rating'].max()
is_top_score = df['average_rating'] == highest_rating
highest_rated_products = df[is_top_score]

# Rank all products by rating (best first) and keep the leading rows
top_n = 5
summary_columns = ['product_id', 'average_rating', 'type', 'material', 'color', 'price']
ranked_by_rating = df.sort_values(by='average_rating', ascending=False)
top_highest_rated_products = ranked_by_rating.head(top_n)

print(f"\nTop {top_n} Highest-Rated Products:")
print(top_highest_rated_products[summary_columns])

In [None]:
# Rebuild the per-review table and attach each product's color to its reviews
reviews_df = pd.json_normalize(data, record_path=['reviews'], meta=['product_id'])
product_color_df = df[['product_id', 'color']]
reviews_with_color = reviews_df.merge(product_color_df, on='product_id')


def _rating_bucket(score):
    """Return 'high' for a rating above 3, otherwise 'low'."""
    if score > 3:
        return 'high'
    return 'low'


# Bucket every review so the test works on two rating categories
reviews_with_color['rating_category'] = reviews_with_color['rating'].apply(_rating_bucket)

# Counts of reviews per (color, rating category) pair
contingency_table = pd.crosstab(
    reviews_with_color['color'],
    reviews_with_color['rating_category'],
)

# Chi-squared test on the contingency table
test_result = chi2_contingency(contingency_table)
chi2, p, dof, ex = test_result

print("Contingency Table:")
print(contingency_table)
print("\nChi-squared Test Results:")
print(f"Chi-squared: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print(f"Expected Frequencies Table: \n{ex}")

# Interpretation: report significance at the 0.05 level
verdict = (
    "\nThere is a significant correlation between available color and ratings."
    if p < 0.05
    else "\nThere is no significant correlation between available color and ratings."
)
print(verdict)

# Visualize the data

In [None]:
# Average rating per material, with one bar per product type inside each
# material group. The x-axis carries 'material' and the hue carries 'type',
# so the title, axis label and legend title below name them accordingly.
# NOTE(review): recent seaborn releases appear to deprecate `ci` in favour of
# `errorbar` — confirm against the installed version before changing it.
plt.figure(figsize=(12, 6))
sns.barplot(x='material', y='average_rating', data=df, palette='viridis', hue='type', ci=None)
plt.title('Average Rating by Material and Product Type')
plt.xlabel('Material')
plt.ylabel('Average Rating')
plt.xticks(rotation=45)
plt.legend(title='Product Type')
plt.show()


In [None]:
# Scatter of price against average rating, one color per material
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='price',
    y='average_rating',
    hue='material',
    palette='pastel',
    s=100,
    ax=ax,
)
ax.set_title('Price vs. Average Rating')
ax.set_xlabel('Price')
ax.set_ylabel('Average Rating')
ax.legend(title='Material')
plt.show()


# Create a Model
- Which type, material, and color would likely get the best reviews?

In [None]:

# Binary label per product: 1 when the average rating is 4 or above, else 0
df['high_rating'] = df['average_rating'].ge(4).astype('int64')

# Categorical inputs used by the model
feature_columns = ['material', 'color', 'type']
features = df[feature_columns]

# Learn the category vocabulary, then expand each column into 0/1 indicators
encoder = OneHotEncoder()
encoder.fit(features)
encoded_features = encoder.transform(features).toarray()

# Target vector for training and evaluation
target = df['high_rating']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs
split = train_test_split(
    encoded_features,
    target,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

# Fit a logistic regression classifier, allowing up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Score the held-out rows with the fitted model
y_pred = model.predict(X_test)

# Summarise performance: overall accuracy plus the per-class report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

In [None]:
# Distinct values present in the 'color' and 'material' columns
colors = df['color'].unique()
materials = df['material'].unique()

# Print each list under its heading, one bullet per value
for heading, values in (
    ("Possible Colors:", colors),
    ("\nPossible Materials:", materials),
):
    print(heading)
    for value in values:
        print(f"- {value}")

In [None]:
# Example: a single hypothetical product described by the three model inputs
new_product = pd.DataFrame(
    [{'material': 'Silk', 'color': 'Yellow', 'type': 'Cat Cave'}]
)

# Encode it with the encoder that was fitted on the product data
encoded_new_product = encoder.transform(new_product).toarray()

# Predicted class and the per-class probabilities for this product
prediction = model.predict(encoded_new_product)
prediction_proba = model.predict_proba(encoded_new_product)

predicted_label = prediction[0]
high_rating_probability = prediction_proba[0][1]  # second column, reported as class 1

print(f'Prediction (1 = High Rating, 0 = Low Rating): {predicted_label}')
print(f'Probability of High Rating: {high_rating_probability:.2f}')