In [26]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import mannwhitneyu

# Import Data

In [10]:
sample_data_frame = pd.read_json('../data/sample_dc_data.json')

100000

# Data Processing

In [3]:
picture_data_frame = pd.DataFrame(sample_data_frame, columns=['rating', 'pics'])
picture_data_frame['pics'] = picture_data_frame['pics'].apply(lambda x: len(x) if x is not None else 0)
picture_data_frame.rename(columns={'rating': 'Rating', 'pics': 'Num Pictures'}, inplace=True)
picture_data_frame.head()

Unnamed: 0,Rating,Num Pictures
0,5,0
1,4,0
2,4,0
3,4,0
4,5,0


In [9]:
# Calculate total count of each rating
rating_counts = picture_data_frame['Rating'].value_counts().sort_index()

# Calculate average number of pictures for each rating
avg_pictures_per_rating = picture_data_frame.groupby('Rating')['Num Pictures'].mean()

summary_data_frame = pd.DataFrame({'Count': rating_counts, 'Avg. Num Pictures': avg_pictures_per_rating}).reset_index().rename(columns={'index': 'Rating'})

In [5]:
# Plot total count
fig1 = px.bar(summary_data_frame, x='Rating', y='Count', title='Total Count of Each Rating')
fig1.update_layout(xaxis_title='Rating', yaxis_title='Total Count', uniformtext_mode='hide')
fig1.show()

In [32]:
# Plot avg. num pictures
fig2 = px.bar(summary_data_frame, x='Rating', y='Avg. Num Pictures', title='Average Number of Pictures per Rating')
fig2.update_layout(xaxis_title='Rating', yaxis_title='Average Number of Pictures', uniformtext_mode='hide')
fig2.update_traces(marker_color='#007AFF')
fig2.write_image('../imgs/avg_pictures_per_rating.png', scale=3)
fig2.show()

In [7]:
summary_data_frame

Unnamed: 0,Rating,Count,Avg. Num Pictures
0,1,2924,0.067031
1,2,3275,0.079695
2,3,11440,0.101661
3,4,27925,0.144172
4,5,54436,0.206132


In [15]:
# Determine the percentage of reviews without pictures
num_rows_with_zero = len(picture_data_frame[picture_data_frame['Num Pictures'] == 0])
total_rows = len(picture_data_frame)
percentage_with_zero = (num_rows_with_zero / total_rows) * 100

print(f'Percentage of reviews without pictures: {percentage_with_zero:.2f}%')

Percentage of reviews without pictures: 94.68%


In [19]:
picture_data_frame['Has Pictures'] = (picture_data_frame['Num Pictures'] > 0)
picture_data_frame.head()

# Calculate the average rating for reviews with pictures
average_with_pictures = picture_data_frame[picture_data_frame['Has Pictures']]['Rating'].mean()

# Calculate the average rating for reviews without pictures
average_without_pictures = picture_data_frame[~picture_data_frame['Has Pictures']]['Rating'].mean()

print(f'Average rating for reviews with pictures: {average_with_pictures:.2f}')
print(f'Average rating for reviews without pictures: {average_without_pictures:.2f}')

Average rating for reviews with pictures: 4.49
Average rating for reviews without pictures: 4.26


In [25]:
# Filter the ratings for reviews with pictures
ratings_with_pictures = picture_data_frame[picture_data_frame['Has Pictures'] == 1]['Rating']

# Filter the ratings for reviews without pictures
ratings_without_pictures = picture_data_frame[picture_data_frame['Has Pictures'] == 0]['Rating']

# Perform a Mann-Whitney U Test
u_statistic, p_value = mannwhitneyu(ratings_with_pictures, ratings_without_pictures)

print(f'U-statistic: {u_statistic}')
print(f'P-value: {p_value}')

# Interpret the results
if p_value < 0.05:
    print('There is a statistically significant difference in the distribution of ratings for reviews with and without pictures.')
else:
    print('There is no statistically significant difference in the distribution of ratings for reviews with and without pictures.')

U-statistic: 283019634.5
P-value: 1.867421394562336e-64
There is a statistically significant difference in the distribution of ratings for reviews with and without pictures.


In [30]:
def cliffs_delta(x, y):
    n_x = len(x)
    n_y = len(y)
    n_total = n_x * n_y
    x_bigger_than_y = np.sum(np.greater.outer(x, y))
    y_bigger_than_x = np.sum(np.less.outer(x, y))
    return (x_bigger_than_y - y_bigger_than_x) / n_total

def interpret_cliffs_delta(delta):
    abs_delta = abs(delta)
    if abs_delta < 0.147:
        return 'small'
    elif abs_delta < 0.33:
        return 'medium'
    else:
        return 'large'

def cliffs_delta_subset(data1, data2):
    # Ensure data1 is the smaller group
    if len(data2) < len(data1):
        data1, data2 = data2, data1
    
    # Randomly select a subset from data2 equal in size to data1
    subset_data2 = np.random.choice(data2, size=len(data1), replace=False)
    
    # Compute Cliff's Delta for the subset and data1
    delta = cliffs_delta(data1, subset_data2)
    return delta

# Assuming ratings_with_pictures and ratings_without_pictures are defined
subset_ratings_with_pictures = np.random.choice(ratings_with_pictures, size=10000, replace=True)
subset_ratings_without_pictures = np.random.choice(ratings_without_pictures, size=10000, replace=True)

# Compute Cliff's Delta using subset
delta = cliffs_delta_subset(subset_ratings_with_pictures, subset_ratings_without_pictures)

print(f'Cliff\'s Delta: {delta:.3f}')

# Interpret the magnitude of the difference
effect_size = interpret_cliffs_delta(delta)
direction = 'larger' if delta > 0 else 'smaller'

print(f'The effect size is {effect_size}, and the ratings with pictures are generally {direction} than those without pictures.')


Cliff's Delta: 0.115
The effect size is small, and the ratings with pictures are generally larger than those without pictures.
