In [91]:
import matplotlib.pyplot as plt

# Global plot styling for every figure in this notebook.
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in Matplotlib 3.6
# and the old names were removed afterwards, so prefer the new name when it exists and
# fall back to the legacy one on older installations.
_pastel_style = (
    'seaborn-v0_8-pastel'
    if 'seaborn-v0_8-pastel' in plt.style.available
    else 'seaborn-pastel'
)
plt.style.use(_pastel_style)

plt.rcParams.update({
    # Figure size in inches and resolution used for on-screen and saved figures.
    'figure.figsize': (8, 5),
    'figure.dpi': 500,
    # Force all text, labels and ticks to black regardless of the style sheet.
    'text.color': 'black',
    'axes.labelcolor': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
    # Title and label spacing / weight.
    'axes.titlepad': 10,
    'axes.titleweight': 1000,
    'axes.labelpad': 5,
    'font.family': 'serif',
    'axes.facecolor': 'white',
})

In [92]:
import pandas as pd
import numpy as np
from db_utils import QueryAll

# Column names assigned, in this order, to the rows returned by QueryAll.
cols = [
    "id", "title", "globalId", "categoryId", "categoryName", "url", "location",
    "shippingType", "shippingLocations", "shippingTime", "startTime", "endTime",
    "returnsAccepted", "conditionId", "listingIsTopRated", "sellerFeedbackScore",
    "sellerPositivePercent", "sellerName", "sellerIsTopRated", "price", "currency",
    "bids",
]


def _load_listings(table_name):
    """Build a DataFrame from QueryAll(table_name) and keep rows with bids >= 0.

    NOTE(review): negative bid counts are presumably a placeholder for
    "not an auction" / missing data -- confirm against db_utils.
    """
    listings = pd.DataFrame(QueryAll(table_name), columns=cols)
    has_valid_bids = listings.bids >= 0
    return listings[has_valid_bids]


uk_clocks = _load_listings('clock_uk')
uk_books = _load_listings('book_uk')
us_clocks = _load_listings('clock_us')
us_books = _load_listings('book_us')

In [93]:
import matplotlib.ticker as mticker

def _feedback_per_seller(listings):
    """Return sellerFeedbackScore taken from the first listing of each sellerName."""
    one_row_per_seller = listings.drop_duplicates(subset=['sellerName'])
    return one_row_per_seller['sellerFeedbackScore']


# One feedback score per distinct seller, for each market/category pair.
X0 = _feedback_per_seller(uk_clocks)
X1 = _feedback_per_seller(uk_books)
X2 = _feedback_per_seller(us_clocks)
X3 = _feedback_per_seller(us_books)

In [94]:
fig, (axs1, axs2) = plt.subplots(2,2)

# (axes, per-seller feedback scores, panel title) -- UK on the top row, US on the bottom.
feedback_panels = [
    (axs1[0], X0, 'Seller feedback distribution (UK clocks)'),
    (axs1[1], X1, 'Seller feedback distribution (UK books)'),
    (axs2[0], X2, 'Seller feedback distribution (US clocks)'),
    (axs2[1], X3, 'Seller feedback distribution (US books)'),
]

for panel, scores, heading in feedback_panels:
    # Histogram of log(score + 1); the +1 keeps zero scores finite.
    panel.hist(np.log(scores+1), bins=25, edgecolor='black')
    # Tick labels undo the transform so the axis reads in raw feedback units.
    panel.xaxis.set_major_formatter(
        mticker.FuncFormatter(lambda x, _: round(np.exp(x))-1)
    )
    panel.set_xlim(0,11)
    panel.set_xlabel('Net positive feedback (log scale)')
    panel.set_ylabel('Number of sellers')
    panel.set_title(heading)

plt.tight_layout()
fig.savefig('feedback1.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [95]:
fig, ax = plt.subplots()

# Same log(score + 1) transform as the histograms, one box per data set.
log_feedback = [np.log(scores+1) for scores in (X0, X1, X2, X3)]
ax.boxplot(
    log_feedback,
    whis=(5, 95),                     # whiskers at the 5th and 95th percentiles
    patch_artist=True,                # required so the boxes can be filled
    flierprops={'markersize':3},
    boxprops={'facecolor':(146/255,198/255,1)},
    medianprops={'color':'black'},
)
# Show raw feedback values on the axis instead of their logarithms.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: round(np.exp(x))-1))
ax.set_xticklabels(['UK Clocks', 'UK Books', 'US Clocks', 'US Books'])
ax.set_ylabel('Seller feedback score (log scale)')
ax.set_title('Seller feedback distributions')

plt.tight_layout()
fig.savefig('feedback2.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [96]:
# Hour of day (0-23) at which each listing ended, read from characters 10-12 of endTime.
# NOTE(review): this assumes endTime is a string shaped like 'YYYY-MM-DD HH:MM:SS' and is
# stored in GMT/UTC (the UK panels label the raw hour as GMT) -- confirm against the DB.
ET0 = uk_clocks['endTime'].str.slice(10,13).astype(int)
ET1 = uk_books['endTime'].str.slice(10,13).astype(int)
ET2 = us_clocks['endTime'].str.slice(10,13).astype(int)
ET3 = us_books['endTime'].str.slice(10,13).astype(int)

# PST is UTC-8. Daylight saving time (PDT, UTC-7) is deliberately ignored here.
pst_offset_hours = 8

hour_ticks = [0,4,8,12,16,20,24]
hour_tick_labels = ['12AM', '4AM', '8AM', '12PM', '4PM', '8PM', '12AM']

fig, (axs1, axs2) = plt.subplots(2,2)

# (axes, local hour of day, x-axis label, title).
# The US hours are shifted to PST and wrapped back into 0-23 with "% 24". Previously the
# data was shifted by -8 but so were the bins and ticks, which cancelled out and showed
# the unconverted GMT hours under a "PST" label.
end_time_panels = [
    (axs1[0], ET0, 'Time (GMT)', 'Distribution of end times (UK Clocks)'),
    (axs1[1], ET1, 'Time (GMT)', 'Distribution of end times (UK Books)'),
    (axs2[0], (ET2 - pst_offset_hours) % 24, 'Time (PST)', 'Distribution of end times (US Clocks)'),
    (axs2[1], (ET3 - pst_offset_hours) % 24, 'Time (PST)', 'Distribution of end times (US Books)'),
]

for panel, local_hours, time_label, heading in end_time_panels:
    # One bin per hour: edges 0, 1, ..., 24.
    panel.hist(local_hours, bins=range(25), edgecolor='black', color='orange')
    panel.set_xticks(hour_ticks)
    panel.set_xticklabels(hour_tick_labels)
    panel.set_xlim(0,24)
    panel.set_xlabel(time_label)
    panel.set_ylabel('Number of listings')
    panel.set_title(heading)

plt.tight_layout()
fig.savefig('endtime.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [97]:
# Listings per conditionId for each data set; -1 is excluded
# (NOTE(review): -1 presumably marks a missing condition -- confirm).
CI0 = dict(uk_clocks[uk_clocks.conditionId != -1].conditionId.value_counts())
CI1 = dict(us_clocks[us_clocks.conditionId != -1].conditionId.value_counts())
CI2 = dict(uk_books[uk_books.conditionId != -1].conditionId.value_counts())
CI3 = dict(us_books[us_books.conditionId != -1].conditionId.value_counts())

# Every condition ID that occurs in at least one data set, in ascending order.
labels = sorted({int(l) for counts in (CI0, CI1, CI2, CI3) for l in counts})
C0 = [CI0.get(label,0) for label in labels]
C1 = [CI1.get(label,0) for label in labels]
C2 = [CI2.get(label,0) for label in labels]
C3 = [CI3.get(label,0) for label in labels]
legend = ['UK Clocks', 'US Clocks','UK Books', 'US Books']
colors = ['blue', 'green', 'lightblue', 'lightgreen']

# Display names keyed by condition ID, so tick labels follow the data rather than a
# hard-coded position. IDs without an entry fall back to the numeric ID.
# NOTE(review): mapping assumes eBay's standard condition IDs -- confirm.
condition_names = {
    1000: 'New',
    1500: 'New \n others',
    2500: 'Seller \n refurbished',
    2750: 'Like \n new',
    3000: 'Used',
    4000: 'Very \n good',
    5000: 'Good',
    6000: 'Acceptable',
    7000: 'Parts \n not working',
}

fig, ax = plt.subplots()

# One group of four 0.2-wide bars per condition, centred on the integer group position.
# Using len(labels) instead of a fixed 9 avoids a shape mismatch in ax.bar when the data
# contains more or fewer distinct conditions.
group_positions = np.arange(len(labels))
for i, value in enumerate([C0,C1,C2,C3]):
    position = group_positions - 0.3 + i*0.2
    ax.bar(position, value, width=0.2, label=legend[i], color=colors[i])


ax.xaxis.set_ticks(group_positions)
ax.set_xticklabels([condition_names.get(label, str(label)) for label in labels])
ax.xaxis.set_tick_params(labelsize=8)
ax.legend()
ax.set_xlabel('Listing condition')
ax.set_ylabel('Number of listings')
ax.set_title('Distribution of listing conditions')

plt.tight_layout()
fig.savefig('condition.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [98]:
from wordcloud import WordCloud, STOPWORDS 

# All clock listing titles from both markets, joined into one text for the word cloud.
uk_words = ' '.join(map(str, uk_clocks["title"]))
us_words = ' '.join(map(str, us_clocks["title"]))

# Build a new stop-word set instead of calling STOPWORDS.update(...): update() returns
# None and mutates the library-wide default, which would leak these words into every
# later word cloud in the kernel.
stop_words = STOPWORDS | {"clock", "wall"}
wordcloud = WordCloud(width=1600, height=800, stopwords=stop_words).generate(' '.join([uk_words, us_words]))

wordcloud.to_file('cloud-clock.png')

<wordcloud.wordcloud.WordCloud at 0x7fc6841ccd00>

In [99]:
# All book listing titles from both markets, joined into one text for the word cloud.
uk_words = ' '.join(map(str, uk_books["title"]))
us_words = ' '.join(map(str, us_books["title"]))

# Build a new stop-word set instead of calling STOPWORDS.update(...): update() returns
# None and permanently mutates the library-wide default set.
stop_words = STOPWORDS | {"harry", "potter", "book", "J", "rowling", "books"}
wordcloud = WordCloud(width=1600, height=800, stopwords=stop_words).generate(' '.join([uk_words, us_words]))

wordcloud.to_file('cloud-book.png')

<wordcloud.wordcloud.WordCloud at 0x7fc6829fd910>

In [127]:
# Number of listings for each observed bid count, per data set.
B0 = dict(uk_clocks.bids.value_counts())
B1 = dict(uk_books.bids.value_counts())
B2 = dict(us_clocks.bids.value_counts())
B3 = dict(us_books.bids.value_counts())

fig, (axs1, axs2) = plt.subplots(2,2)

# (axes, bid-count -> number of listings, panel title)
bid_panels = [
    (axs1[0], B0, 'Bid distribution (UK clocks)'),
    (axs1[1], B1, 'Bid distribution (UK books)'),
    (axs2[0], B2, 'Bid distribution (US clocks)'),
    (axs2[1], B3, 'Bid distribution (US books)'),
]

# Shared x-range so the panels stay comparable: at least the -1..46 window, widened
# when any data set contains a larger bid count so that no bar is clipped.
largest_bid_count = max(
    (int(max(counts)) for _panel, counts, _heading in bid_panels if counts),
    default=0,
)
x_upper = max(46, largest_bid_count + 1)

for panel, counts, heading in bid_panels:
    # range(...) must run to max + 1, otherwise the bar for the highest bid count is
    # silently dropped; default=0 keeps an empty data set from raising.
    bid_values = list(range(int(max(counts, default=0)) + 1))
    # Bars show log(listings + 1) so bid counts with zero listings stay finite.
    log_listing_counts = np.log([counts.get(b, 0) + 1 for b in bid_values])
    panel.bar(bid_values, log_listing_counts, color='green')
    panel.set_xlabel('Number of bids')
    panel.set_ylabel('Number of listings \n (log scale)')
    panel.set_title(heading)
    # Tick labels undo the log(x + 1) transform.
    panel.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: round(np.exp(x))-1))
    panel.set_xlim(-1, x_upper)

plt.tight_layout()
fig.savefig('bid.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [101]:
# Listings read from CSV files (kept separate from the database-backed frames via the
# trailing underscore).
uk_clocks_, uk_books_, us_clocks_, us_books_ = [
    pd.read_csv(csv_path)
    for csv_path in ('uk_clocks.csv', 'uk_books.csv', 'us_clocks.csv', 'us_books.csv')
]

# Price column of each CSV-backed frame.
P0, P1, P2, P3 = [
    listings.price
    for listings in (uk_clocks_, uk_books_, us_clocks_, us_books_)
]

In [120]:
fig, (axs1, axs2) = plt.subplots(2,2)

# (axes, price series, panel title) -- UK on the top row, US on the bottom.
price_panels = [
    (axs1[0], P0, 'Price distribution (UK clocks)'),
    (axs1[1], P1, 'Price distribution (UK books)'),
    (axs2[0], P2, 'Price distribution (US clocks)'),
    (axs2[1], P3, 'Price distribution (US books)'),
]

for panel, prices, heading in price_panels:
    # Histogram of log(price + 1); the +1 keeps a price of zero finite.
    panel.hist(np.log(prices+1), bins=25, edgecolor='black', color='coral')
    # Tick labels undo the transform so the axis reads in raw price units.
    panel.xaxis.set_major_formatter(
        mticker.FuncFormatter(lambda x, _: round(np.exp(x))-1)
    )
    panel.set_xlim(0,6.5)
    panel.set_xlabel('Price of sold listing (log scale)')
    panel.set_ylabel('Number of listings')
    panel.set_title(heading)

plt.tight_layout()
fig.savefig('price.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [135]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
# of the UK clock listings.
uk_clocks.describe()

Unnamed: 0,id,categoryId,shippingTime,returnsAccepted,conditionId,listingIsTopRated,sellerFeedbackScore,sellerPositivePercent,sellerIsTopRated,bids
count,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0
mean,246413400000.0,112733.061288,-1.0,-1.0,2495.018352,0.036704,3066.76108,98.444252,0.036704,0.760734
std,81214120000.0,113193.449713,0.0,0.0,1440.834771,0.188066,18552.584533,9.407591,0.188066,2.939858
min,115582300000.0,12.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0
25%,175479400000.0,20561.0,-1.0,-1.0,1000.0,0.0,316.0,99.5,0.0,0.0
50%,255802300000.0,20561.0,-1.0,-1.0,3000.0,0.0,754.0,100.0,0.0,0.0
75%,314206700000.0,261608.0,-1.0,-1.0,3000.0,0.0,1634.25,100.0,0.0,0.0
max,403980400000.0,262408.0,-1.0,-1.0,7000.0,1.0,470260.0,100.0,1.0,39.0


In [137]:
# Fraction of UK clock listings that received no bids.
int((uk_clocks.bids == 0).sum()) / len(uk_clocks)

0.8469529085872576

In [140]:
# Fraction of UK book listings that received no bids.
int((uk_books.bids == 0).sum()) / len(uk_books)

0.8673163418290855

In [139]:
# Fraction of US clock listings that received no bids.
int((us_clocks.bids == 0).sum()) / len(us_clocks)

0.8153787590407309

In [138]:
# Fraction of US book listings that received no bids.
int((us_books.bids == 0).sum()) / len(us_books)

0.880690737833595