In [43]:
# Data Downloaded From: https://www.kaggle.com/datasets/mathurinache/1000000-bandcamp-sales

# Importing Numerical Packages
import pandas as pd
import numpy as np


# Pandas Settings
pd.set_option('display.max_row', None)
pd.set_option('display.max_column', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Importing Visualization Packages
import matplotlib.pyplot as plt 
import seaborn as sns 

# For Handling Times
import pytz

# Importing CSV
df = pd.read_csv(r'C:\Users\nickb\Documents\SeattleU\Fall 2022\DataVisualization_BUAN5210\Final Project\1000000-bandcamp-sales.csv')

# Replacing Item Names (can do this as Aliasing in Tableau too)
replace_item_type = {'a': 'Digital Album',
'p': 'Physical Item',
't': 'Digital Track'}
df = df.replace({"item_type": replace_item_type})

# Renamed for Clarity
df.rename(columns = {'country':'Buyer Country',
                     'releases':'Total Artist Releases'}, inplace = True)

# Dropping Duplicate or Irrelevant Columns for Analysis
df.drop('track_album_slug_text', axis=1, inplace=True)
df.drop('country_code', axis=1, inplace=True)
df.drop(['art_id'], axis=1, inplace=True)
df.drop(['_id', 'art_url'], axis=1, inplace=True)
df.drop(['url'], axis=1, inplace=True)

# Dropping Irrelevant Columns or Columns With Too Many Missing Values
df.drop(['package_image_id', 'Total Artist Releases','item_slug'], axis=1, inplace=True)

# Replacing Item Names (can do this as Aliasing in Tableau too)
df['slug_type'].replace('t', 'Digital Track', inplace=True)
df['slug_type'].replace('a', 'Digital & Physical Albums', inplace=True)
df['slug_type'].replace('p', 'Physical Merchandise', inplace=True)

# This is how I found out that b is equal to 'Full Digital Discography'. Same Descriptions more or less. Uncomment to run if you're curious.
# full_dig = df.loc[(df['item_type'] == 'b')]
# full_dig['item_description'].value_counts()

# Imputting Value for 'Full Digital Discography'
df.loc[(df['item_type'] == 'b'), 'item_type'] = 'Full Digital Discography'

# Imputting Value for Physical Album
df.loc[(df['item_type'] != 'Digital Album') & (df['slug_type'] == 'Digital & Physical Albums'), 'item_type'] = 'Physical Album'

# Creating (Hopefully) Clearer Feature Names
df.rename(columns={'amount_paid_fmt': 'Currency Symbol + Paid (In Seller Currency)',
                   'item_price': 'Price (Seller Currency)',
                   'amount_paid': 'Paid (Seller Currency)',
                   'amount_paid_usd': 'Paid (US Dollars)',
                   'slug_type': 'Item Category (Main)',
                   'item_type': 'Item Category (Subcategory)',
                   'amount_over_fmt': 'Paid OVER List Price (Seller Currency)',
                   'artist_name': 'Artist/Label Name'
                   }, inplace=True)

# Asessing if Item listed for free
def no_price (value):
   if value == float('inf'): 
      return 'Yes' # Done as Yes/No since it doesn't auto turn this into a numerical measure
   return 'No'

In [48]:
# Converting Epoch Time to Time
df['utc_date'] = pd.to_datetime(df['utc_date'],unit='s')

# Converting Time to Pacific (Timezone Bandcamp Uses For Bandcamp Fridays)
df['utc_date'] = df['utc_date'].dt.tz_localize('US/Pacific').dt.tz_convert('UTC')

# Renaming Date as it's now Pacific Timezone
df = df.rename({'utc_date': 'Date_Time_PT',}, axis=1) 

# Creating New Column for the Percentage a Buyer Paid Over the Seller's List Price on an Item
df['Percent Paid Over List Price'] = ((df['Paid (Seller Currency)'] - df['Price (Seller Currency)']) / df['Price (Seller Currency)']) * 100

# Creating a New Colunmn to Denote if an item is listed as free.
def no_price (value):
   if value == float('inf'): 
      return 'Yes' # Done as Yes/No since it doesn't auto turn this into a numerical measure
   return 'No'

df['Item Listed As Free'] = df['Percent Paid Over List Price'].map(no_price)

df['Item Listed As Free'].value_counts()

No     879253
Yes    120747
Name: Item Listed As Free, dtype: int64

In [55]:
# Over 100k of items had isna for paid over list price item, so recalculating colum. Uncomment next line to see.
# df['Paid OVER List Price (Seller Currency)'].isna().value_counts()
df['Paid OVER List Price (Seller Currency)'] = (df['Paid (Seller Currency)'] - df['Price (Seller Currency)'])

# Rounding to Two Decimal Places, it seemed like some columns were not binning values correctly
df['Paid OVER List Price (Seller Currency)'] = df['Paid OVER List Price (Seller Currency)'].round(decimals = 2)

# Replace all the NaN Values which based on other columns can tell are 'Full Digital Discography" with that
df['Item Category (Main)'] = df['Item Category (Main)'].replace(np.NaN,'Full Digital Discography')

# Gave More Accurate Name. There are EP's & Singles Shown Too.
df.rename(columns={'album_title': 'Release Title'
                   }, inplace=True)

df['Percent Paid Over List Price'] = df['Percent Paid Over List Price'].round(decimals = 2)

# Replacing Inifinite Values with NaN. This Way in Tableau, It Shows Datatype as Numeric
df['Percent Paid Over List Price'] = df['Percent Paid Over List Price'].replace(np.inf, np.NaN)

df['Bandcamp_Friday?'] = df['Date_Time_PT']
# If on this date, then assign as Bandcamp Friday
df.loc[(df['Date_Time_PT'] > '2020-10-02', "Bandcamp_Friday?")] = "Yes"

# If on this date, then assign as NOT Bandcamp Friday
df.loc[(df['Date_Time_PT'] < '2020-10-02', "Bandcamp_Friday?")] = "No"


df.head(1)

# Only Run When Ready to Export for Tableau
df.to_csv('bandcamp_million_sales_After_Python_Modifications_2.csv', header=True)

Unnamed: 0,Item Category (Subcategory),Date_Time_PT,Buyer Country,Item Category (Main),Currency Symbol + Paid (In Seller Currency),Price (Seller Currency),item_description,Paid (Seller Currency),Artist/Label Name,currency,Release Title,Paid (US Dollars),Paid OVER List Price (Seller Currency),addl_count,Percent Paid Over List Price,Item Listed As Free,Bandcamp_Friday?
0,Digital Album,2020-09-10 05:00:03.517499904+00:00,United Kingdom,Digital & Physical Albums,$9.99,9.99,Live at Vicar Street,9.99,Girl Band,USD,,9.99,0.0,,0.0,No,No
