**Import necessary libraries**

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [131]:
df_name = "EtsySoldOrderItems2020"

**Read csv files**

In [132]:
df = pd.read_csv("../../Datasets/Bouletta/"+df_name+".csv")

Columns of dataframe

In [133]:
df.columns

Index(['Sale Date', 'Item Name', 'Buyer', 'Quantity', 'Price', 'Coupon Code',
       'Coupon Details', 'Discount Amount', 'Shipping Discount',
       'Order Shipping', 'Order Sales Tax', 'Item Total', 'Currency',
       'Transaction ID', 'Listing ID', 'Date Paid', 'Date Shipped',
       'Ship Name', 'Ship Address1', 'Ship Address2', 'Ship City',
       'Ship State', 'Ship Zipcode', 'Ship Country', 'Order ID', 'Variations',
       'Order Type', 'Listings Type', 'Payment Type', 'InPerson Discount',
       'InPerson Location', 'VAT Paid by Buyer', 'SKU'],
      dtype='object')

In [134]:
df = df.rename(columns={"Item Name":"Item Model"})

In [135]:
df.head(2)

Unnamed: 0,Sale Date,Item Model,Buyer,Quantity,Price,Coupon Code,Coupon Details,Discount Amount,Shipping Discount,Order Shipping,...,Ship Country,Order ID,Variations,Order Type,Listings Type,Payment Type,InPerson Discount,InPerson Location,VAT Paid by Buyer,SKU
0,12/31/20,Genuine Gray Leather Samsung Galaxy Note 20 Pr...,Shanon Browne (wj1ol9t2d47zhqco),1,44.67,25OFFDECEMBER,25OFFDECEMBER - % off,11.17,0.0,0.0,...,United States,1908770705,Personalization Request:Add Laser Engraving,online,listing,online_cc,,,0,BO-01-FXCP-TN18E-0NT20-00
1,12/31/20,Genuine Leather Apple iPhone 12 & iPhone 12 Pr...,Kevin Butterfield (kevinbutterfield72),1,52.0,25OFFDECEMBER,25OFFDECEMBER - % off,13.0,3.21,0.0,...,United States,1900438126,Personalization Request:No Personalization,online,listing,online_cc,,,0,BO-01-FXC0-RS02E-IP121-00


Check null values

In [136]:
df.isna().sum()

Sale Date              0
Item Model             0
Buyer                124
Quantity               0
Price                  0
Coupon Code           21
Coupon Details        21
Discount Amount        0
Shipping Discount      0
Order Shipping         0
Order Sales Tax        0
Item Total             0
Currency               0
Transaction ID         0
Listing ID             0
Date Paid              0
Date Shipped           0
Ship Name              0
Ship Address1          0
Ship Address2        562
Ship City              0
Ship State            73
Ship Zipcode           4
Ship Country           0
Order ID               0
Variations            27
Order Type             0
Listings Type          0
Payment Type           0
InPerson Discount    702
InPerson Location    702
VAT Paid by Buyer      0
SKU                    0
dtype: int64

In [137]:
df.drop(columns=['Buyer','Coupon Details','Shipping Discount','Order Shipping',\
                      'Order Sales Tax','Item Total', 'Currency','Transaction ID', 'Listing ID',\
                      'Date Paid', 'Date Shipped','Ship Name', 'Ship Address1', 'Ship Address2',\
                      'Ship Zipcode', 'Variations','Order Type', 'Listings Type',\
                      'Payment Type', 'InPerson Discount','InPerson Location', 'VAT Paid by Buyer',\
                      'SKU','Ship City','Ship State'],inplace = True)

In [138]:
df.dropna(inplace=True)

Check duplicates

In [139]:
df.duplicated().all().sum()

0

Map seasons

In [140]:
def map_seasons(dataframe,month,season,sale_date):
    dataframe[season] = ""  # Add a new column named "Season" to the dataframe
    for i in dataframe[sale_date]:
        nums = str.split(i,sep="/")
        date = ""
        if nums[0] in ["12","01","02"]:
            date = "Winter"
        elif nums[0] in ["03","04","05"]:
            date = "Spring"
        elif nums[0] in ["06","07","08"]:
            date = "Summer"
        elif nums[0] in ["09","10","11"]:
            date = "Fall"
        dataframe.loc[dataframe[sale_date] == i, season] = date
        dataframe.loc[dataframe[sale_date] == i, month] = nums[0]
    return dataframe   

In [141]:
df = map_seasons(df,"Month","Season","Sale Date")

In [142]:
def map_years(dataframe, years,sale_date):
    dataframe[years] = ""
    for i in dataframe[sale_date]:
        date = datetime.strptime(i, "%m/%d/%y")
        year = date.year
        dataframe.loc[dataframe[sale_date] == i, years] = year
    return dataframe

In [143]:
df = map_years(df,"Year","Sale Date")

In [144]:
df.head()

Unnamed: 0,Sale Date,Item Model,Quantity,Price,Coupon Code,Discount Amount,Ship Country,Order ID,Season,Month,Year
0,12/31/20,Genuine Gray Leather Samsung Galaxy Note 20 Pr...,1,44.67,25OFFDECEMBER,11.17,United States,1908770705,Winter,12,2020
1,12/31/20,Genuine Leather Apple iPhone 12 & iPhone 12 Pr...,1,52.0,25OFFDECEMBER,13.0,United States,1900438126,Winter,12,2020
2,12/30/20,Apple iPhone 11 Pro Max (6.5'') Handmade Full ...,1,55.33,25OFFDECEMBER,13.83,United States,1899380664,Winter,12,2020
3,12/30/20,"Genuine Leather Apple iPhone 12 PRO MAX (6.7"")...",1,62.0,25OFFDECEMBER,15.5,United States,1899347518,Winter,12,2020
4,12/29/20,Handmade Genuine Leather Apple Watch Band 44mm...,1,53.0,25OFFDECEMBER,13.25,United States,1906991839,Winter,12,2020


In [145]:
seasons_dict = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}
df['Season'] = df['Season'].replace(seasons_dict)

In [146]:
df['Sale Date'] = pd.to_datetime(df['Sale Date'], format='%m/%d/%y')
df['Day Of Year'] = df['Sale Date'].apply(lambda x: x.timetuple().tm_yday)

In [147]:
countries = df['Ship Country'].unique()
countries

array(['United States', 'Switzerland', 'Germany', 'Canada',
       'United Kingdom', 'Australia', 'The Netherlands', 'Luxembourg',
       'Sweden', 'France', 'Latvia', 'Israel', 'Greece', 'Italy',
       'Ireland', 'New Zealand', 'Finland', 'South Africa', 'Reunion',
       'Denmark', 'Austria', 'United Arab Emirates', 'Hong Kong',
       'Norway', 'Malaysia', 'Taiwan', 'Estonia', 'Philippines'],
      dtype=object)

In [148]:
country_map = {'United States':1, 'Canada':2, 'United Kingdom':3, 'Australia':4,
       'Switzerland':5, 'Sweden':6, 'Israel':7, 'Italy':8, 'Ireland':9,
       'New Zealand':10, 'United Arab Emirates':11, 'Hong Kong':12, 'Malaysia':13,
       'The Netherlands':14, 'Taiwan':15, 'Germany':16, 'Philippines':17,"Austria":18,
              "South Korea":19,"France":20,"Hungary":21,"Singapore":22,"Finland":23,
              "Belgium":24,"Luxembourg":25,"Puerto Rico":26,"Sri Lanka":27,"South Africa":28,
              "Latvia":29,"Greece":30,"Reunion":31,"Denmark":32,"Norway":33,"Estonia":34,
              "Spain":35,"Brazil":36,"India":37,"Japan":38,"Indonesia":39,"Mexico":40,
              "Russia":41}

In [149]:
def map_country(country):
    x = False
    for i, number in country_map.items():
        if i in country:
            x= True
            return number
    if x == False:
        print(country)
df["Ship Country"] = df["Ship Country"].apply(map_country)

In [150]:
df.drop(columns=['Sale Date','Day Of Year'],inplace = True)

In [151]:
color_map = {"Brown":1,"Gray":2,"Black":3,"Navy Blue":4,"Blue":5,"Yellow":6,"Red":7,"Pink":8,
            "Multicolered":9,"Green":10,"Beige":11,"Purple":12,"Leopar":13,"Multicolored":9,
            "Grey":2,"Multicolor":9,"Rose Gold":14,"Rainbow":15,"White":16,"Genuine Leather":0,\
            "Leather":-1}

In [152]:
df.columns

Index(['Item Model', 'Quantity', 'Price', 'Coupon Code', 'Discount Amount',
       'Ship Country', 'Order ID', 'Season', 'Month', 'Year'],
      dtype='object')

In [153]:
def map_color(item_name):
    x = False
    for model, number in color_map.items():
        if model in item_name:
            x= True
            return number
    if x == False:
        if "Genuine" in item_name:
            return 0
        elif "Leather" in item_name:
            return -1
        else:
            print(item_name)
df["Color"] = df["Item Model"].apply(map_color)

Samsung Galaxy S10 Plus Magnetic Detachable Phone Case, Wallet Phone Case Samsung S10 Plus, Samsung Phone  S10 Plus Cover by Bouletta


In [154]:
model_map = {"iPhone 12": 1, "iPhone 12 Pro": 2, "Samsung Galaxy S21": 3,"Galaxy Note 20":4,\
             "iPhone 11 Pro Max": 5,"iPhone 12 PRO MAX":6,"Watch Band":7,"iPhone Mini 12":8,\
            "Galaxy Note 20 ULTRA":9,"Galaxy S20 Ultra":10,"iPhone 13 Series":11,
            "Galaxy Note 10 Plus":12,"iPhone 13 Pro":13,"Dog":14,"AirPods":14,"Desk Mat":15,
            "Galaxy S20 Plus":16,"iPhone SE":17,"Samsung Galaxy S20":18,"Apple iPhone XS MAX":19,
            "Apple iPhone XR":20,"Apple iPhone X/XS":21,"iPhone 11":22,"Samsung S10":23,
            "Card Holder":24,"Galaxy N10":25,"iPhone 7/8":26,"Women's Wallet":27,
            "Custom order":28,"Organizer":29,"iPhone 14 Series":30,"Galaxy S22 Series":31,"AirTag":32,
            "Mouse Pad":33,"Galaxy S22":34,"Makeup Purse":35,"Huawei Mate 20 Pro":36,
             "Samsung Galaxy S10":37,"Huawei Mate 20 Lite":38,"Samsung Galaxy S8 Plus":39,
             "Samsung Galaxy S10 Plus":40,"Samsung Note 8":41,"Huawei P20 Pro":42,"Samsung Note 10":43,
            "Samsung Galaxy S8":44,"Galaxy S9 Plus":45,"Galaxy Note 9":46,"Galaxy S9":47,
            "iPhone XS MAX":48,"Samsung Note 9":46,"iPhone X/iPhone XS":49,"iPhone X / iPhone XS":49,
            "Apple Watch":7,"iPhone X/XS":49,"iPhone 7 / 8":26,"Galaxy Note 10+":51,"iPhone X":52} 

In [155]:
def map_model(item_name):
    x = False
    for model, number in model_map.items():
        if model in item_name:
            x= True
            return number
    if x == False:
        print(item_name)
df["Item Model"] = df["Item Model"].apply(map_model)

In [156]:
df.sample(2)

Unnamed: 0,Item Model,Quantity,Price,Coupon Code,Discount Amount,Ship Country,Order ID,Season,Month,Year,Color
324,7,1,49.9,SEPTEMBER20OFF,9.98,1,1748315544,2,8,2020,1.0
563,7,1,49.9,MAY15OFF,7.49,2,1641197329,1,5,2020,1.0


In [157]:
df.describe()

Unnamed: 0,Item Model,Quantity,Price,Discount Amount,Ship Country,Order ID,Season,Color
count,681.0,681.0,681.0,681.0,681.0,681.0,681.0,680.0
mean,16.753304,1.005874,60.315301,13.180749,3.099853,1739061000.0,1.729809,2.927941
std,12.762569,0.076471,18.859632,7.727264,5.365085,98226140.0,1.118221,3.330023
min,1.0,1.0,25.33,0.0,1.0,1557283000.0,0.0,-1.0
25%,5.0,1.0,49.0,8.76,1.0,1655229000.0,1.0,1.0
50%,17.0,1.0,56.4,11.33,1.0,1739930000.0,2.0,1.0
75%,23.0,1.0,65.4,14.67,2.0,1824392000.0,3.0,5.0
max,52.0,2.0,146.25,83.46,34.0,1908771000.0,3.0,16.0


In [158]:
df['Coupon Code'] = df['Coupon Code'].map(lambda x: 0 if pd.isnull(x) else 1)

In [159]:
df["Price"] = df["Price"] - df["Discount Amount"]

In [160]:
df.drop('Discount Amount', axis=1,inplace=True)

In [161]:
df.to_csv("../../Datasets/Bouletta/"+df_name+"Modified.csv", index=False)

In [162]:
df_2020 = pd.read_csv("../../Datasets/Bouletta/EtsySoldOrderItems2020Modified.csv")
df_2021 = pd.read_csv("../../Datasets/Bouletta/EtsySoldOrderItems2021Modified.csv")
df_2022 = pd.read_csv("../../Datasets/Bouletta/EtsySoldOrderItems2022Modified.csv")

In [163]:
merged_df = pd.concat([df_2020, df_2021, df_2022])

In [164]:
merged_df_csv = merged_df.drop(columns=["Order ID"])

In [165]:
merged_df.describe()

Unnamed: 0,Item Model,Quantity,Price,Coupon Code,Ship Country,Order ID,Season,Month,Year,Color
count,2227.0,2227.0,2227.0,2227.0,2227.0,2227.0,2227.0,2227.0,2227.0,2224.0
mean,12.615177,1.003592,53.196744,1.0,2.750786,2106016000.0,1.550516,6.957791,2020.957342,2.246403
std,10.496164,0.059841,23.083331,0.0,5.052318,308571500.0,1.123359,3.439637,0.753234,3.756714
min,1.0,1.0,-23.82,1.0,1.0,1557283000.0,0.0,1.0,2020.0,-1.0
25%,4.0,1.0,39.005,1.0,1.0,1841292000.0,1.0,4.0,2020.0,0.0
50%,11.0,1.0,48.25,1.0,1.0,2120118000.0,2.0,7.0,2021.0,1.0
75%,18.0,1.0,57.87,1.0,1.0,2345903000.0,3.0,10.0,2022.0,3.0
max,52.0,2.0,219.0,1.0,41.0,2743856000.0,3.0,12.0,2022.0,16.0


In [166]:
merged_df.sample()

Unnamed: 0,Item Model,Quantity,Price,Coupon Code,Ship Country,Order ID,Season,Month,Year,Color
523,3,1,56.5,1,1,2364813044,0,1,2022,1.0


In [167]:
merged_df_csv.to_csv("../../Datasets/Bouletta/"+"EtsySoldOrderItems2020-2021-2022.csv", index=False)

In [168]:
merged_df.isna().sum()

Item Model      0
Quantity        0
Price           0
Coupon Code     0
Ship Country    0
Order ID        0
Season          0
Month           0
Year            0
Color           3
dtype: int64

In [169]:
merged_df.duplicated().all().sum()

0

In [170]:
review_df = pd.read_json('reviews.json')

In [171]:
review_df.head()

Unnamed: 0,reviewer,date_reviewed,star_rating,message,order_id
0,Katie,01/24/2023,5,"Bought for my bf, arrived earlier than expecte...",2698975905
1,Andrea,01/02/2023,4,Wir haben den Artikel in schwarz und blau best...,2735309118
2,Andrea,01/02/2023,4,Die Handytaschen sind sehr hochwertig verarbei...,2735309118
3,Lerissa,12/30/2022,4,,2732670357
4,Lerissa,12/30/2022,4,,2732670357


In [172]:
review_df['message'] = np.where(review_df['message'] == '', 0, 1)

In [173]:
etsy = pd.merge(merged_df, review_df, left_on='Order ID',right_on='order_id',how="left")

In [174]:
etsy.drop(columns=['date_reviewed',"Order ID","reviewer","order_id","date_reviewed"],inplace = True)

In [175]:
etsy["star_rating"].fillna(0,inplace=True)

In [176]:
etsy["message"].fillna(0,inplace=True)

In [177]:
etsy.head()

Unnamed: 0,Item Model,Quantity,Price,Coupon Code,Ship Country,Season,Month,Year,Color,star_rating,message
0,4,1,33.5,1,1,0,12,2020,2.0,0.0,0.0
1,1,1,39.0,1,1,0,12,2020,1.0,0.0,0.0
2,5,1,41.5,1,1,0,12,2020,1.0,2.0,1.0
3,1,1,46.5,1,1,0,12,2020,1.0,0.0,0.0
4,7,1,39.75,1,1,0,12,2020,0.0,0.0,0.0


In [178]:
etsy.describe()

Unnamed: 0,Item Model,Quantity,Price,Coupon Code,Ship Country,Season,Month,Year,Color,star_rating,message
count,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2250.0,2253.0,2253.0
mean,12.636485,1.003551,53.075348,1.0,2.775854,1.551265,6.960941,2020.960497,2.250222,1.112295,0.172659
std,10.493491,0.059496,23.228542,0.0,5.097618,1.121272,3.435434,0.754349,3.759211,2.034405,0.378036
min,1.0,1.0,-23.82,1.0,1.0,0.0,1.0,2020.0,-1.0,0.0,0.0
25%,4.0,1.0,39.0,1.0,1.0,1.0,4.0,2020.0,0.0,0.0,0.0
50%,11.0,1.0,48.25,1.0,1.0,2.0,7.0,2021.0,1.0,0.0,0.0
75%,18.0,1.0,57.87,1.0,1.0,3.0,10.0,2022.0,3.0,0.0,0.0
max,52.0,2.0,219.0,1.0,41.0,3.0,12.0,2022.0,16.0,5.0,1.0


In [179]:
etsy.to_csv("../../Datasets/Bouletta/EtsySoldOrderItemsWithReviews2020-2021-2022.csv", index=False)