In [50]:
import pandas as pd
import chardet
from bokeh.layouts import row, column
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, Slider, Select, HoverTool, Div, Tabs, TabPanel
from bokeh.io import output_notebook

def KPI(text:str):
    return Div(text=f"""
    <div style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">
    <h1>{text}</h1>
    </div>
    """)

In [14]:
def detect_encoding(file_path: str) -> str:
    with open(file_path, 'rb') as file:
        detector = chardet.universaldetector.UniversalDetector()
        for line in file:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
    return detector.result['encoding']

In [15]:
product_id = "com.vansteinengroentjes.apps.ddfive"

In [16]:
import glob
data_path = 'data/'
csv_files = glob.glob(data_path + '*.csv')

# Initialize an empty list to store dataframes
dataframes = []

reviews_df = pd.DataFrame()
sales_df = pd.DataFrame()
crashes_df = pd.DataFrame()
ratings_overview_df = pd.DataFrame()
ratings_country_df = pd.DataFrame()

# encodings = ['utf-8', 'utf-16 LE']

# Loop through the CSV files and read them into dataframes
for file in csv_files:
    # print(file)
    encoding = detect_encoding(file) 
    # for encoding in encodings:
    try:
        df = pd.read_csv(file, encoding=encoding)
        if "reviews" in file:
            # print(file + " " + encoding + " " + str(df.shape))
            reviews_df = pd.concat([reviews_df, df], ignore_index=True)
        elif "sales" in file:
            # print(file + " " + encoding + " " + str(df.shape))

            df.columns = df.columns.str.strip().str.lower()
            # rename colums
            if "order number" in df.columns: 
                df.rename(columns={"order number": "description"}, inplace=True)
            
            if "order charged date" in df.columns: 
                df.rename(columns={"order charged date": "transaction date"}, inplace=True)
            
            if "order charged timestamp" in df.columns: 
                df.rename(columns={"order charged timestamp": "transaction time"}, inplace=True)

            if "country of buyer" in df.columns: 
                df.rename(columns={"country of buyer": "buyer country"}, inplace=True)

            if "state of buyer" in df.columns: 
                df.rename(columns={"state of buyer": "buyer state"}, inplace=True)
            
            if "postal code of buyer" in df.columns: 
                df.rename(columns={"postal code of buyer": "buyer postal code"}, inplace=True)

            if "device model" in df.columns:
                df.rename(columns={"device model": "hardware"}, inplace=True)

            if "currency of sale" in df.columns:
                df.rename(columns={"currency of sale": "buyer currency"}, inplace=True)

            if "item price" in df.columns: 
                df.rename(columns={"item price": "amount (buyer currency)"}, inplace=True)

            


            sales_df = pd.concat([sales_df, df], ignore_index=True)
        elif "crashes" in file:
            # print(file + " " + encoding + " " + str(df.shape))
            crashes_df = pd.concat([crashes_df, df], ignore_index=True)
        elif "ratings" in file:
            if "overview" in file:
                # print(file + " " + encoding + " " + str(df.shape))
                ratings_overview_df = pd.concat([ratings_overview_df, df], ignore_index=True)
            elif "country" in file:
                # print(file + " " + encoding + " " + str(df.shape))
                ratings_country_df = pd.concat([ratings_country_df, df], ignore_index=True)
    except UnicodeDecodeError:
        pass
    

print(len(reviews_df))
print(len(sales_df))
print(len(crashes_df))
print(len(ratings_overview_df))
print(len(ratings_country_df))

print(sales_df.columns.tolist())
display(sales_df)



78
3487
214
214
18617
['description', 'transaction date', 'transaction time', 'tax type', 'transaction type', 'refund type', 'product title', 'product id', 'product type', 'sku id', 'hardware', 'buyer country', 'buyer state', 'buyer postal code', 'buyer currency', 'amount (buyer currency)', 'currency conversion rate', 'merchant currency', 'amount (merchant currency)', 'base plan id', 'offer id', 'financial status', 'taxes collected', 'charged amount', 'city of buyer']


Unnamed: 0,description,transaction date,transaction time,tax type,transaction type,refund type,product title,product id,product type,sku id,...,amount (buyer currency),currency conversion rate,merchant currency,amount (merchant currency),base plan id,offer id,financial status,taxes collected,charged amount,city of buyer
0,GPA.3314-3067-8984-86281,"Sep 1, 2021",1:09:37 AM PDT,,Charge,,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,premium,...,2.08,1.00000,EUR,2.08,,,,,,
1,GPA.3311-2378-8945-03309,"Sep 1, 2021",8:35:27 AM PDT,,Charge,,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,premium,...,3.49,0.84440,EUR,2.95,,,,,,
2,GPA.3311-2378-8945-03309,"Sep 1, 2021",8:35:27 AM PDT,,Google fee,,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,premium,...,-0.52,0.84440,EUR,-0.44,,,,,,
3,GPA.3303-4045-0553-92783,"Sep 1, 2021",12:27:40 PM PDT,,Charge,,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,premium,...,1.84,1.16340,EUR,2.14,,,,,,
4,GPA.3359-6962-8916-33548,"Sep 1, 2021",2:11:11 PM PDT,,Charge,,Character Manager (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,unlockcharactermanager,...,5.49,0.84475,EUR,4.64,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3482,GPA.3371-8563-4424-08833,"Jul 31, 2021",9:13:33 PM PDT,,Google fee,,Character Manager (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,unlockcharactermanager,...,-0.82,0.84265,EUR,-0.69,,,,,,
3483,GPA.3366-5335-7447-64180,"Jul 31, 2021",9:30:11 PM PDT,,Charge,,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,premium,...,3.49,0.84265,EUR,2.94,,,,,,
3484,GPA.3366-5335-7447-64180,"Jul 31, 2021",9:30:11 PM PDT,,Google fee,,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,premium,...,-0.52,0.84265,EUR,-0.44,,,,,,
3485,GPA.3386-7086-1527-85919,"Jul 31, 2021",10:19:14 PM PDT,,Charge,,Character Manager (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,1,unlockcharactermanager,...,5.49,0.84265,EUR,4.63,,,,,,


# Sales data

In [17]:
# Clean sales data
# sales_df.columns = sales_df.columns.str.strip().str.lower()

df = sales_df[sales_df['product id'] == product_id].copy()
# df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])
# df["Amount (Merchant Currency) Google tax"] = df["Amount (Merchant Currency)"]

# The data format varies across formats
def parse_datetime(row):
    """
    Convert transaction_date and transaction_time into a single datetime column.
    Handles both:
    - Format 1: "Jun 1, 2021" + "12:15:21 PM PDT"
    - Format 2: "2021-12-31" + "1640956762" (Unix timestamp)
    """
    try:
        # Convert date
        date_str = str(row["transaction date"]).strip()
        if "," in date_str:  # "Jun 1, 2021" format
            date_parsed = pd.to_datetime(date_str, format="%b %d, %Y", errors="coerce")
        else:  # "2021-12-31" format
            date_parsed = pd.to_datetime(date_str, format="%Y-%m-%d", errors="coerce")

        # Convert time
        time_str = str(row["transaction time"]).strip()
        if time_str.isdigit():  # Unix timestamp case
            time_parsed = pd.to_datetime(int(time_str), unit="s", errors="coerce").time()
        else:  # Regular time format
            time_parsed = pd.to_datetime(time_str.replace(" PDT", ""), format="%I:%M:%S %p", errors="coerce").time()

        # Combine date and time
        if pd.notna(date_parsed) and pd.notna(time_parsed):
            return pd.Timestamp.combine(date_parsed, time_parsed)
        else:
            return pd.NaT  # Return NaT if invalid
    
    except Exception as e:
        print(f"Error parsing row {row}: {e}")
        return pd.NaT  # Return NaT for errors
    
from currency_converter import CurrencyConverter
c = CurrencyConverter(fallback_on_missing_rate=True)

def convert_currency(row):
    """
    Convert 'amount (buyer currency)' into 'amount (merchant currency)' 
    if 'amount (merchant currency)' is NaN.
    """
    try:
        # Check if merchant amount is NaN and buyer amount is not NaN
        if pd.isna(row["amount (merchant currency)"]) and pd.notna(row["amount (buyer currency)"]):
            # Perform conversion using the CurrencyConverter package
            return c.convert(row["amount (buyer currency)"], row["buyer currency"], "EUR", row["datetime"].date())
        else:
            return row["amount (merchant currency)"]
    except Exception as e:
        print(f"Error converting currency for row {row['description']}: {e}")
        return row["amount (merchant currency)"]


def clean_sales_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and process sales data by combining charge and fee rows.
    
    Args:
        df (pd.DataFrame): Input DataFrame with sales data
        
    Returns:
        pd.DataFrame: Cleaned DataFrame with combined transactions
    """
    # Create a copy of the input DataFrame to avoid the SettingWithCopyWarning
    df = df.copy()

    df["datetime"] = df.apply(parse_datetime, axis=1)
    
    # # Convert date and time to datetime, properly handling PDT timezone
    # df['datetime'] = pd.to_datetime(
    #     df['transaction date'] + ' ' + df['transaction time'].str.replace(' PDT', '')
    # )
 
    # Fill NaN values using Currency Converter
    df["amount (merchant currency)"] = df.apply(convert_currency, axis=1)
    
    def combine_transactions(group):
        """Combine charge and fee rows into a single transaction"""
        if len(group) == 2 and set(group['transaction type']) == {'Charge', 'Google fee'}:
            charge_row = group[group['transaction type'] == 'Charge'].iloc[0]
            fee_row = group[group['transaction type'] == 'Google fee'].iloc[0]
            
            # Calculate net amounts
            net_buyer_amount = charge_row['amount (buyer currency)'] + fee_row['amount (buyer currency)']
            net_merchant_amount = charge_row['amount (merchant currency)'] + fee_row['amount (merchant currency)']
            
            # Create result row
            result = charge_row.copy()
            result['amount (buyer currency)'] = net_buyer_amount
            result['amount (merchant currency)'] = net_merchant_amount
            result['transaction type'] = 'Net Charge'
            return result
        
        return group.iloc[0]

    # Columns to keep in the final output
    columns_to_keep = [
        'description', 'datetime', 'product title', 
        'product id', 'sku id', 'buyer country',
        'buyer currency', 'amount (buyer currency)',
        'merchant currency', 'amount (merchant currency)'
    ]
    
    # Group and combine transactions
    result_df = (df.groupby('description', as_index=False)
                  .apply(combine_transactions)
                  [columns_to_keep]
                  .sort_values('datetime')
                  .reset_index(drop=True))
    
    return result_df

# Example usage
cleaned_df = clean_sales_data(df)
display(cleaned_df)

# duplicates = cleaned_df[cleaned_df.duplicated("Description", keep=False)]
# print("Duplicates:")
# print(duplicates)



Error converting currency for row GPA.3313-3300-8778-25902: COP is not a supported currency
Error converting currency for row GPA.3313-3300-8778-25902: COP is not a supported currency
Error converting currency for row GPA.3384-8046-7908-11492: CRC is not a supported currency
Error converting currency for row GPA.3394-7011-7992-67654: CRC is not a supported currency


  .apply(combine_transactions)


Unnamed: 0,description,datetime,product title,product id,sku id,buyer country,buyer currency,amount (buyer currency),merchant currency,amount (merchant currency)
0,GPA.3370-7096-7934-01916,2021-06-01 17:38:09,Character Manager (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,unlockcharactermanager,US,USD,3.84,EUR,3.140000
1,GPA.3301-2849-0660-49349,2021-06-01 22:46:39,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,premium,US,USD,2.44,EUR,2.000000
2,GPA.3372-1497-1097-13226,2021-06-02 07:51:43,Character Manager (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,unlockcharactermanager,US,USD,3.84,EUR,3.160000
3,GPA.3397-6490-8608-67650,2021-06-02 09:14:16,Character Manager (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,unlockcharactermanager,US,USD,3.84,EUR,3.150000
4,GPA.3378-4840-7906-77859,2021-06-02 10:32:52,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,premium,US,USD,2.44,EUR,2.000000
...,...,...,...,...,...,...,...,...,...,...
1698,GPA.3358-8615-1171-80349,2021-12-31 13:19:22,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,premium,US,USD,3.49,,3.081406
1699,GPA.3397-7418-5342-99790,2021-12-31 13:19:57,Character Manager (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,unlockcharactermanager,US,USD,5.49,,4.847254
1700,GPA.3333-5146-4957-35294,2021-12-31 19:01:21,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,premium,GB,GBP,2.16,,2.570572
1701,GPA.3306-0097-9714-42420,2021-12-31 22:27:39,DM Tools (Complete Reference for DnD 5),com.vansteinengroentjes.apps.ddfive,premium,US,USD,3.49,,3.081406


## Sales over time

In [18]:
output_notebook()

def plot_sales_all(df):
    grouped_by_transaction = df.groupby(pd.Grouper(key="datetime", freq='1D'))["amount (merchant currency)"].sum().reset_index()
    grouped_by_count = df.groupby(pd.Grouper(key="datetime", freq='1D')).count().reset_index()
    p = figure(title="Simple line example",x_axis_type="datetime", x_axis_label='date', y_axis_label='sales')
    p.line(grouped_by_transaction["datetime"], grouped_by_transaction["amount (merchant currency)"], legend_label="Sales prices", line_width=2)
    p.line(grouped_by_count["datetime"], grouped_by_count["amount (merchant currency)"], legend_label="Sales volume", line_width=2, color="red")
    # show(p)
    return p

def plot_sales_filtered(df):
    filtered_df = df[df["sku id"] == "premium"]
    grouped_by_transaction = filtered_df.groupby(pd.Grouper(key="datetime", freq='1D'))["amount (merchant currency)"].sum().reset_index()
    grouped_by_count = filtered_df.groupby(pd.Grouper(key="datetime", freq='1D')).count().reset_index()
    p = figure(title="Premium",x_axis_type="datetime", x_axis_label='date', y_axis_label='sales')
    p.line(grouped_by_transaction["datetime"], grouped_by_transaction["amount (merchant currency)"], legend_label="Sales prices", line_width=2)
    p.line(grouped_by_count["datetime"], grouped_by_count["amount (merchant currency)"], legend_label="Sales volume", line_width=2, color="red")
    # show(p)
    return p

layout = row(plot_sales_all(cleaned_df.copy()), plot_sales_filtered(cleaned_df.copy()))  # Stacks vertically
show(layout)

In [19]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.layouts import column, row
from bokeh.models import Select, ColumnDataSource, CustomJS

output_notebook()

df = cleaned_df.copy()

def create_daily_metrics(sku_id):
    filtered_df = df[df["sku id"] == sku_id]
    sums = filtered_df.groupby(pd.Grouper(key="datetime", freq='1D'))["amount (merchant currency)"].sum().reset_index()
    counts = filtered_df.groupby(pd.Grouper(key="datetime", freq='1D')).count().reset_index()
    return sums, counts

premium_sums, premium_counts = create_daily_metrics("premium")
unlock_sums, unlock_counts = create_daily_metrics("unlockcharactermanager")

# Create ColumnDataSources
premium_source = ColumnDataSource({
    'date': premium_sums['datetime'],
    'sum': premium_sums['amount (merchant currency)'],
    'count': premium_counts['amount (merchant currency)']
})
unlock_source = ColumnDataSource({
    'date': unlock_sums['datetime'],
    'sum': unlock_sums['amount (merchant currency)'],
    'count': unlock_counts['amount (merchant currency)']
})

# Create figure
p = figure(width=800, height=400, x_axis_type="datetime",
          title="Product Revenue Analysis")
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Value'

# Create lines for both metrics
premium_line = p.line('date', 'sum', line_color='blue', 
                     legend_label='Premium', source=premium_source)
unlock_line = p.line('date', 'sum', line_color='red', 
                    legend_label='Character Manager', source=unlock_source)

# Create Select widgets
product_select = Select(title="Product Type", 
                       options=["Both", "Premium Only", "Character Manager Only"],
                       value="Both")

metric_select = Select(title="Metric Type",
                      options=["Revenue", "Count"],
                      value="Revenue")

# Create JavaScript callback
callback = CustomJS(args=dict(premium_line=premium_line,
                            unlock_line=unlock_line,
                            premium_source=premium_source,
                            unlock_source=unlock_source,
                            product_select=product_select,
                            metric_select=metric_select), 
                   code="""
    const product = product_select.value;
    const metric = metric_select.value;
    
    // Update y-values based on metric
    const field = metric === "Revenue" ? "sum" : "count";
    premium_line.glyph.y = {field: field};
    unlock_line.glyph.y = {field: field};
    
    // Update visibility based on product selection
    if (product === "Both") {
        premium_line.visible = true;
        unlock_line.visible = true;
    } else if (product === "Premium Only") {
        premium_line.visible = true;
        unlock_line.visible = false;
    } else {
        premium_line.visible = false;
        unlock_line.visible = true;
    }
""")

# Attach callbacks
product_select.js_on_change('value', callback)
metric_select.js_on_change('value', callback)

# Show the plot
revenue = column(row(product_select, metric_select), p)
show(revenue)

# Review data

In [59]:
import pandas as pd
from bokeh.models import Range1d, LinearAxis, RangeTool
from bokeh.layouts import column, row

# Clean ratings data
df_ratings = ratings_country_df.copy()
df_ratings["Date"] = pd.to_datetime(df_ratings["Date"])
df_ratings = df_ratings.groupby(pd.Grouper(key="Date", freq='1D'))["Total Average Rating"].mean()

# Clean crashes data
df_crash = crashes_df.copy()
df_crash["Date"] = pd.to_datetime(df_crash["Date"])
df_crash = df_crash.sort_values(by="Date")

# Normalize data for selection plot
ratings_normalized = (df_ratings - df_ratings.min()) / (df_ratings.max() - df_ratings.min())
crashes_normalized = (df_crash["Daily Crashes"] - df_crash["Daily Crashes"].min()) / (df_crash["Daily Crashes"].max() - df_crash["Daily Crashes"].min())

# create shared range for the x-axis
x_range = (df_ratings.index.min(), df_ratings.index.max())

# Create main plots with original values
p = figure(width=600, height=300, x_axis_type="datetime", x_range=x_range,
          tools="xpan", toolbar_location="above")
p.yaxis.axis_label = 'Rating'
p.line(df_ratings.index, df_ratings.values, line_color='blue')

p2 = figure(width=600, height=300, x_axis_type="datetime", x_range=p.x_range,
           tools="xpan", toolbar_location="above")
p2.yaxis.axis_label = 'Crashes'
p2.line(df_crash["Date"], df_crash["Daily Crashes"], line_color='red')

# Create selection plot with normalized values
select = figure(title="Drag to select time range",
                height=130, width=1200,
                x_axis_type="datetime", y_axis_type=None,
                tools="", toolbar_location=None,
                background_fill_color="#efefef")

# Add normalized lines to selection plot
select.line(df_ratings.index, ratings_normalized, line_color='blue')
select.line(df_crash["Date"], crashes_normalized, line_color='red')

# Configure RangeTool
range_tool = RangeTool(x_range=p.x_range)
range_tool.overlay.fill_color = "navy"
range_tool.overlay.fill_alpha = 0.2

select.ygrid.grid_line_color = None
select.add_tools(range_tool)

merged_df = pd.merge(df_crash, df_ratings, on='Date', suffixes=('_crash', '_rating'))


dates = pd.date_range(start=x_range[0], end=x_range[1], freq='D')

# from 0 to 1 based on the dates
dates = pd.DataFrame(dates, columns=["Date"])
dates["color"] = (dates["Date"] - dates["Date"].min()) / (dates["Date"].max() - dates["Date"].min())
dates["color"] = (dates["color"] * 255).astype(int)
dates["color"] = dates["color"].apply(lambda x: (x, 150, 150))

# make new plot with crashes vs ratings
p3 = figure(width=1200, height=400, x_axis_label='Crashes', y_axis_label='Rating')
p3.scatter(merged_df["Daily Crashes"], merged_df["Total Average Rating"], color=dates["color"], size=10, fill_alpha=0.6)

# calculate correlation
correlation = merged_df["Daily Crashes"].corr(merged_df["Total Average Rating"])

p3.title.text = f"Correlation: {correlation:.2f} | More red = more recent"


# Show the plots
review = column(
    row(
        KPI(f"Total crashes: {df_crash['Daily Crashes'].sum()}"),
        KPI(f"Crashes increased ⬆{df_crash['Daily Crashes'].pct_change().mean() * 100:.2f}%"),
        KPI(f"Average ratings: {df_ratings.mean().round(2)} ⭐️"),
        KPI(f"Correlation: {correlation:.2f}")
    ),
    row(p, p2), 
    select, 
    p3
)
show(review)

In [21]:
from bokeh.palettes import Category10

df_countryratings = ratings_country_df.copy()

df_countryratings["Date"] = pd.to_datetime(df_countryratings["Date"])
df_countryratings = df_countryratings.dropna(subset=["Total Average Rating"])

df_country_pivot = df_countryratings.pivot(index="Date", columns="Country", values="Total Average Rating")

p = figure(title="Total Average Rating Trend by Country", x_axis_type="datetime",
           width=900, height=500, tools="pan,wheel_zoom,box_zoom,reset,save")

colors = Category10[10]

for i, country in enumerate(df_country_pivot.columns[:10]):  # Limit to 10 countries for readability
    source = ColumnDataSource(data=dict(x=df_country_pivot.index, y=df_country_pivot[country]))
    p.line("x", "y", source=source, line_width=2, color=colors[i % len(colors)], legend_label=country)

p.xaxis.axis_label = "Date"
p.yaxis.axis_label = "Total Average Rating"
p.legend.title = "Country"
p.legend.location = "top_left"
p.legend.click_policy = "hide"  # Allows clicking legend to hide/show lines

# Add hover tool
hover = HoverTool(tooltips=[("Date", "@x{%F}"), ("Rating", "@y")], formatters={"@x": "datetime"})
p.add_tools(hover)


display(df_countryratings)
display(df_country_pivot)
show(p)


Unnamed: 0,Date,Package Name,Country,Daily Average Rating,Total Average Rating
0,2021-07-01,com.vansteinengroentjes.apps.ddfive,AR,,4.20
1,2021-07-01,com.vansteinengroentjes.apps.ddfive,AT,,3.83
2,2021-07-01,com.vansteinengroentjes.apps.ddfive,AU,,4.19
3,2021-07-01,com.vansteinengroentjes.apps.ddfive,BA,,5.00
4,2021-07-01,com.vansteinengroentjes.apps.ddfive,BD,,5.00
...,...,...,...,...,...
18612,2021-09-30,com.vansteinengroentjes.apps.ddfive,US,,4.03
18613,2021-09-30,com.vansteinengroentjes.apps.ddfive,UY,,4.86
18614,2021-09-30,com.vansteinengroentjes.apps.ddfive,VE,,4.00
18615,2021-09-30,com.vansteinengroentjes.apps.ddfive,VN,,5.00


Country,AR,AT,AU,BA,BD,BE,BG,BH,BN,BR,...,SV,TH,TR,TW,UA,US,UY,VE,VN,ZA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,4.2,3.91,4.19,5.0,5.0,3.70,5.0,5.0,5.0,4.35,...,5.0,5.0,4.71,4.0,4.75,4.04,4.86,4.0,5.0,4.35
2021-06-02,4.2,3.91,4.19,5.0,5.0,3.70,5.0,5.0,5.0,4.35,...,5.0,5.0,4.71,4.0,4.75,4.04,4.86,4.0,5.0,4.35
2021-06-03,4.2,3.91,4.19,5.0,5.0,3.70,5.0,5.0,5.0,4.35,...,5.0,5.0,4.71,4.0,4.75,4.04,4.86,4.0,5.0,4.35
2021-06-04,4.2,3.91,4.19,5.0,5.0,3.70,5.0,5.0,5.0,4.35,...,5.0,5.0,4.71,4.0,4.75,4.04,4.86,4.0,5.0,4.35
2021-06-05,4.2,3.91,4.19,5.0,5.0,3.70,5.0,5.0,5.0,4.35,...,5.0,5.0,4.71,4.0,4.75,4.04,4.86,4.0,5.0,4.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,4.2,3.83,4.18,5.0,5.0,3.70,5.0,5.0,5.0,4.34,...,5.0,5.0,4.72,4.0,4.75,4.03,4.86,4.0,5.0,4.35
2021-12-28,4.2,3.83,4.18,5.0,5.0,3.70,5.0,5.0,5.0,4.34,...,5.0,5.0,4.72,4.0,4.75,4.03,4.86,4.0,5.0,4.35
2021-12-29,4.2,3.83,4.18,5.0,5.0,3.70,5.0,5.0,5.0,4.34,...,5.0,5.0,4.72,4.0,4.75,4.03,4.86,4.0,5.0,4.35
2021-12-30,4.2,3.83,4.18,5.0,5.0,3.70,5.0,5.0,5.0,4.34,...,5.0,5.0,4.72,4.0,4.75,4.03,4.86,4.0,5.0,4.35


# Spatial data

In [22]:
df_ratings = ratings_country_df.copy()
df_ratings["Date"] = pd.to_datetime(df_ratings["Date"])
df_ratings = df_ratings.groupby(df_ratings["Country"])["Total Average Rating"].mean()

# get the lat and lon for each country
import geopandas as gpd
lat_long_country = pd.read_csv("./data/longlat.csv")

# Convert latitude and longitude to numeric, coercing errors to NaN
lat_long_country["latitude"] = pd.to_numeric(lat_long_country["latitude"], errors='coerce')
lat_long_country["longitude"] = pd.to_numeric(lat_long_country["longitude"], errors='coerce')

# Drop rows with NaN values
lat_long_country.dropna(subset=["latitude", "longitude"], inplace=True)

lat_long_country.head()

# # convert lat and lon to web mercator
from pyproj import Proj, transform

inProj = Proj(init='epsg:4326')
outProj = Proj(init='epsg:3857')

lat_long_country["longitude"], lat_long_country["latitude"] = transform(inProj, outProj, lat_long_country["longitude"].values, lat_long_country["latitude"].values)

from bokeh.plotting import figure

p = figure(x_range=(-2000000, 6000000), y_range=(-1000000, 7000000),
           x_axis_type="mercator", y_axis_type="mercator")
p.add_tile("CARTODBPOSITRON")

# Show the plot
show(p)




  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  lat_long_country["longitude"], lat_long_country["latitude"] = transform(inProj, outProj, lat_long_country["longitude"].values, lat_long_country["latitude"].values)


In [23]:
df_ratings = ratings_country_df.copy()
df_ratings["Date"] = pd.to_datetime(df_ratings["Date"])
df_ratings = df_ratings.groupby(df_ratings["Country"])["Total Average Rating"].mean()
# rename Total Average Rating to rating
df_ratings = df_ratings.rename("rating")

# get the lat and lon for each country
lat_long_country = pd.read_csv("./data/longlat.csv")

# Convert latitude and longitude to numeric, coercing errors to NaN
lat_long_country["latitude"] = pd.to_numeric(lat_long_country["latitude"], errors='coerce')
lat_long_country["longitude"] = pd.to_numeric(lat_long_country["longitude"], errors='coerce')

# Drop rows with NaN values
lat_long_country.dropna(subset=["latitude", "longitude"], inplace=True)

from pyproj import Proj, transform

inProj = Proj(init='epsg:4326')
outProj = Proj(init='epsg:3857')

lat_long_country["longitude"], lat_long_country["latitude"] = transform(inProj, outProj, lat_long_country["longitude"].values, lat_long_country["latitude"].values)


# merge the dataframes
lat_long_country = pd.merge(lat_long_country, df_ratings, left_on="country", right_index=True)
lat_long_country["color"] = (lat_long_country["rating"] - lat_long_country["rating"].min()) / (lat_long_country["rating"].max() - lat_long_country["rating"].min())
lat_long_country["color"] = (lat_long_country["color"] * 255).astype(int)
lat_long_country["color"] = lat_long_country["color"].apply(lambda x: (255 - x, x, 0))

# amount of start from 0 to 5 using ⭐️ emoji
lat_long_country["stars"] = lat_long_country["rating"].apply(lambda x: "⭐️" * int(round(x)))


print(lat_long_country.head())

from bokeh.models import ColumnDataSource, HoverTool, LabelSet
from bokeh.plotting import figure, show

# Create a ColumnDataSource from your DataFrame
source = ColumnDataSource(lat_long_country)

# Create the figure
map = figure(x_range=(-4000000, 6000000), y_range=(-1000000, 7000000),
           x_axis_type="mercator", y_axis_type="mercator", width=800, height=600)

# Add the tile
map.add_tile("CARTODBPOSITRON")

# Add circle markers with hover capability
circles = map.scatter(x='longitude', y='latitude', size=10, 
                  source=source, fill_color='color', 
                  fill_alpha=0.8, line_color='white')

# Add HoverTool
hover = HoverTool(tooltips=[
    ('Country', '@country'),
    ('Name', '@name'),
    ('Rating', '@rating'),
    ('Stars', '@stars')
])

labels = LabelSet(x='longitude', y='latitude', text='name', x_offset=5, y_offset=5, source=source, border_line_color='black', background_fill_color='white')

map.add_layout(labels)
map.add_tools(hover)

# Show the plot
show(map)

   country      latitude     longitude                    name    rating  \
10      AR -4.638374e+06 -7.081776e+06               Argentina  4.200000   
12      AT  6.026747e+06  1.619707e+06                 Austria  3.834112   
13      AU -2.909486e+06  1.489178e+07               Australia  4.193551   
16      BA  5.452435e+06  1.968026e+06  Bosnia and Herzegovina  5.000000   
18      BD  2.715070e+06  1.005842e+07              Bangladesh  5.000000   

           color       stars  
10  (51, 204, 0)    ⭐️⭐️⭐️⭐️  
12  (75, 180, 0)    ⭐️⭐️⭐️⭐️  
13  (52, 203, 0)    ⭐️⭐️⭐️⭐️  
16   (0, 255, 0)  ⭐️⭐️⭐️⭐️⭐️  
18   (0, 255, 0)  ⭐️⭐️⭐️⭐️⭐️  


  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  lat_long_country["longitude"], lat_long_country["latitude"] = transform(inProj, outProj, lat_long_country["longitude"].values, lat_long_country["latitude"].values)


In [None]:
df_ratings = ratings_country_df.copy()
df_ratings["Date"] = pd.to_datetime(df_ratings["Date"])
# plot the biggest change in rating for each country, use the min and max rating
df_ratings = df_ratings.groupby(df_ratings["Country"]).agg({"Total Average Rating": ["first", "last"]})
df_ratings.columns = df_ratings.columns.droplevel()
df_ratings["change"] = df_ratings["last"] - df_ratings["first"]
df_ratings = df_ratings.sort_values(by="change", ascending=False)

# plot in bokeh table
from bokeh.models import ColumnDataSource, DataTable, TableColumn

source = ColumnDataSource(df_ratings)

columns = [TableColumn(field="Country", title="Country"),
              TableColumn(field="first", title="First Rating"),
              TableColumn(field="last", title="Last Rating"),
              TableColumn(field="change", title="Change")]

country_rating_change_table = DataTable(source=source, columns=columns, width=800, height=600)

show(country_rating_change_table)

# key metrics (most positive and negative change in rating) and also get the change in percentage
df_ratings["change_percentage"] = (df_ratings["change"] / df_ratings["first"]) * 100
most_positive_change = df_ratings["change"].idxmax()
most_negative_change = df_ratings["change"].idxmin()
most_positive_change_percentage = df_ratings["change_percentage"].idxmax()
most_negative_change_percentage = df_ratings["change_percentage"].idxmin()
# plot these two numbers
from bokeh.models import Div

div = Div(text=f"""
<div style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">
<h1>⬆{df_ratings.loc[most_positive_change, "change_percentage"]:.2f}% change in {most_positive_change} ratings</h1>
</div>
""")



# <h1>Most Negative Change in Rating</h1>
# <p>Country: {most_negative_change}</p>
# <p>Change: {df_ratings.loc[most_negative_change, "change"]}</p>
# <p>Change Percentage: {df_ratings.loc[most_negative_change, "change_percentage"]:.2f}%</p>

# <h1>Most Positive Change in Rating Percentage</h1>
# <p>Country: {most_positive_change_percentage}</p>
# <p>Change: {df_ratings.loc[most_positive_change_percentage, "change"]}</p>
# <p>Change Percentage: {df_ratings.loc[most_positive_change_percentage, "change_percentage"]:.2f}%</p>
# <h1>Most Negative Change in Rating Percentage</h1>
# <p>Country: {most_negative_change_percentage}</p>
# <p>Change: {df_ratings.loc[most_negative_change_percentage, "change"]}</p>
# <p>Change Percentage: {df_ratings.loc[most_negative_change_percentage, "change_percentage"]:.2f}%</p>

country_kpis = row(
    KPI(f"⬆{df_ratings.loc[most_positive_change, "change_percentage"]:.2f}% change in {most_positive_change} ratings"),
    KPI(f"⬇{df_ratings.loc[most_negative_change, "change_percentage"]:.2f}% change in {most_negative_change} ratings"),
)



# Layout

In [61]:
title = Div(text="<h1 style='color: blue;'>Dashboard: D&D</h2>", width=700)

# map column:
geographics_map_column = column(country_kpis, map, country_rating_change_table)

tab1 = TabPanel(child = revenue, title = "Sales Panel")
tab2 = TabPanel(child = review, title = "Review Panel")
tab3 = TabPanel(child = geographics_map_column, title = "Geograpics")

tabs_object = Tabs(tabs = [tab1, tab2, tab3])
layout = column(title, tabs_object)

# layout = column(title, revenue, review)


# save to html
from bokeh.io import output_file
output_file("dashboard.html")
show(layout)