# Notebook Setup

## ✔ Get My BQ Credentials to Access the Dataset

## ✔ Load Directory Locations

In [1]:
import json
import os

# Check if the file exists and load the JSON file into a dictionary
file_path = r'C:\Users\mike\Develop\Projects\Code Notebook\Credentials\locations_conf.json'
if os.path.exists(file_path):
    with open(file_path, 'r') as f:
        locations_data = json.load(f)
    for key, value in locations_data.items():
        if key == 'BQ_Service_Key':
            # Mask the final part of the key for security
            value = value.rsplit('/', 1)[0] + '/***'
        print(f"{key}: {value}")
else:
    print(f"File not found: {file_path}")

Common_Funcs_Dir: /Users/mike/Develop/Projects/Code Notebook/Common/Functions
Credentials_Dir: /Users/mike/Develop/Projects/Code Notebook/Credentials
Rel_Pickes_Dir: ../.pickles
Pub_Data_Dir: '/Users/mike/Data/Public
BQ_Service_Key: /Users/mike/Develop/Conf/GCP Service Keys/***


## ✔ Connect to Google Cloud

In [2]:
from google.oauth2 import service_account

# Resolve the key path from the locations data
key_path = locations_data.get('BQ_Service_Key', 'default_key_path.json')

# Create credentials using the key file
credentials = service_account.Credentials.from_service_account_file(key_path)

## ✔ Notebook Functions

### Text Reformattor

In [3]:
def reformat_text(data: str) -> str:
    """
    Reformat the input text by performing the following:
    - Replace standalone hyphens with bullet points and add HTML <br> tags for new lines.
    - Preserve hyphens in hyphenated words (e.g., "flat-rate pricing").
    - Add a break after each sentence ends, retaining the period, but not splitting on floats.
    - Handle cases with multiple consecutive periods.
    - Remove leading white space prior to each new line.
    - Replace multiple spaces with a single space.
    - Add indentation to each bullet point.

    Args:
        data (str): The input string to be reformatted.

    Returns:
        str: The reformatted string.
    """
    import re

    # Replace standalone hyphens with bullet points and add HTML <br> tags for new lines
    data = re.sub(r'(?<!\w)-(?=\s)', '•', data)  # Replace standalone hyphens
    data = re.sub(r'•', '<br>•', data)

    # Add a break after each sentence ends, retaining the period, but not splitting on floats
    data = re.sub(r'(?<!\d)\.(?!\d)', '.<br>', data)

    # Handle cases with multiple consecutive periods (e.g., "...") by replacing them with a single period followed by a break
    data = re.sub(r'\.{2,}', '.<br>', data)

    # Remove leading white space prior to each new line
    data = re.sub(r'<br>\s+', '<br>', data)

    # Replace multiple spaces with a single space
    data = re.sub(r'\s{2,}', ' ', data)

    # Add indentation to each bullet point
    data = re.sub(r'<br>•', '<br>&nbsp;&nbsp;&nbsp;&nbsp;•', data)

    return data


# DATA OFFICE TECHNICAL SKILLS ASSESSMENT
This test is designed to give us a better understanding of how our candidates leverage their technical skills to answer business questions and communicate findings.
To complete the assignment you will have to work with the following Big Query public data set:
Chicago Taxi Trips dataset:
`bigquery-public-data.chicago_taxi_trips.taxi_trips`

# PART I

## ✔ Please submit your SQL code (follow [URL](https://github.com/mcancell/Py-ETL-Notebook/blob/main/Datasets/Chicago%20Taxi%20Trips/Chi_Taxi_Trip_Insights_Branch_Part_I.bqsql)):
showing how you approached the data to be able to answer the following two questions. Note that a query that returns a lot of data and needs to be exported/manipulated in Excel is not preferred. A query that just returns the answer requested and no other data is preferred.

[See Code Here](https://github.com/mcancell/Py-ETL-Notebook/blob/main/Datasets/Chicago%20Taxi%20Trips/Chi_Taxi_Trip_Insights_Branch_Part_I.bqsql)

## ✔ Load & Show Query

In [4]:
# Define the final filename as a variable
filename = 'Chi_Taxi_Trip_Insights_Branch_Part_I.bqsql'

# Dynamically construct the absolute path to the SQL file
sql_file_path = os.path.abspath(os.path.join('C:\\Users\\mike\\Develop\\Projects\\Code Notebook\\Datasets\\Chicago Taxi Trips', filename))

# Check if the file exists and load the SQL code into a variable
if os.path.exists(sql_file_path):
    with open(sql_file_path, 'r') as sql_file:
        sql_code = sql_file.read()
    print(f"SQL code loaded successfully from file: {filename}")
else:
    print(f"File not found: {sql_file_path}")

SQL code loaded successfully from file: Chi_Taxi_Trip_Insights_Branch_Part_I.bqsql


In [5]:
from IPython.display import display, HTML
import pygments
from pygments.lexers import SqlLexer
from pygments.formatters import HtmlFormatter
from pygments.style import Style
from pygments.token import Keyword, Name, Comment, String, Number, Operator, Punctuation

# Define a custom Pygments style
class CustomSQLStyle(Style):
    default_style = ""
    styles = {
        Comment: "italic #888888",  # Gray for comments
        Keyword: "bold #005cc5",  # Blue for keywords
        Name: "bold #22863a",  # Green for names
        String: "italic #d73a49",  # Red for strings
        Number: "bold #6f42c1",  # Purple for numbers
        Operator: "bold #e36209",  # Orange for operators
        Punctuation: "bold #24292e",  # Black for punctuation
    }

# Highlight the SQL code using Pygments with the custom style
formatter = HtmlFormatter(style=CustomSQLStyle, full=False, noclasses=True)
highlighted_sql = pygments.highlight(sql_code, SqlLexer(), formatter)

# Display the highlighted SQL code in the notebook
display(HTML(highlighted_sql))

In [6]:
import warnings
from pandas_gbq.exceptions import LargeResultsWarning

# Suppress the LargeResultsWarning
warnings.simplefilter('ignore', category=LargeResultsWarning)

# Import the pandas_gbq library
import pandas_gbq

# Define the SQL query
query = sql_code

# Read the data from BigQuery into a pandas DataFrame
Insights_Part_I = pandas_gbq.read_gbq(query, project_id=credentials.project_id, credentials=credentials)

# Display the first few rows of the dataframe
# print(Insights_Part_I.head())


Downloading: 100%|[32m██████████[0m|


## ✔ Execute the Query

In [7]:
import warnings
from pandas_gbq.exceptions import LargeResultsWarning

# Suppress the LargeResultsWarning
warnings.simplefilter('ignore', category=LargeResultsWarning)

# Import the pandas_gbq library
import pandas_gbq

# Define the SQL query
query = sql_code

# Read the data from BigQuery into a pandas DataFrame
Insights_Part_I = pandas_gbq.read_gbq(query, project_id=credentials.project_id, credentials=credentials)

# Display the first few rows of the dataframe
# print(Insights_Part_I.head())


Downloading: 100%|[32m██████████[0m|


### ✔ I.a-Largest Month-Over-Month Increase in Trips
Which three distinct taxi companies had the largest month-over-month increase in trips, and what were those months and trip amounts?

#### ✔ Read Metric to Frame

In [8]:
# Filter the DataFrame for I.a insights (Largest Month-Over-Month Increase in Trips)
insights_ia = Insights_Part_I[Insights_Part_I['Metric_Description'] == 'I.a-Largest Month-Over-Month Increase in Trips']

#### ✔ Show Table

##### ✔ Show as Markdown

In [9]:
from IPython.display import Markdown, display

# Generate a markdown table for insights_ia
def display_markdown_table(df):
    # Extract the metric description for the title
    title = f"### {df['Metric_Description'].iloc[0]}"
    
    # Create the markdown table header
    markdown_table = "| Taxi Company | Trip Month | Insight Description |\n"
    markdown_table += "|--------------|------------|----------------------|\n"
    
    # Populate the table rows
    for _, row in df.iterrows():
        markdown_table += f"| {row['Taxi_Company']} | {row['Trip_Month']} | {row['Metric_Insight']} |\n"
    
    # Combine the title and table
    markdown_output = f"{title}\n\n{markdown_table}"
    
    # Display the markdown table
    display(Markdown(markdown_output))

# Call the function to display the table
display_markdown_table(insights_ia)

### I.a-Largest Month-Over-Month Increase in Trips

| Taxi Company | Trip Month | Insight Description |
|--------------|------------|----------------------|
| Taxi Affiliation Services | 2014-03 | In  Mar 2014, trips increased by 115,543 (15.5%) compared to the previous month. |
| Chicago Carriage Cab Corp | 2016-07 | In  Jul 2016, trips increased by 131,272 (370.9%) compared to the previous month. |
| Flash Cab | 2016-01 | In  Jan 2016, trips increased by 276,654 (388.9%) compared to the previous month. |


##### ✔ Show as as Chart

In [10]:
from plotly import graph_objects as go  # Import Plotly graph_objects

# Create a new Plotly figure for the table
fig_iia = go.Figure()

# Add a table to the figure using the insights_ia DataFrame
fig_iia.add_trace(go.Table(
    header=dict(
        values=['<b>Taxi Company</b>', '<b>Trip Month</b>', '<b>Metric Insight</b>'],
        fill_color='grey',
        align='left',
        font=dict(size=12, color='white')
    ),
    cells=dict(
        values=[
            insights_ia['Taxi_Company'], 
            insights_ia['Trip_Month'], 
            insights_ia['Metric_Insight']
        ],
        fill_color='lightgrey',
        align='left',
        font=dict(size=11, color='black')
    )
))

# Update the layout to reduce white space below the table
fig_iia.update_layout(
    title={
        'text': f"<b>{insights_ia['Metric_Description'].iloc[0]}</b>",
        'y': 0.97,  # Position the title above the table
        'x': 0.5,   # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=16)
    },
    autosize=True,
    width=1500,  # Set a fixed width to accommodate all columns
    height=400,  # Reduce the height to minimize white space
    margin=dict(l=10, r=10, t=50, b=10)  # Adjust margins for better spacing
)

# Adjust column widths to fit the data
fig_iia.data[0].columnwidth = [1.5, 1, 5]  # Set proportional widths for Taxi Company, Trip Month, and Metric Insight

# Display the interactive table
fig_iia.show()


#### ✔ Show Some Potential Resons for Metric Deltas

In [11]:
from IPython.display import Markdown, display

# Extract the first row of Metric_Reasons for insights_ia
reasons_ia = insights_ia['Metric_Reasons'].iloc[0]
# Apply sentence case to each reason
formatted_reasons = '\n- '.join(reason.strip().capitalize() for reason in reasons_ia.split('\n- '))
# Format and display the reasons
formatted_reasons_ia = f"### Potential Reasons for {insights_ia['Metric_Description'].iloc[0]}:\n- {formatted_reasons}"
display(Markdown(formatted_reasons_ia))

### Potential Reasons for I.a-Largest Month-Over-Month Increase in Trips:
- Potential reasons could include:
- Seasonal changes,
- Marketing campaigns,
- Operational adjustments,
- Increased demand due to events,
- Improved service offerings,
- Increased fleet size,
- Acquisition or merger with former competitor.

### ✔ I.b-Largest Month-Over-Month Decrease in Fare Per Mile 

#### ✔ Read Metric into Frame

In [12]:
# Filter the DataFrame for I.b insights (Largest Month-Over-Month Decrease in Fare Per Mile)
insights_ib = Insights_Part_I[Insights_Part_I['Metric_Description'] == 'I.b-Largest Month-Over-Month Decrease in Fare Per Mile']

#### ✔ Show Table

##### ✔ Show as Markdown

In [13]:
from IPython.display import Markdown, display

# Generate a markdown table for insights_ib
def display_markdown_table_ib(df):
    # Extract the metric description for the title
    title = f"### {df['Metric_Description'].iloc[0]}"
    
    # Create the markdown table header
    markdown_table = "| Taxi Company | Trip Month | Insight Description |\n"
    markdown_table += "|--------------|------------|----------------------|\n"
    
    # Populate the table rows
    for _, row in df.iterrows():
        markdown_table += f"| {row['Taxi_Company']} | {row['Trip_Month']} | {row['Metric_Insight']} |\n"
    
    # Combine the title and table
    markdown_output = f"{title}\n\n{markdown_table}"
    
    # Display the markdown table
    display(Markdown(markdown_output))

# Call the function to display the table
display_markdown_table_ib(insights_ib)

### I.b-Largest Month-Over-Month Decrease in Fare Per Mile

| Taxi Company | Trip Month | Insight Description |
|--------------|------------|----------------------|
| Metro Jet Taxi A. | 2021-08 | In Aug 2021, fare per mile decreased by $50.08 (95.1%) compared to the previous month. |
| 4732 - Maude Lamy | 2016-04 | In Apr 2016, fare per mile decreased by $29.30 (90.2%) compared to the previous month. |
| Blue Ribbon Taxi Association Inc. | 2020-08 | In Aug 2020, fare per mile decreased by $25.25 (49.1%) compared to the previous month. |


##### ✔ Show as Chart

In [14]:
from plotly import graph_objects as go  # Import Plotly graph_objects

# Create a new Plotly figure for the table
fig_ib = go.Figure()

# Add a table to the figure using the insights_ib DataFrame
fig_ib.add_trace(go.Table(
    header=dict(
        values=['<b>Taxi Company</b>', '<b>Trip Month</b>', '<b>Metric Insight</b>'],
        fill_color='grey',
        align='left',
        font=dict(size=12, color='white')
    ),
    cells=dict(
        values=[
            insights_ib['Taxi_Company'], 
            insights_ib['Trip_Month'], 
            insights_ib['Metric_Insight']
        ],
        fill_color='lightgrey',
        align='left',
        font=dict(size=11, color='black')
    )
))

# Update the layout to reduce white space below the table
fig_ib.update_layout(
    title={
        'text': f"<b>{insights_ib['Metric_Description'].iloc[0]}</b>",
        'y': 0.97,  # Position the title above the table
        'x': 0.5,   # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=16)
    },
    autosize=True,
    width=1500,  # Set a fixed width to accommodate all columns
    height=400,  # Reduce the height to minimize white space
    margin=dict(l=10, r=10, t=50, b=10)  # Adjust margins for better spacing
)

# Adjust column widths to fit the data
fig_ib.data[0].columnwidth = [1.5, 1, 5]  # Set proportional widths for Taxi Company, Trip Month, and Metric Insight

# Display the interactive table
fig_ib.show()


#### ✔ Show Some Potential Reasons for Metric Deltas

In [15]:
from IPython.display import Markdown, display

# Extract the first row of Metric_Reasons for insights_ia
reasons_ib = insights_ib['Metric_Reasons'].iloc[0]
# Apply sentence case to each reason
formatted_reasons = '\n- '.join(reason.strip().capitalize() for reason in reasons_ia.split('\n- '))
# Format and display the reasons
formatted_reasons_ib = f"### Potential Reasons for {insights_ib['Metric_Description'].iloc[0]}:\n- {formatted_reasons}"
display(Markdown(formatted_reasons_ib))

### Potential Reasons for I.b-Largest Month-Over-Month Decrease in Fare Per Mile:
- Potential reasons could include:
- Seasonal changes,
- Marketing campaigns,
- Operational adjustments,
- Increased demand due to events,
- Improved service offerings,
- Increased fleet size,
- Acquisition or merger with former competitor.

## 2. TBD Executive Summary/Report of Findings
Submit an executive summary/report of your findings, clearly answering the questions above.

# PART II
This portion of the assignment gives our candidates creative freedom to look at this data set in any way they want.
There are no tricks here. This is simply meant to allow us to understand their ability to unearth insights and leverage visualizations to tell a story.

## Additional Analysis Same Dataset
Considering the context of the questions from part I, conduct an additional analysis using the same dataset and design a report that provides at least one additional insight, a trend or any other relevant detail that piques your interest.

This report should:

a. Clearly explain the value or potential use of that observation for someone who is interested in the answers to the questions above.  

b. Include at least one visualization.  

If you have any questions regarding the assignment please contact Noam Berns
noam.berns@ourbranch.com, Austin McCleary austin.mccleary@ourbranch.com and Carson
Wilshire at carson.wilshire@ourbranch.com
Please email your final submission to your Branch recruiter and cc the above three managers.

## ✔ Please submit your SQL code (follow [URL](https://github.com/mcancell/Py-ETL-Notebook/blob/main/Datasets/Chicago%20Taxi%20Trips/Chi_Taxi_Trip_Insights_Branch_Part_II.bqsql)):
Considering the context of the questions from part I, conduct an additional analysis using the same dataset and design a report that provides at least one additional insight, a trend or any other relevant detail that piques your interest.

[See Code Here](https://github.com/mcancell/Py-ETL-Notebook/blob/main/Datasets/Chicago%20Taxi%20Trips/Chi_Taxi_Trip_Insights_Branch_Part_II.bqsql)

### ✔ Load & Show Query

In [16]:
# Define the path to the SQL file
# Define the final filename as a variable
filename = 'Chi_Taxi_Trip_Insights_Branch_Part_II.bqsql'

# Dynamically construct the absolute path to the SQL file
sql_file_path = os.path.abspath(os.path.join('C:\\Users\\mike\\Develop\\Projects\\Code Notebook\\Datasets\\Chicago Taxi Trips', filename))

# Check if the file exists and load the SQL code into a variable
if os.path.exists(sql_file_path):
    with open(sql_file_path, 'r') as sql_file:
        sql_code = sql_file.read()
    print(f"SQL code loaded successfully from file: {filename}")
else:
    print(f"File not found: {sql_file_path}")

SQL code loaded successfully from file: Chi_Taxi_Trip_Insights_Branch_Part_II.bqsql


#### Show Query 

In [17]:
from IPython.display import display, HTML
import pygments
from pygments.lexers import SqlLexer
from pygments.formatters import HtmlFormatter
from pygments.style import Style
from pygments.token import Keyword, Name, Comment, String, Number, Operator, Punctuation

# Define a custom Pygments style
class CustomSQLStyle(Style):
    default_style = ""
    styles = {
        Comment: "italic #888888",  # Gray for comments
        Keyword: "bold #005cc5",  # Blue for keywords
        Name: "bold #22863a",  # Green for names
        String: "italic #d73a49",  # Red for strings
        Number: "bold #6f42c1",  # Purple for numbers
        Operator: "bold #e36209",  # Orange for operators
        Punctuation: "bold #24292e",  # Black for punctuation
    }

# Highlight the SQL code using Pygments with the custom style
formatter = HtmlFormatter(style=CustomSQLStyle, full=False, noclasses=True)
highlighted_sql = pygments.highlight(sql_code, SqlLexer(), formatter)

# Display the highlighted SQL code in the notebook
display(HTML(highlighted_sql))

## Execute the Query

In [18]:
import warnings
from pandas_gbq.exceptions import LargeResultsWarning

# Suppress the LargeResultsWarning
warnings.simplefilter('ignore', category=LargeResultsWarning)

# Import the pandas_gbq library
import pandas_gbq

# Define the SQL query
query = sql_code

# Read the data from BigQuery into a pandas DataFrame
Insights_Part_II = pandas_gbq.read_gbq(query, project_id=credentials.project_id, credentials=credentials)

# Display the first few rows of the dataframe
# print(Insights_Part_I.head())


Downloading: 100%|[32m██████████[0m|


### II.a. Largest MoM Decrease in Trips

#### ✔ Read Metric into Frame

In [19]:
# Filter the DataFrame for I.b insights (Largest Month-Over-Month Decrease in Fare Per Mile)
insights_iia = Insights_Part_II[Insights_Part_II['Metric_Description'] == 'II.a-Largest Month-Over-Month Decrease in Trips']

#### ✔ Show Table

##### ✔ Show as Markdown

In [20]:
# Call the function to display the table
display_markdown_table_ib(insights_iia)

### II.a-Largest Month-Over-Month Decrease in Trips

| Taxi Company | Trip Month | Insight Description |
|--------------|------------|----------------------|
| Flash Cab | 2014-11 | In Nov 2014, trips decreased by 365,892 (83.7%) compared to the previous month. |
| Taxi Affiliation Services | 2020-03 | In Mar 2020, trips decreased by 134,967 (47.8%) compared to the previous month. |
| Chicago Elite Cab Corp. | 2013-08 | In Aug 2013, trips decreased by 84,206 (60.1%) compared to the previous month. |


##### ✔ Show as Chart

In [21]:
from plotly import graph_objects as go  # Import Plotly graph_objects

# Create a new Plotly figure for the table
fig_iia = go.Figure()

# Add a table to the figure using the insights_iia DataFrame
fig_iia.add_trace(go.Table(
    header=dict(
        values=['<b>Taxi Company</b>', '<b>Trip Month</b>', '<b>Metric Insight</b>'],
        fill_color='grey',
        align='left',
        font=dict(size=12, color='white')
    ),
    cells=dict(
        values=[
            insights_iia['Taxi_Company'], 
            insights_iia['Trip_Month'], 
            insights_iia['Metric_Insight']
        ],
        fill_color='lightgrey',
        align='left',
        font=dict(size=11, color='black')
    )
))

# Update the layout to reduce white space below the table
fig_iia.update_layout(
    title={
        'text': f"<b>{insights_iia['Metric_Description'].iloc[0]}</b>",
        'y': 0.97,  # Position the title above the table
        'x': 0.5,   # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=16)
    },
    autosize=True,
    width=1500,  # Set a fixed width to accommodate all columns
    height=400,  # Reduce the height to minimize white space
    margin=dict(l=10, r=10, t=50, b=10)  # Adjust margins for better spacing
)

# Adjust column widths to fit the data
fig_iia.data[0].columnwidth = [1.5, 1, 5]  # Set proportional widths for Taxi Company, Trip Month, and Metric Insight

# Display the interactive table
fig_iia.show()


#### ✔ Show Some Potential Resons for Metric Deltas

In [22]:
from IPython.display import Markdown, display

# Extract the first row of Metric_Reasons for reasons_iia
reasons_iia = insights_iia['Metric_Reasons'].iloc[0]
# Apply sentence case to each reason
formatted_reasons = '\n- '.join(reason.strip().capitalize() for reason in reasons_iia.split('\n- '))
# Format and display the reasons
formatted_reasons = f"### Potential Reasons for {insights_iia['Metric_Description'].iloc[0]}:\n- {formatted_reasons}"
display(Markdown(formatted_reasons))

### Potential Reasons for II.a-Largest Month-Over-Month Decrease in Trips:
- Potential reasons could include:
- Seasonal changes,
- Weather conditions,
- Reduced demand due to events,
- Operational challenges,
- Changes in regulations,
- Decreased fleet size,
- Increased competition,
- Changes in service offerings,
- Economic factors affecting ridership,
- Changes in customer preferences

#### ✔ Visualization

##### ✔ Visualization: Month-over-Month Decrease in Trips by Taxi Company (II.a Insights)

In [23]:
# - The x-axis represents the taxi companies.
# - The y-axis shows the number of trips in the current month (Mon_Metric_Val).
# - The bars are color-coded based on the absolute percentage change in trips (abs(Metric_Mon_Pct_Chg)).
# - The text on the bars displays the percentage change in trips compared to the previous month.
# - A darker red color indicates a larger percentage decrease in trips.
# - Use this graph to identify which taxi companies experienced the most significant drop in trips and the magnitude of that change.

import plotly.express as px

# Create a bar chart
fig = px.bar(
    insights_iia,
    x='Taxi_Company',
    y='Mon_Metric_Val',
    color=insights_iia['Metric_Mon_Pct_Chg'].abs(),  # Use absolute value for color scale
    text=insights_iia['Metric_Mon_Pct_Chg'].apply(lambda x: f"({abs(x * 100):.0f}%)" if x < 0 else f"{x * 100:.0f}%"),
    title='Month-over-Month Decrease in Trips by Taxi Company',
    labels={'Mon_Metric_Val': 'Current Month Trips', 'Metric_Mon_Pct_Chg': 'MoM Change (%)'},
    color_continuous_scale=['orange', 'red'],  # Transition from orange to red
    range_color=[insights_iia['Metric_Mon_Pct_Chg'].abs().min(), insights_iia['Metric_Mon_Pct_Chg'].abs().max()]
)

# Update layout for better readability
fig.update_layout(
    xaxis_title='Taxi Company',
    yaxis_title='Current Month Trips',
    coloraxis_colorbar=dict(
        title='ABS MoM Change (%)',
        tickvals=[val for val in insights_iia['Metric_Mon_Pct_Chg'].abs().unique()],
        ticktext=[f"{val * 100:.0f}%" for val in insights_iia['Metric_Mon_Pct_Chg'].abs().unique()]
    ),
    title={
        'text': 'Month-over-Month Decrease in Trips by Taxi Company',
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend_title_text='Metric Change',
    template='plotly_white'
)

# Show the chart
fig.show()

### ✔ II.b-Largest Month-Over-Month Increase in Fare Per Mile


#### ✔ Read Metric into Frame

In [24]:
# Filter the DataFrame for the desired metric insights
insights_iib = Insights_Part_II[Insights_Part_II['Metric_Description'] == 'II.b-Largest Month-Over-Month Increase in Fare Per Mile']

#### ✔ Show Table

##### ✔ Show as Markdown

In [25]:
# Call the function to display the table
display_markdown_table_ib(insights_iib)

### II.b-Largest Month-Over-Month Increase in Fare Per Mile

| Taxi Company | Trip Month | Insight Description |
|--------------|------------|----------------------|
| 4732 - Maude Lamy | 2015-10 | In Oct 2015, fare per mile increased by $21.33 (191.0%) compared to the previous month. |
| Suburban Dispatch LLC | 2015-08 | In Aug 2015, fare per mile increased by $23.69 (313.7%) compared to the previous month. |
| Metro Jet Taxi A. | 2022-06 | In Jun 2022, fare per mile increased by $37.81 (120.5%) compared to the previous month. |


#### ✔ Show as Chart

In [26]:
from plotly import graph_objects as go  # Import Plotly graph_objects

# Create a new Plotly figure for the table
fig_iib = go.Figure()

# Add a table to the figure using the insights_iia DataFrame
fig_iib.add_trace(go.Table(
    header=dict(
        values=['<b>Taxi Company</b>', '<b>Trip Month</b>', '<b>Metric Insight</b>'],
        fill_color='grey',
        align='left',
        font=dict(size=12, color='white')
    ),
    cells=dict(
        values=[
            insights_iib['Taxi_Company'], 
            insights_iib['Trip_Month'], 
            insights_iib['Metric_Insight']
        ],
        fill_color='lightgrey',
        align='left',
        font=dict(size=11, color='black')
    )
))

# Update the layout to reduce white space below the table
fig_iib.update_layout(
    title={
        'text': f"<b>{insights_iib['Metric_Description'].iloc[0]}</b>",
        'y': 0.97,  # Position the title above the table
        'x': 0.5,   # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=16)
    },
    autosize=True,
    width=1500,  # Set a fixed width to accommodate all columns
    height=400,  # Reduce the height to minimize white space
    margin=dict(l=10, r=10, t=50, b=10)  # Adjust margins for better spacing
)

# Adjust column widths to fit the data
fig_iib.data[0].columnwidth = [1.5, 1, 5]  # Set proportional widths for Taxi Company, Trip Month, and Metric Insight

# Display the interactive table
fig_iib.show()


#### ✔ Show Some Potential Resons for Metric Deltas

In [27]:
from IPython.display import Markdown, display

# Extract the first row of Metric_Reasons for reasons_iia
reasons_iib = insights_iib['Metric_Reasons'].iloc[0]
# Apply sentence case to each reason
formatted_reasons = '\n- '.join(reason.strip().capitalize() for reason in reasons_iib.split('\n- '))
# Format and display the reasons
formatted_reasons = f"### Potential Reasons for {insights_iib['Metric_Description'].iloc[0]}:\n- {formatted_reasons}"
display(Markdown(formatted_reasons))

### Potential Reasons for II.b-Largest Month-Over-Month Increase in Fare Per Mile:
- Potential reasons could include:
- Increased demand for premium services,
- Adjustments to pricing models,
- Reduced availability of discounts or promotions,
- Longer trips with higher fares,
- Seasonal or event-driven demand surges,
- Operational changes to optimize revenue,
- Changes in trip patterns or routes,
- Economic factors influencing pricing,
- Reduced competition or market consolidation.

#### ✔ Visualization

##### ✔ Visualization: Percent Difference from Average (II.b Insights)

In [28]:
# This visualization provides insights into the increase in fare per mile for each taxi company.
# - The x-axis represents the taxi companies.
# - The y-axis shows the increase in fare per mile.
# - The bars are color-coded to indicate the magnitude of the increase:
#   - Blue for small increases, transitioning to green for large increases.
# - The text on the bars displays the exact increase in fare per mile for better readability.
# - This chart helps identify which taxi companies experienced significant increases in fare per mile.

import plotly.express as px

# Ensure Metric_Delta is numeric using .loc to avoid SettingWithCopyWarning
insights_iib.loc[:, 'Metric_Delta'] = insights_iib['Metric_Delta'].astype(float)

# Create a bar chart for Metric_Delta
fig_fare_increase = px.bar(
    insights_iib,
    x='Taxi_Company',
    y='Metric_Delta',
    color='Metric_Delta',
    text=insights_iib['Metric_Delta'].round(2).astype(str),
    title='Increase in Fare Per Mile (by Company) (II.b Insights)',
    labels={
        'Taxi_Company': 'Taxi Company',
        'Metric_Delta': 'Increase in Fare Per Mile ($)'
    },
    color_continuous_scale=['blue', 'green'],  # Blue for small, green for large
    range_color=[0, insights_iib['Metric_Delta'].max()],
    template='plotly_white'
)

# Update layout for better readability and adjust y-axis range
fig_fare_increase.update_layout(
    xaxis_title='Taxi Company',
    yaxis_title='Increase in Fare Per Mile ($)',
    yaxis=dict(range=[0, insights_iib['Metric_Delta'].max() * 1.1]),
    title={
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    coloraxis_colorbar=dict(
        title='Increase in Fare ($)',
        tickvals=[0, insights_iib['Metric_Delta'].max() / 2, insights_iib['Metric_Delta'].max()],
        ticktext=['Small', 'Moderate', 'Large']
    )
)

# Add text formatting for better readability
fig_fare_increase.update_traces(textposition='outside')

# Show the chart
fig_fare_increase.show()

# PART II Extra Analysis - TBD

#### Some Info About the Dataset
This query analyzes Chicago taxi trip data by aggregating and joining it with geographic information with City info like Census Pop. 
    It calculates various metrics such as average speed, revenue, and trip density, and includes window functions 
    to rank and compare performance across cities and companies.

In [29]:
# Define the path to the SQL file
# Define the file name as a variable for easier maintenance
file_name = 'Chi_Taxi_Trip_Insights_Branch_Part_II_Extra.bqsql'

# Dynamically construct the absolute path to the SQL file
sql_file_path = os.path.abspath(os.path.join('C:\\Users\\mike\\Develop\\Projects\\Code Notebook\\Datasets\\Chicago Taxi Trips', file_name))

# Check if the file exists and load the SQL code into a variable
if os.path.exists(sql_file_path):
    with open(sql_file_path, 'r') as sql_file:
        sql_code = sql_file.read()
    print("SQL code loaded successfully.")
else:
    print(f"File not found: {sql_file_path}")

SQL code loaded successfully.


In [30]:
from IPython.display import display, HTML
import pygments
from pygments.lexers import SqlLexer
from pygments.formatters import HtmlFormatter
from pygments.style import Style
from pygments.token import Keyword, Name, Comment, String, Number, Operator, Punctuation

# Define a custom Pygments style
class CustomSQLStyle(Style):
    default_style = ""
    styles = {
        Comment: "italic #888888",  # Gray for comments
        Keyword: "bold #005cc5",  # Blue for keywords
        Name: "bold #22863a",  # Green for names
        String: "italic #d73a49",  # Red for strings
        Number: "bold #6f42c1",  # Purple for numbers
        Operator: "bold #e36209",  # Orange for operators
        Punctuation: "bold #24292e",  # Black for punctuation
    }

# Highlight the SQL code using Pygments with the custom style
formatter = HtmlFormatter(style=CustomSQLStyle, full=False, noclasses=True)
highlighted_sql = pygments.highlight(sql_code, SqlLexer(), formatter)

# Display the highlighted SQL code in the notebook
display(HTML(highlighted_sql))

## ✔ Execute the Query

In [31]:
import warnings
from pandas_gbq.exceptions import LargeResultsWarning

# Suppress the LargeResultsWarning
warnings.simplefilter('ignore', category=LargeResultsWarning)

# Import the pandas_gbq library
import pandas_gbq
import os
import pandas as pd

# Define the SQL query
query = sql_code

# Check if the Parquet file exists
parquet_file_path = 'Insights_Part_III.parquet'

if os.path.exists(parquet_file_path):
    # Read the data from the Parquet file
    Insights_Part_III = pd.read_parquet(parquet_file_path)
    print("Data loaded from Parquet file.")
else:
    # Read the data from BigQuery into a pandas DataFrame
    Insights_Part_III = pandas_gbq.read_gbq(query, project_id=credentials.project_id, credentials=credentials)
    print("Data loaded from BigQuery.")

# Display the first few rows of the dataframe
print(Insights_Part_III.head())


Data loaded from Parquet file.
               trip_quarter taxi_company             city  city_population  \
0 2023-10-01 00:00:00+00:00    Flash Cab  Near North Side          85711.0   
1 2023-10-01 00:00:00+00:00    Flash Cab     Lincoln Park          66959.0   
2 2023-10-01 00:00:00+00:00    Flash Cab     Chicago Loop          33442.0   
3 2023-10-01 00:00:00+00:00    Flash Cab  Lower West Side          34410.0   
4 2023-10-01 00:00:00+00:00    Flash Cab  Near South Side          22401.0   

   total_trips  unique_taxis  total_trip_seconds  avg_trip_seconds  \
0       167235           457           161349311        964.805878   
1       163872           456           157591794        961.676150   
2       155684           457           150731141        968.186461   
3       151000           458           148884170        985.987881   
4       149599           458           147309305        984.694450   

   total_trip_miles  avg_trip_miles  ...  revenue_rank_within_city  \
0        

### Save to Results to a Local Parquet File
for faster loading

In [32]:
# Save insights_ia DataFrame to a Parquet file
Insights_Part_III.to_parquet('Insights_Part_III.parquet', index=False)


print("DataFrames have been saved to Parquet files successfully.")

DataFrames have been saved to Parquet files successfully.
