In [1]:
import pandas as pd
from IPython.display import HTML, display
from tabulate import tabulate

# Notebook banner: title, tagline and author, rendered as HTML.
project_title = """
<div style="border-bottom: 2px solid black; margin-bottom: 20px; padding-bottom: 10px;">
    <h1 style="text-align: center; font-size: 50px">Dog Recommender App</h1>
    <p style="text-align: center">A simple app to recommend dog breeds based on character traits.</p>
    <p style="text-align: center">By Marisa Espinoza</p>
</div>
"""
display(HTML(project_title))

# data loading and processing
breeds = pd.read_csv('data/dog_breeds.csv')
# keep a single row per breed name
breeds = breeds.drop_duplicates(subset=['Breed'])

# Normalise the comma-separated trait list before one-hot encoding it:
# lower-case, trim the ends, and force every separator to exactly ", ".
# Without the last step an entry such as "loyal,friendly" or "loyal , friendly"
# would not be split on ', ' and would create bogus trait columns.
breeds['Character Traits'] = (
    breeds['Character Traits']
    .str.lower()
    .str.strip()
    .str.replace(r'\s*,\s*', ', ', regex=True)
)
# one 0/1 indicator column per distinct trait, indexed like `breeds`
traits_df = breeds['Character Traits'].str.get_dummies(sep=', ')

breeds = pd.concat([breeds, traits_df], axis=1)

# calculate average height and categorize size
# 'Height (in)' is split on '-' into a lower and an upper bound
breeds[['MinHeight (in)', 'MaxHeight (in)']] = breeds['Height (in)'].str.split('-', expand=True)
breeds['MinHeight (in)'] = breeds['MinHeight (in)'].astype(float)
breeds['MaxHeight (in)'] = breeds['MaxHeight (in)'].astype(float)
# row-wise mean of the two bounds
breeds['AvgHeight (in)'] = breeds[['MinHeight (in)', 'MaxHeight (in)']].mean(axis=1)

def categorize_height(h):
    """Map a height in inches to a size-category label.

    Parameters
    ----------
    h : float or None
        Height in inches.

    Returns
    -------
    str or None
        "Extra-Small" (< 12), "Small" (< 16), "Medium" (< 22),
        "Large" (< 28) or "Extra-Large" (28 and above).
        ``None`` when ``h`` is missing (``None`` or NaN), so that an unknown
        height is not silently reported as "Extra-Large".
    """
    # NaN compares False against every bound, which previously fell through
    # to the final "Extra-Large" branch; None raised a TypeError.
    if pd.isna(h):
        return None

    # exclusive upper bounds, checked from smallest to largest
    size_limits = (
        (12, "Extra-Small"),
        (16, "Small"),
        (22, "Medium"),
        (28, "Large"),
    )
    for upper_bound, label in size_limits:
        if h < upper_bound:
            return label
    return "Extra-Large"

# Label each breed with the size bucket of its average height, then
# drop unneeded columns (the raw height text and the two parsed bounds
# are no longer required once the average has been categorised).
breeds = (
    breeds
    .assign(**{'Size Category': breeds['AvgHeight (in)'].apply(categorize_height)})
    .drop(columns=['Country of Origin', 'Height (in)', 'MinHeight (in)', 'MaxHeight (in)'])
)

# print(tabulate(breeds, headers='keys', tablefmt='psql', showindex=False))


In [2]:
import plotly.express as px
import itertools
from plotly.subplots import make_subplots

# Section heading rendered above the charts.
html_content = """
<div>
    <h2>Dog Breed Traits Visual Analysis</h2>
    <p>This section provides visual insights into the character traits of various dog breeds.</p>
</div>
"""
display(HTML(html_content))

# visualizations

# Top 10 Common Traits
# Summing each indicator column counts the breeds that carry that trait.
trait_counts = traits_df.sum().sort_values(ascending=False).head(10)
trait_counts_df = trait_counts.reset_index()
trait_counts_df.columns = ['Trait', 'Count']

fig = px.bar(
    trait_counts_df,
    x='Trait',
    y='Count',
    title="Top 10 Most Common Traits",
    color='Trait',
    color_discrete_sequence=px.colors.qualitative.D3
)

# This figure has a single pair of axes, so only xaxis/yaxis are styled.
# (The xaxis2/yaxis2 entries that used to be here belonged to the
# two-panel subplot figure and referred to axes this chart does not have.)
fig.update_layout(
    plot_bgcolor= "rgba(0, 0, 0, 0.1)",
    font=dict(color="black"),
    title_x=0.5,
    title_font=dict(size=20, weight='bold'),
    showlegend=True,
    xaxis=dict(showline=True, linewidth=1, linecolor='black', mirror=True),
    yaxis=dict(showline=True, linewidth=1, linecolor='black', mirror=True),
    height=500,
    width=1000
)
fig.show()

# -----------------------------------------------------------
# Top Traits by Size Category

# define size order
size_order = ["Extra-Small", "Small", "Medium", "Large", "Extra-Large"]

# trait totals per size category (rows: size category, columns: traits)
traits_by_size = breeds.groupby('Size Category')[traits_df.columns].sum()

# top 3 raw count dataframe: long format, three highest counts per size
top_traits_df = (
    traits_by_size
    .reset_index()
    .melt(id_vars="Size Category", var_name="Trait", value_name="Count")
    .sort_values(['Size Category', 'Count'], ascending=[True, False])
    .groupby('Size Category')
    .head(3)
)

# top 3 percentage dataframe: totals divided by the number of breeds per size
size_counts = breeds['Size Category'].value_counts()
traits_by_size_pct = traits_by_size.div(size_counts, axis=0)

top_traits_pct_df = (
    traits_by_size_pct
    .reset_index()
    .melt(id_vars="Size Category", var_name="Trait", value_name="Percentage")
    .sort_values(["Size Category", "Percentage"], ascending=[True, False])
    .groupby("Size Category")
    .head(3)
)

# get all unique traits across all sizes
all_traits = sorted(top_traits_df["Trait"].unique())

# create a full combo of all sizes and traits
full_combos = pd.DataFrame(
    list(itertools.product(size_order, all_traits)),
    columns=["Size Category", "Trait"],
)

# merge and fill missing traits with 0
# (the left merge keeps the size_order x alphabetical-trait row order)
top_traits_df_full = pd.merge(
    full_combos, top_traits_df, on=["Size Category", "Trait"], how="left"
).fillna(0)

top_traits_pct_df_full = pd.merge(
    full_combos, top_traits_pct_df, on=["Size Category", "Trait"], how="left"
).fillna(0)

# remove rows where count or percentage is 0
count_is_nonzero = top_traits_df_full['Count'] != 0
top_traits_df = top_traits_df_full[count_is_nonzero]
pct_is_nonzero = top_traits_pct_df_full['Percentage'] != 0
top_traits_pct_df = top_traits_pct_df_full[pct_is_nonzero]


def _grouped_size_bar(frame, value_column, chart_title):
    """Grouped bar chart of `value_column` per size category, coloured by trait."""
    return px.bar(
        frame,
        x="Size Category",
        y=value_column,
        color="Trait",
        title=chart_title,
        category_orders={"Size Category": size_order},
        barmode="group",
        color_discrete_sequence=px.colors.qualitative.D3
    )


# plot counts
fig_counts = _grouped_size_bar(top_traits_df, "Count", "Top Traits per Size Category (Raw Counts)")

# plot percentages
fig_pct = _grouped_size_bar(top_traits_pct_df, "Percentage", "Top Traits per Size Category (Percentage)")

# subplot with both charts
fig = make_subplots(rows=1, cols=2, subplot_titles=("Raw Counts", "Percentages"))

# left panel: raw counts (these traces provide the legend entries)
for trace in fig_counts.data:
    fig.add_trace(trace, row=1, col=1)

# right panel: percentages, hidden from the legend
for trace in fig_pct.data:
    trace.showlegend = False
    fig.add_trace(trace, row=1, col=2)

# identical black frame for both panels' x and y axes
axis_frame = dict(showline=True, linewidth=1, linecolor='black', mirror=True)

fig.update_layout(
    plot_bgcolor= "rgba(0, 0, 0, 0.1)",
    font=dict(color="black"),
    title=dict(text="Top 3 Traits per Breed Size", x=0.5),
    title_font=dict(size=20, weight='bold'),
    showlegend=True,
    legend_title_text="Trait",
    bargap=0.2,
    **{axis_name: dict(axis_frame) for axis_name in ("xaxis", "yaxis", "xaxis2", "yaxis2")}
)

# same category ordering and axis title on both panels
for subplot_col in (1, 2):
    fig.update_xaxes(
        title_text="Size Category",
        categoryorder='array',
        categoryarray=size_order,
        row=1,
        col=subplot_col
    )

fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Percentage", tickformat=".0%", row=1, col=2)

fig.show()

# -----------------------------------------------------------
# Trait Co-occurrence Heatmap
# pairwise correlation between the trait indicator columns
correlation_map = traits_df.corr()

fig = px.imshow(
    correlation_map,
    zmin=-1,
    zmax=1,
    aspect="auto",
    color_continuous_scale='RdBu_r',
)

# thin gaps between cells plus a custom hover label
heatmap_hover = "Trait 1: %{x}<br>Trait 2: %{y}<br>Correlation Coefficient: %{z:.4f}<extra></extra>"
fig.update_traces(
    selector=dict(type='heatmap'),
    hovertemplate=heatmap_hover,
    xgap=1,
    ygap=1
)

# title with a smaller explanatory subtitle on a second line
heatmap_title = dict(
    text=(
        "Trait Co-occurrence Correlation<br>"
        "<span style='font-size:14px;'>How often two traits occur together across breeds</span>"
    ),
    x=0.5,
    xanchor='center',
    yanchor='top',
    font=dict(size=20, weight='bold'),
)

# colour bar spanning roughly half the plot height, centred vertically
heatmap_colorbar = dict(
    title="Correlation",
    lenmode="fraction",
    len=0.507,
    thicknessmode="pixels",
    thickness=20,
    y=0.5
)

fig.update_layout(
    plot_bgcolor= "rgba(0, 0, 0, 0)",
    font=dict(color="black"),
    title=heatmap_title,
    coloraxis_colorbar=heatmap_colorbar,
    xaxis=dict(showline=True, linewidth=1, linecolor='black', mirror=True),
    yaxis=dict(showline=True, linewidth=1, linecolor='black', mirror=True),
    height=700,
    width=1000,
    margin=dict(t=100)
)

fig.show()


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets

# Intro banner for the interactive recommender section.
# The heading is an <h2>; its closing tag was previously a mismatched </h1>.
app_text = """
<div style="border-top: 2px solid black; margin-top: 20px; padding-top: 10px;">
    <h2>Find your perfect match!</h2>
    <p>Select the traits you're most interested in having for your future four-legged friend. Then, we will provide the top 10 dog breeds that match what you're looking for.</p>
</div>
"""
display(HTML(app_text))

# recommend breeds ML function
def recommend_breeds(input_traits, traits_df, breeds_df, top_n=10):
    """Rank breeds by cosine similarity between their traits and the requested ones.

    Parameters
    ----------
    input_traits : iterable of str
        Requested trait names; each is lower-cased and stripped before it is
        matched against the columns of ``traits_df``. Names that match no
        column are ignored.
    traits_df : pandas.DataFrame
        One row per breed, one column per trait.
    breeds_df : pandas.DataFrame
        Frame providing the 'Breed' column, one entry per row of ``traits_df``.
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Breed' and 'Similarity', sorted by descending similarity and
        truncated to ``top_n`` rows.
    """
    wanted = {raw_trait.lower().strip() for raw_trait in input_traits}

    # 1 where the trait column was requested, 0 otherwise
    flags = [1 if column in wanted else 0 for column in traits_df.columns]
    query_vector = pd.Series(flags, index=traits_df.columns, dtype='int64')
    query_matrix = query_vector.values.reshape(1, -1)

    # one similarity score per breed row
    scores = cosine_similarity(query_matrix, traits_df.values).flatten()

    ranked = pd.DataFrame({
        'Breed': breeds_df['Breed'],
        'Similarity': scores
    }).sort_values(by='Similarity', ascending=False)

    return ranked.head(top_n)

# widgets
# checkboxes for each trait (one per trait column, all initially unticked)
traits = list(traits_df.columns)

checkboxes = [
    widgets.Checkbox(description=trait_name, value=False)
    for trait_name in traits
]

# four fixed-width columns of checkboxes
checkbox_grid_layout = widgets.Layout(
    grid_template_columns="repeat(4, 200px)",
    grid_gap="10px 20px",
    overflow='visible',
    margin='0px 0px 20px 0px'
)
checkbox_grid = widgets.GridBox(checkboxes, layout=checkbox_grid_layout)

# submit button
submit_button_layout = widgets.Layout(margin='0px 0px 20px 0px')
submit_button = widgets.Button(
    description="SUBMIT",
    button_style='primary',
    layout=submit_button_layout
)

# area that receives the recommendation results
output = widgets.Output()

# submit button click event
def reset_checkboxes():
    """Untick every checkbox in the module-level `checkboxes` list."""
    for box in checkboxes:
        box.value = False

def on_submit_button_clicked(b):
    """Handle a click on the submit button.

    Clears the output area, then either prints a prompt when no checkbox is
    ticked, or displays the selected traits, a heading and the top-10
    recommendation table, and finally unticks all checkboxes.

    Parameters
    ----------
    b : object
        The argument supplied by the click callback; not used.
    """
    chosen = [box.description for box in checkboxes if box.value]

    with output:
        output.clear_output()

        # nothing ticked: ask for a selection and leave the checkboxes as-is
        if len(chosen) == 0:
            print("Please select at least one trait.")
            return

        matches = recommend_breeds(chosen, traits_df, breeds, top_n=10)
        chosen_text = ', '.join(chosen)

        result_widgets = [
            widgets.HTML(f"<p style='text-align:center;'><b>You selected the following traits:</b> {chosen_text}</p>"),
            widgets.HTML("<h3 style='text-align:center;'>Here is a list of dog breeds that match your preferences:</h3>"),
            widgets.HTML(matches.to_html(index=False, border=0)),
        ]
        result_layout = widgets.Layout(align_items='center', width='100%')

        display(widgets.VBox(result_widgets, layout=result_layout))
        reset_checkboxes()

# run the handler whenever the submit button is clicked
submit_button.on_click(on_submit_button_clicked)

# display app: prompt, checkbox grid, submit button and results area, stacked
app_layout = widgets.Layout(
    align_items='center',
    width='100%',
    margin='20px 0px 40px 0px'
)
app_container = widgets.VBox(
    [
        widgets.HTML("<h2>I'm looking for a dog that is:</h2>"),
        checkbox_grid,
        submit_button,
        output,
    ],
    layout=app_layout
)
display(app_container)



VBox(children=(HTML(value="<h2>I'm looking for a dog that is:</h2>"), GridBox(children=(Checkbox(value=False, …