**Project by** Dinis Pinto (20240612), Joana Rodrigues (20240603), João Marques (20240656), and Mara Simões (20240326) - **Group 27**.

# APP

## EDA

In [1]:
pip install dash pandas plotly numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import dash
from dash import dcc, html, Input, Output, State
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans




app = dash.Dash(__name__, suppress_callback_exceptions=True)



def load_data(preprocessed=False):
    """Load the dataset, either raw or preprocessed."""
    if preprocessed:
        df = pd.read_csv('Preprocessed_ABCDEats_DATASET.csv') 
    else:
        df = pd.read_csv('DM2425_ABCDEats_DATASET.csv')
        
        # Preprocessing steps if raw dataset is loaded
        df['main_region'] = df['customer_region'].astype(str).str[0]
        df['customer_age'].fillna(df['customer_age'].mean(), inplace=True)
        df['HR_0'].fillna(0, inplace=True)
        df['first_order'].fillna(df['first_order'].mean(), inplace=True)
        
        dow_cols = [f'DOW_{i}' for i in range(7)]
        df['total_DOW'] = df[dow_cols].sum(axis=1)
    
    return df

def enhance_rfm_analysis(df):
    """Enhanced RFM analysis with clustering"""
    
    # Calculate RFM metrics
    rfm_data = pd.DataFrame({
        'recency': df['last_order'],
        'frequency': df['daysweek'],
        'monetary': df[df.columns[df.columns.str.startswith('CUI_')]].sum(axis=1)
    })
    
    # Standardize for clustering
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm_data)
    
    # Apply clustering
    kmeans = KMeans(n_clusters=4, random_state=42)
    rfm_data['Segment'] = kmeans.fit_predict(rfm_scaled)
    
    return rfm_data

def analyze_cuisine_time_patterns(df):
    """Analyze cuisine ordering patterns by time"""
    cuisine_cols = [col for col in df.columns if col.startswith('CUI_')]
    hour_cols = [f'HR_{i}' for i in range(24)]
    
    cuisine_patterns = {}
    for cuisine in cuisine_cols:
        cuisine_orders = df[df[cuisine] > 0][hour_cols].sum()
        cuisine_patterns[cuisine.replace('CUI_', '')] = cuisine_orders
    
    return pd.DataFrame(cuisine_patterns)


def enhance_time_pattern_analysis(df):
    """Analyze detailed time patterns for different segments"""
    # Calculate peak hours by segment
    segment_hours = {}
    for segment in ['high-value', 'regular', 'occasional']:
        if segment == 'high-value':
            mask = df[df.columns[df.columns.str.startswith('CUI_')]].sum(axis=1) > 50
        elif segment == 'regular':
            mask = (df[df.columns[df.columns.str.startswith('CUI_')]].sum(axis=1) > 20) & \
                  (df[df.columns[df.columns.str.startswith('CUI_')]].sum(axis=1) <= 50)
        else:
            mask = df[df.columns[df.columns.str.startswith('CUI_')]].sum(axis=1) <= 20
            
        segment_df = df[mask]
        hour_cols = [f'HR_{i}' for i in range(24)]
        segment_hours[segment] = segment_df[hour_cols].sum()
    
    return segment_hours

def calculate_cuisine_correlations(df):
    """Calculate correlations between cuisine types"""
    cuisine_cols = [col for col in df.columns if col.startswith('CUI_')]
    correlations = df[cuisine_cols].corr()
    return correlations


cuisine_data = pd.DataFrame({
    'cuisine': ['Asian', 'American', 'Italian', 'Desserts', 'Indian', 'Others'],
    'chains': [15, 14, 8, 7, 3, 3],
    'independent': [13, 8, 10, 5, 7, 7]
})

# percentages for restaurant types
cuisine_data['total'] = cuisine_data['chains'] + cuisine_data['independent']
cuisine_data['chains_pct'] = (cuisine_data['chains'] / cuisine_data['total'] * 100).round(1)
cuisine_data['independent_pct'] = (cuisine_data['independent'] / cuisine_data['total'] * 100).round(1)

def calculate_rfm_metrics(df):
    """Calculate RFM metrics for visualization"""
    rfm_metrics = {
        'recency': df['last_order'].describe(),
        'frequency': df['total_DOW'].describe(), 
        'monetary': df[df.columns[df.columns.str.startswith('CUI_')]].sum(axis=1).describe()
    }
    return rfm_metrics


def get_segment_characteristics():
    """Return segment characteristics"""
    return {
        'High Value': {
            'avg_order': 75,
            'orders_month': 8.5,
            'promo_usage': 65,
            'top_cuisines': ['Cafe/Desserts', 'Asian', 'American'],
            'description': 'Customers who order frequently and spend significantly more per order. They show high engagement with promotions and prefer premium dining options.'
        },
        'Regular': {
            'avg_order': 45,
            'orders_month': 4.2,
            'promo_usage': 45,
            'top_cuisines': ['Asian', 'American', 'Italian'],
            'description': 'Consistent customers with moderate spending habits. They order regularly but are more price-sensitive than high-value customers.'
        },
        'Occasional': {
            'avg_order': 25,
            'orders_month': 1.8,
            'promo_usage': 25,
            'top_cuisines': ['Fast Food', 'Asian', 'American'],
            'description': 'Infrequent customers with lower average order values. They tend to be more price-sensitive and often order during promotions.'
        }
    }

def create_layout():
    df = load_data()
    regions = ['2', '4', '8'] 
    
    return html.Div([
        html.H1("ABCDEats Customer Analysis Dashboard", 
                style={'textAlign': 'center', 'padding': '20px'}),
        
        dcc.Tabs([
            # Tab 1: Time Patterns
            dcc.Tab(label='Time Patterns', children=[
                html.Div([
                    html.Div([
                        html.H3("Daily Order Distribution"),
                        dcc.Graph(id='daily-pattern'),
                        html.H3("Hourly Order Distribution"),
                        dcc.Graph(id='hourly-pattern'),
                        html.Div(id='time-insights', 
                                style={'padding': '20px', 'backgroundColor': '#f0f0f0', 'margin': '10px'}),
                        html.H3("Cuisine Order Patterns Analysis", 
                                style={'textAlign': 'center', 'marginTop': '30px'}),
                        html.Div([
                            # Left comparison section
                            html.Div([
                                html.H4("Chart 1"),
                                dcc.Dropdown(
                                    id='cuisine-dropdown-1',
                                    options=[],
                                    value=None,
                                    placeholder="Select primary cuisine"
                                ),
                                dcc.Dropdown(
                                    id='cuisine-dropdown-1-compare',
                                    options=[],
                                    value=None,
                                    placeholder="Select cuisine to compare (optional)",
                                    style={'marginTop': '10px'}
                                ),
                                dcc.Graph(id='cuisine-pattern-1'),
                                html.Div(id='cuisine-insights-1',
                                        style={'padding': '15px', 'backgroundColor': '#f8f9fa', 
                                            'borderRadius': '5px', 'marginTop': '10px'})
                            ], style={'width': '48%', 'display': 'inline-block'}),
                            
                            # Right comparison section
                            html.Div([
                                html.H4("Chart 2"),
                                dcc.Dropdown(
                                    id='cuisine-dropdown-2',
                                    options=[],
                                    value=None,
                                    placeholder="Select primary cuisine"
                                ),
                                dcc.Dropdown(
                                    id='cuisine-dropdown-2-compare',
                                    options=[],
                                    value=None,
                                    placeholder="Select cuisine to compare (optional)",
                                    style={'marginTop': '10px'}
                                ),
                                dcc.Graph(id='cuisine-pattern-2'),
                                html.Div(id='cuisine-insights-2',
                                        style={'padding': '15px', 'backgroundColor': '#f8f9fa', 
                                            'borderRadius': '5px', 'marginTop': '10px'})
                            ], style={'width': '48%', 'display': 'inline-block', 'float': 'right'})
                        ])
                    ], style={'width': '75%', 'display': 'inline-block', 'vertical-align': 'top'}),
                    
                    html.Div([
                        html.H3("Filters"),
                        html.Label("Select Day Range:"),
                        dcc.RangeSlider(
                            id='day-range',
                            min=0,
                            max=6,
                            step=1,
                            marks={0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 
                                  4: 'Fri', 5: 'Sat', 6: 'Sun'},
                            value=[0, 6]
                        ),
                        html.Br(),
                        html.Label("Select Hour Range:"),
                        dcc.RangeSlider(
                            id='hour-range',
                            min=0,
                            max=23,
                            step=1,
                            marks={i: f'{i}:00' for i in range(0, 24, 4)},
                            value=[0, 23]
                        ),
                        html.Br(),
                        html.Label("Select Region:"),
                        dcc.Dropdown(
                            id='time-region',
                            options=[{'label': f'Region {r}', 'value': r} for r in regions],
                            value='all',
                            multi=True
                        )
                    ], style={'width': '20%', 'float': 'right', 'padding': '20px'})
                ])
            ]),
            
            # Tab 2: Cuisine Analysis
            dcc.Tab(label='Cuisine Analysis', children=[
                html.Div([
                    html.Div([
                        html.H3("Regional Cuisine Preferences"),
                        dcc.Graph(id='cuisine-comparison'),
                        html.H3("Cuisine Distribution"),
                        dcc.Graph(id='cuisine-distribution')
                    ], style={'width': '75%', 'display': 'inline-block'}),
                    
                    html.Div([
                        html.H3("Analysis Controls"),
                        html.Label("Select Regions to Compare:"),
                        dcc.Dropdown(
                            id='cuisine-regions',
                            options=[{'label': f'Region {r}', 'value': r} for r in regions],
                            value=[regions[0], regions[1]],
                            multi=True
                        ),
                        html.Br(),
                        html.Label("Value Type:"),
                        dcc.RadioItems(
                            id='cuisine-value-type',
                            options=[
                                {'label': 'Absolute Values', 'value': 'absolute'},
                                {'label': 'Relative Values (100%)', 'value': 'relative'}
                            ],
                            value='absolute',
                            style={'margin': '10px 0'}
                        ),
                        html.Label("Select Visualization Type:"),
                        dcc.RadioItems(
                            id='cuisine-viz-type',
                            options=[
                                {'label': 'Bar Chart', 'value': 'bar'},
                                {'label': 'Radar Chart', 'value': 'radar'},
                                {'label': 'Heatmap', 'value': 'heatmap'}
                            ],
                            value='bar'
                        ),
                        html.Br(),
                        html.Label("Top N Cuisines:"),
                        dcc.Slider(
                            id='top-n-cuisines',
                            min=5,
                            max=15,
                            step=1,
                            value=10,
                            marks={i: str(i) for i in range(5, 16, 5)}
                        )
                    ], style={'width': '20%', 'float': 'right', 'padding': '20px'}),

                    html.Div([
                        html.H3('Cuisine Correlations'),
                        dcc.Slider(
                            id='cuisine-correlation-threshold',
                            min=0, max=1, step=0.1, value=0.5,
                            marks={i/10: str(i/10) for i in range(11)}
                        ),
                        dcc.Graph(id='cuisine-correlation-heatmap')
                    ], style={'marginBottom': '30px'}),

                    html.Div([
                        html.H3('Time Pattern Analysis'),
                        dcc.Dropdown(
                            id='cuisine-selection',
                            multi=True,
                            placeholder='Select cuisines to compare',
                            value=None  
                        ),
                        dcc.RadioItems(
                            id='time-aggregation',
                            options=[
                                {'label': 'Hourly', 'value': 'hour'},
                                {'label': 'Day Parts', 'value': 'daypart'}
                            ],
                            value='hour'
                        ),
                        dcc.Graph(id='time-pattern-analysis')
                    ])
                ])
            ]),
            
            # Tab 3: Restaurant Types
            dcc.Tab(label='Restaurant Types', children=[
                html.Div([
                    html.H3('Restaurant Type Distribution'),
                    html.Div([
                        html.H4('Visualization Type:'),
                        dcc.RadioItems(
                            id='visualization-type',
                            options=[
                                {'label': 'Absolute Values', 'value': 'absolute'},
                                {'label': 'Relative Values (100%)', 'value': 'relative'}
                            ],
                            value='absolute',
                            style={'margin': '10px 0'}
                        )
                    ]),
                    dcc.Graph(id='cuisine-graph'),
                    html.Div([
                        html.H4('Restaurant Type Filter:'),
                        dcc.Dropdown(
                            id='restaurant-type',
                            options=[
                                {'label': 'All', 'value': 'all'},
                                {'label': 'Chain Restaurants', 'value': 'chains'},
                                {'label': 'Independent Restaurants', 'value': 'independent'}
                            ],
                            value='all'
                        )
                    ], style={'width': '50%', 'margin': '20px auto'})
                ])
            ]),

                # Tab 4: Customer Segments
                dcc.Tab(label='Customer Segments', children=[
                    html.Div([
                        html.H3('Customer Segmentation Analysis', 
                                style={'textAlign': 'center', 'marginBottom': '20px'}),

                        # RFM plot before the individual distributions (overview of all RFM metrics together before showing the individual breakdowns)
                        html.Div([
                            html.H4('Combined RFM Analysis', style={'marginBottom': '15px'}),
                            dcc.Graph(id='rfm-plot')  
                        ], style={'marginBottom': '30px'}),
                        
                        # RFM Overview Cards
                        html.Div([
                            html.Div([
                                html.H4('Recent Customers', style={'marginBottom': '10px'}),
                                html.P('Orders in last 10 days'),
                                html.H2('47.5%', style={'color': '#4a90e2', 'marginBottom': '15px'}),
                                dcc.Graph(id='recency-dist')
                            ], className='segment-card', style={'width': '30%', 'padding': '20px', 'boxShadow': '0px 0px 10px rgba(0,0,0,0.1)'}),
                            html.Div([
                                html.H4('Frequent Customers', style={'marginBottom': '10px'}),
                                html.P('More than 4 orders'),
                                html.H2('33.3%', style={'color': '#4a90e2', 'marginBottom': '15px'}),
                                dcc.Graph(id='frequency-dist')
                            ], className='segment-card', style={'width': '30%', 'padding': '20px', 'boxShadow': '0px 0px 10px rgba(0,0,0,0.1)'}),
                            html.Div([
                                html.H4('High Value Customers', style={'marginBottom': '10px'}),
                                html.P('Spent more than 50 m.u.'),
                                html.H2('28.2%', style={'color': '#4a90e2', 'marginBottom': '15px'}),
                                dcc.Graph(id='monetary-dist')
                            ], className='segment-card', style={'width': '30%', 'padding': '20px', 'boxShadow': '0px 0px 10px rgba(0,0,0,0.1)'})
                        ], style={'display': 'flex', 'justifyContent': 'space-between', 'margin': '20px'}),
                        
                        # Segment Distribution and Comparison
                        html.Div([
                            html.Div([
                                html.H4('Overall Segment Distribution'),
                                dcc.Graph(id='segment-pie-chart')
                            ], style={'width': '48%', 'display': 'inline-block'}),
                            
                            html.Div([
                                html.H4('Segment Metrics Comparison'),
                                dcc.Graph(id='segment-comparison-chart')
                            ], style={'width': '48%', 'display': 'inline-block', 'float': 'right'})
                        ], style={'margin': '20px 0'}),
                        
                        # Segment Details Section
                        html.Div([
                            html.H4('Segment Details', 
                                style={'marginTop': '30px', 'marginBottom': '20px'}),
                            dcc.Tabs(id='segment-detail-tabs', value='high-value', children=[
                                dcc.Tab(label='High Value', value='high-value'),
                                dcc.Tab(label='Regular', value='regular'),
                                dcc.Tab(label='Occasional', value='occasional'),
                            ]),
                            html.Div(id='segment-detail-content', style={'marginTop': '1rem'})
                        ], style={'marginTop': '1rem'}),
                        
                        # Customer Segment Simulator
                        html.Div([
                            html.H4('Customer Segment Simulator', 
                                style={'marginTop': '30px', 'marginBottom': '20px'}),
                            html.Div([
                                html.Div([
                                    html.Label('Recency (days since last order):'),
                                    dcc.Slider(
                                        id='recency-slider',
                                        min=0,
                                        max=90,
                                        value=30,
                                        marks={i: str(i) for i in range(0, 91, 15)}
                                    )
                                ], style={'marginBottom': '20px'}),
                                html.Div([
                                    html.Label('Frequency (orders per month):'),
                                    dcc.Slider(
                                        id='frequency-slider',
                                        min=0,
                                        max=20,
                                        value=4,
                                        marks={i: str(i) for i in range(0, 21, 5)}
                                    )
                                ], style={'marginBottom': '20px'}),
                                html.Div([
                                    html.Label('Monetary (average order value):'),
                                    dcc.Slider(
                                        id='monetary-slider',
                                        min=0,
                                        max=200,
                                        value=50,
                                        marks={i: str(i) for i in range(0, 201, 50)}
                                    )
                                ]),
                                html.Div(id='segment-prediction', 
                                        style={'marginTop': '20px', 'padding': '15px', 
                                            'backgroundColor': '#f0f0f0', 'borderRadius': '5px'})
                            ])
                        ], style={'margin': '20px', 'padding': '20px', 'border': '1px solid #ddd'})
                    ])
                ])
            ])
        ])


@app.callback(
    Output('rfm-plot', 'figure'),
    Input('dummy-input', 'children')
)
def update_rfm_plot(_):
    df = load_data(preprocessed=True)  
    rfm_data = enhance_rfm_analysis(df)  
    
    fig = go.Figure()
    
    # Traces for each RFM metric 
    for metric in ['recency', 'frequency', 'monetary']:
        fig.add_trace(go.Histogram(
            x=rfm_data[metric],
            name=metric.capitalize(),
            nbinsx=20,
            opacity=0.75
        ))
    
    fig.update_layout(
        title='RFM Distributions',
        barmode='overlay',
        height=400,
        showlegend=True,
        plot_bgcolor='white'
    )
    
    return fig

@app.callback(
    Output('segment-overview-plot', 'figure'),
    Input('dummy-input', 'children')
)
def update_segment_overview(dummy):
    # Sample segment data
    segments = ['High Value', 'Regular', 'Occasional']
    metrics = {
        'Average Order Value': [75, 45, 25],
        'Orders per Month': [8.5, 4.2, 1.8],
        'Retention Rate %': [85, 65, 40]
    }
    
    fig = go.Figure()
    
    # bars for each metric
    colors = ['#2ecc71', '#3498db', '#e74c3c']
    for i, (metric, values) in enumerate(metrics.items()):
        fig.add_trace(go.Bar(
            name=metric,
            x=segments,
            y=values,
            marker_color=colors[i]
        ))
    
    fig.update_layout(
        title='Segment Comparison',
        barmode='group',
        height=400,
        plot_bgcolor='white',
        showlegend=True
    )
    
    return fig

@app.callback(
    Output('segment-prediction', 'children'),
    [Input('recency-slider', 'value'),
     Input('frequency-slider', 'value'),
     Input('monetary-slider', 'value')]
)
def predict_segment(recency, frequency, monetary):
    
    score = (90 - recency) * 0.3 + frequency * 0.3 + monetary * 0.4
    
    if score > 70:
        segment = "High Value"
        color = "#2ecc71"
    elif score > 40:
        segment = "Regular"
        color = "#3498db"
    else:
        segment = "Occasional"
        color = "#e74c3c"
    
    return html.Div([
        html.H4(f"Predicted Segment: {segment}", 
                style={'color': color, 'fontWeight': 'bold'}),
        html.P([
            "Based on: ",
            html.Br(),
            f"Recency: {recency} days since last order",
            html.Br(),
            f"Frequency: {frequency} orders per month",
            html.Br(),
            f"Monetary: ${monetary} average order value"
        ]),
        html.Div(f"Customer Score: {score:.1f}/100", 
                style={'marginTop': '10px', 'fontStyle': 'italic'})
    ], style={
        'padding': '20px',
        'backgroundColor': '#f8f9fa',
        'borderRadius': '5px',
        'boxShadow': '0 2px 4px rgba(0,0,0,0.1)'
    })


# Callback for time patterns
@app.callback(
    [Output('daily-pattern', 'figure'),
     Output('hourly-pattern', 'figure'),
     Output('time-insights', 'children')],
    [Input('day-range', 'value'),
     Input('hour-range', 'value'),
     Input('time-region', 'value')]
)
def update_time_patterns(day_range, hour_range, regions):
    df = load_data(preprocessed=False)
    
    if regions != 'all' and regions:
        df = df[df['customer_region'].astype(str).str[0].isin(regions)]
    
    # Daily pattern
    day_cols = [f'DOW_{i}' for i in range(day_range[0], day_range[1] + 1)]
    daily_data = df[day_cols].sum()
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    daily_fig = px.bar(
        x=[day_names[i] for i in range(day_range[0], day_range[1] + 1)],
        y=daily_data.values,
        title='Orders by Day of Week',
        color=daily_data.values,
        color_continuous_scale='Viridis'
    )
    
    # Hourly pattern
    hour_cols = [f'HR_{i}' for i in range(hour_range[0], hour_range[1] + 1)]
    hourly_data = df[hour_cols].sum()
    hourly_fig = go.Figure()
    hourly_fig.add_trace(go.Scatter(
        x=list(range(hour_range[0], hour_range[1] + 1)),
        y=hourly_data.values,
        fill='tozeroy',
        line=dict(color='rgb(111, 231, 219)')
    ))
    hourly_fig.update_layout(
        title='Orders by Hour',
        xaxis_title='Hour of Day',
        yaxis_title='Number of Orders'
    )
    
    peak_hour = hour_range[0] + hourly_data.values.argmax()
    peak_day = day_range[0] + daily_data.values.argmax()

    # weekend vs weekday ratio using total_orders
    weekday_avg = df[['DOW_0', 'DOW_1', 'DOW_2', 'DOW_3', 'DOW_4']].sum().mean()
    weekend_avg = df[['DOW_5', 'DOW_6']].sum().mean()
    weekend_weekday_ratio = weekend_avg / weekday_avg if weekday_avg > 0 else 0

    insights = html.Div([
        html.H4("Key Insights"),
        html.Ul([
            html.Li(f"Peak ordering hour: {peak_hour}:00"),
            html.Li(f"Peak ordering day: {day_names[peak_day]}"),
            html.Li(f"Total orders in selected timeframe: {int(hourly_data.sum())}"),
            html.Li(f"Lunch rush (11:00-14:00): {int(df[['HR_11', 'HR_12', 'HR_13']].sum().sum())} orders"),
            html.Li(f"Dinner rush (18:00-20:00): {int(df[['HR_18', 'HR_19', 'HR_20']].sum().sum())} orders"),
            html.Li(f"Early morning activity (4:00-5:00): {int(df[['HR_4', 'HR_5']].sum().sum())} orders"),
            html.Li(f"Quietest period (2:00-3:00): {int(df[['HR_2', 'HR_3']].sum().sum())} orders"),
            html.Li(f"Weekend vs Weekday ratio: {weekend_weekday_ratio:.2f}")
        ]),
        html.Div([
            html.H4("Segment Analysis"),
            update_time_insights(regions)
        ])
    ])
    
    return daily_fig, hourly_fig, insights

def update_time_insights(regions):
    df = load_data(preprocessed=False)
    if regions != 'all' and regions:
        df = df[df['customer_region'].astype(str).str[0].isin(regions)]
    
    segment_hours = enhance_time_pattern_analysis(df)
    
    insights = []
    for segment, hours in segment_hours.items():
        peak_hour = hours.idxmax().replace('HR_', '')
        insights.append(html.Div([
            html.H6(f"{segment.title()} Segment:"),
            html.P(f"Peak ordering hour: {peak_hour}:00"),
            html.P(f"Order concentration: {(hours[hours.idxmax()] / hours.sum() * 100):.1f}% during peak hour")
        ]))
    
    return html.Div(insights)


# Callback for cuisine analysis
@app.callback(
    [Output('cuisine-comparison', 'figure'),
     Output('cuisine-distribution', 'figure')],
    [Input('cuisine-regions', 'value'),
     Input('cuisine-viz-type', 'value'),
     Input('top-n-cuisines', 'value'),
     Input('cuisine-value-type', 'value')]  
)
def update_cuisine_analysis(regions, viz_type, top_n, value_type):  
    df = load_data(preprocessed=True)
    cuisine_cols = [col for col in df.columns if col.startswith('CUI_')]
    
    # Calculate regional preferences
    regional_preferences = {}
    for region in regions:
        region_data = df[df['customer_region'].astype(str).str[0] == region][cuisine_cols].mean()
        regional_preferences[f'Region {region}'] = region_data
    
    comparison_df = pd.DataFrame(regional_preferences)
    
    # Sort by total preference and get top N
    comparison_df['Total'] = comparison_df.sum(axis=1)
    top_cuisines = comparison_df.nlargest(top_n, 'Total').drop('Total', axis=1)
    
    # Convert to percentages if relative values selected
    if value_type == 'relative':
        for col in top_cuisines.columns:
            top_cuisines[col] = (top_cuisines[col] / top_cuisines[col].sum() * 100).round(1)
    
    # Create main comparison visualization
    if viz_type == 'bar':
        fig1 = px.bar(
            top_cuisines,
            barmode='group',
            title='Regional Cuisine Preferences' + (' (%)' if value_type == 'relative' else ''),
            color_discrete_sequence=px.colors.qualitative.Set3
        )
        if value_type == 'relative':
            fig1.update_layout(yaxis_range=[0, 100])
    elif viz_type == 'radar':
        fig1 = go.Figure()
        for region in regions:
            fig1.add_trace(go.Scatterpolar(
                r=top_cuisines[f'Region {region}'],
                theta=top_cuisines.index,
                name=f'Region {region}',
                fill='toself'
            ))
        fig1.update_layout(
            title='Regional Cuisine Preferences (Radar)' + (' (%)' if value_type == 'relative' else '')
        )
    else:  # heatmap
        fig1 = px.imshow(
            top_cuisines,
            title='Regional Cuisine Preferences Heatmap' + (' (%)' if value_type == 'relative' else ''),
            color_continuous_scale='RdYlBu_r'
        )
    
    # distribution visualization (bar instead of pie)
    overall_dist = df[cuisine_cols].mean().sort_values(ascending=False)[:top_n]
    fig2 = px.bar(
        x=overall_dist.index,
        y=overall_dist.values,
        title='Overall Cuisine Distribution',
        labels={'x': 'Cuisine Type', 'y': 'Order Proportion'},
        color_discrete_sequence=px.colors.qualitative.Set3
    )
    
    return fig1, fig2

# Callback for restaurant type visualization
@app.callback(
    Output('cuisine-graph', 'figure'),
    [Input('visualization-type', 'value'),
     Input('restaurant-type', 'value')]
)
def update_cuisine_graph(viz_type, restaurant_type):
    df = load_data(preprocessed=False)
    if viz_type == 'absolute':
        # Original absolute values visualization
        fig = px.bar(cuisine_data,
                    x='cuisine',
                    y=['chains', 'independent'],
                    title='Distribution by Cuisine Type and Restaurant Category',
                    barmode='stack')
    else:
        # Relative (percentage) visualization
        fig = px.bar(cuisine_data,
                    x='cuisine',
                    y=['chains_pct', 'independent_pct'],
                    title='Relative Distribution by Cuisine Type and Restaurant Category',
                    barmode='stack',
                    labels={'value': 'Percentage (%)',
                           'chains_pct': 'Chains',
                           'independent_pct': 'Independent'})
        fig.update_layout(yaxis_range=[0, 100])  
    
    # Apply restaurant type filter if specified
    if restaurant_type != 'all':
        if viz_type == 'absolute':
            fig.data = [fig.data[0 if restaurant_type == 'chains' else 1]]
        else:
            fig.data = [fig.data[0 if restaurant_type == 'chains' else 1]]
    
    return fig




# Callbacks for RFM visualizations


@app.callback(
    [Output('recency-dist', 'figure'),
     Output('frequency-dist', 'figure'),
     Output('monetary-dist', 'figure')],
    [Input('dummy-input', 'children')]
)
def update_rfm_distributions(dummy):
    
    layout_settings = {
        'showlegend': False,
        'height': 200,
        'xaxis_title': 'Days / Orders / Amount ($)',
        'yaxis_title': 'Percentage of Customers',
        'margin': dict(l=40, r=20, t=40, b=40)
    }
    
    # Recency Distribution
    recency_fig = px.bar(
        x=['0-3', '4-7', '8-14', '15-30', '>30'],
        y=[25, 22.5, 20, 17.5, 15],
        title='Days Since Last Order',
        labels={'x': 'Days', 'y': 'Percentage of Customers'}
    )
    recency_fig.update_layout(**layout_settings)
    
    # Frequency Distribution
    frequency_fig = px.bar(
        x=['1', '2-3', '4-5', '6-10', '>10'],
        y=[30, 25, 20, 15, 10],
        title='Monthly Order Frequency',
        labels={'x': 'Orders per Month', 'y': 'Percentage of Customers'}
    )
    frequency_fig.update_layout(**layout_settings)
    
    # Monetary Distribution
    monetary_fig = px.bar(
        x=['<20', '20-35', '35-50', '50-75', '>75'],
        y=[20, 25, 30, 15, 10],
        title='Average Order Value',
        labels={'x': 'Order Value ($)', 'y': 'Percentage of Customers'}
    )
    monetary_fig.update_layout(**layout_settings)
    
    return recency_fig, frequency_fig, monetary_fig

@app.callback(
    Output('segment-pie-chart', 'figure'),
    [Input('dummy-input', 'children')]
)
def update_segment_pie(dummy):
    segment_data = pd.DataFrame([
        {'name': 'High Value', 'value': 30, 'description': 'Top spenders with frequent orders'},
        {'name': 'Regular', 'value': 45, 'description': 'Consistent customers'},
        {'name': 'Occasional', 'value': 25, 'description': 'Infrequent buyers'}
    ])
    
    fig = px.pie(
        segment_data,
        values='value',
        names='name',
        title='Customer Segment Distribution',
        hover_data=['description']
    )
    fig.update_traces(textinfo='percent+label')
    fig.update_layout(height=400)
    
    return fig

@app.callback(
    Output('segment-comparison-chart', 'figure'),
    [Input('dummy-input', 'children')]
)
def update_segment_comparison(dummy):
    comparison_data = pd.DataFrame({
        'Segment': ['High Value', 'Regular', 'Occasional'],
        'Average Order Value ($)': [75, 45, 25],
        'Orders per Month': [8.5, 4.2, 1.8],
        'Promotion Usage (%)': [65, 45, 25]
    })
    
    fig = px.bar(
        comparison_data,
        x='Segment',
        y=['Average Order Value ($)', 'Orders per Month', 'Promotion Usage (%)'],
        title='Segment Metrics Comparison',
        barmode='group',
        labels={'value': 'Metric Value', 'variable': 'Metric'}
    )
    
    fig.update_layout(
        height=400,
        xaxis_title='Customer Segment',
        yaxis_title='Metric Value',
        legend_title='Metrics'
    )
    
    return fig

@app.callback(
    Output('segment-detail-content', 'children'),
    [Input('segment-detail-tabs', 'value')]
)
def update_segment_details(segment_value):
    characteristics = get_segment_characteristics()
    
    if segment_value == 'high-value':
        data = characteristics['High Value']
    elif segment_value == 'regular':
        data = characteristics['Regular']
    else:
        data = characteristics['Occasional']
    
    return html.Div([
        html.H5('Segment Characteristics'),
        html.P(data['description'], style={'marginBottom': '15px'}),
        html.Div([
            html.Div([
                html.Strong('Average Order Value: '),
                f"${data['avg_order']}"
            ], style={'marginBottom': '10px'}),
            html.Div([
                html.Strong('Orders per Month: '),
                str(data['orders_month'])
            ], style={'marginBottom': '10px'}),
            html.Div([
                html.Strong('Promotion Usage Rate: '),
                f"{data['promo_usage']}%"
            ], style={'marginBottom': '10px'}),
            html.Div([
                html.Strong('Top Cuisines: '),
                ', '.join(data['top_cuisines'])
            ])
        ])
    ], style={'padding': '15px', 'backgroundColor': '#f8f9fa', 'borderRadius': '5px'})

    
    
# Callback for segment prediction
@app.callback(
    Output('prediction-output', 'children'),
    [Input('predict-button', 'n_clicks')],
    [State('age-input', 'value'),
     State('order-value-input', 'value'),
     State('frequency-input', 'value')]
)
def predict_segment(n_clicks, age, order_value, frequency):
    if not n_clicks:
        return ""
    
    # Simple rule-based segmentation
    segment = ""
    if order_value > 50 and frequency > 4:
        segment = "Premium Customer"
    elif order_value > 35 or frequency > 3:
        segment = "Regular Customer"
    else:
        segment = "Occasional Customer"
    
    return html.Div([
        html.H4(f"Predicted Segment: {segment}"),
        html.P(f"Based on: Age={age}, Order Value={order_value}, Frequency={frequency}")
    ])

# Dummy Input div for RFM visualizations
app.layout = html.Div([
    create_layout(),
    html.Div(id='dummy-input', children='', style={'display': 'none'})
])



@app.callback(
    [Output('cuisine-dropdown-1', 'options'),
     Output('cuisine-dropdown-1-compare', 'options'),
     Output('cuisine-dropdown-2', 'options'),
     Output('cuisine-dropdown-2-compare', 'options')],
    [Input('dummy-input', 'children')]
)
def update_cuisine_dropdowns(_):
    df = load_data()
    cuisine_cols = [col.replace('CUI_', '') for col in df.columns if col.startswith('CUI_')]
    options = [{'label': cuisine, 'value': cuisine} for cuisine in cuisine_cols]
    return options, options, options, options


def update_cuisine_pattern(cuisine1, cuisine2, regions, chart_id):
    if not cuisine1:
        return {}, "Select a cuisine to see insights"
    
    df = load_data()
    if regions != 'all' and regions:
        df = df[df['customer_region'].astype(str).str[0].isin(regions)]
    
    fig = go.Figure()
    
    # Primary cuisine
    hours = list(range(24))
    orders1 = [df[df[f'CUI_{cuisine1}'] > 0][f'HR_{hour}'].sum() for hour in hours]
    
    fig.add_trace(go.Bar(
        x=hours,
        y=orders1,
        name=cuisine1,
        opacity=0.7
    ))
    
    # Comparison cuisine (if selected)
    if cuisine2:
        orders2 = [df[df[f'CUI_{cuisine2}'] > 0][f'HR_{hour}'].sum() for hour in hours]
        fig.add_trace(go.Bar(
            x=hours,
            y=orders2,
            name=cuisine2,
            opacity=0.5
        ))
    
    fig.update_layout(
        title=f'Hourly Order Distribution - {cuisine1}' + (f' vs {cuisine2}' if cuisine2 else ''),
        xaxis_title='Hour of Day',
        yaxis_title='Number of Orders',
        barmode='overlay',
        height=400
    )
    
    # Generate insights
    peak_hour = orders1.index(max(orders1))
    quiet_hours = [h for h, o in enumerate(orders1) if o < max(orders1) * 0.1]
    lunch_orders = sum(orders1[11:15])
    dinner_orders = sum(orders1[18:22])
    
    insights = html.Div([
        html.H4(f"Key Insights for {cuisine1}:"),
        html.Ul([
            html.Li(f"Peak ordering time: {peak_hour}:00"),
            html.Li(f"Quiet hours: {', '.join(f'{h}:00' for h in quiet_hours)}"),
            html.Li(f"Lunch vs Dinner ratio: {lunch_orders/dinner_orders:.2f}"),
            html.Li("Marketing recommendations:",
                   style={'marginTop': '10px', 'fontWeight': 'bold'}),
            html.Ul([
                html.Li(f"Focus promotions around {peak_hour-1}:00 - {peak_hour+1}:00"),
                html.Li("Consider special promotions during quiet hours to boost sales") 
                       if quiet_hours else None
            ])
        ])
    ])
    
    return fig, insights


@app.callback(
    [Output('cuisine-pattern-1', 'figure'),
     Output('cuisine-insights-1', 'children')],
    [Input('cuisine-dropdown-1', 'value'),
     Input('cuisine-dropdown-1-compare', 'value'),
     Input('time-region', 'value')]
)
def update_cuisine_pattern_1(cuisine1, cuisine2, regions):
    return update_cuisine_pattern(cuisine1, cuisine2, regions, 1)

@app.callback(
    [Output('cuisine-pattern-2', 'figure'),
     Output('cuisine-insights-2', 'children')],
    [Input('cuisine-dropdown-2', 'value'),
     Input('cuisine-dropdown-2-compare', 'value'),
     Input('time-region', 'value')]
)
def update_cuisine_pattern_2(cuisine1, cuisine2, regions):
    return update_cuisine_pattern(cuisine1, cuisine2, regions, 2)


# Callback for segment prediction
@app.callback(
    Output('predicted-segment', 'children'),
    [Input('recency-slider', 'value'),
     Input('frequency-slider', 'value'),
     Input('monetary-slider', 'value')]
)
def predict_segment(recency, frequency, monetary):
    # Simple rule-based prediction
    if monetary > 60 and frequency > 6:
        segment = "High Value"
    elif monetary > 35 or frequency > 3:
        segment = "Regular"
    else:
        segment = "Occasional"
    
    return html.Div([
        html.H4(f"Predicted Segment: {segment}"),
        html.P(f"Based on: Recency={recency} days, Frequency={frequency} orders/month, Monetary=${monetary}")
    ], style={'padding': '15px', 'backgroundColor': '#f0f0f0', 'borderRadius': '5px'})



@app.callback(
    Output('cuisine-correlation-heatmap', 'figure'),
    Input('cuisine-correlation-threshold', 'value')
)
def update_cuisine_correlation(threshold):
    df = load_data()
    correlations = calculate_cuisine_correlations(df)
    
    # Filter correlations by threshold
    correlations[abs(correlations) < threshold] = 0
    
    return px.imshow(
        correlations,
        title='Cuisine Type Correlations',
        labels={'color': 'Correlation'},
        color_continuous_scale='RdBu_r'
    )


@app.callback(
    Output('cuisine-selection', 'options'),
    Input('dummy-input', 'children')
)
def update_cuisine_options(_):
    df = load_data()
    cuisine_cols = [col.replace('CUI_', '') for col in df.columns if col.startswith('CUI_')]
    return [{'label': cuisine, 'value': cuisine} for cuisine in cuisine_cols]


@app.callback(
    Output('time-pattern-analysis', 'figure'),
    [Input('cuisine-selection', 'value'),
     Input('time-aggregation', 'value')]
)
def update_time_pattern_analysis(selected_cuisines, aggregation):
    df = load_data()
    patterns = analyze_cuisine_time_patterns(df)
    
    # Handle case when no cuisines are selected
    if not selected_cuisines:
        # Use first cuisine as default
        selected_cuisines = [patterns.columns[0]]
    
    if aggregation == 'hour':
        data = patterns[selected_cuisines]
    else:
        # Create day parts DataFrame 
        day_parts = pd.DataFrame({
            'Morning': patterns.iloc[6:11].sum(),
            'Afternoon': patterns.iloc[11:17].sum(),
            'Evening': patterns.iloc[17:23].sum(),
            'Night': patterns.iloc[[23,0,1,2,3,4,5]].sum()
        }).T  # Transpose the DataFrame to get correct structure
        
        data = day_parts[selected_cuisines]
    
    fig = px.line(
        data,
        title='Cuisine Ordering Patterns',
        labels={'value': 'Order Count', 'index': aggregation.capitalize()}
    )
    
    fig.update_layout(
        xaxis_title=aggregation.capitalize(),
        yaxis_title='Number of Orders',
        legend_title='Cuisines'
    )
    
    return fig

if __name__ == '__main__':
    app.run_server(debug=True)
    


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A

Open any browser and go to: http://127.0.0.1:8050 or http://localhost:8050

# Clusters

In [3]:
df_path = "df_final_clustered.csv"  # Replace with your actual file path
df = pd.read_csv(df_path)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A

In [None]:
# Import necessary libraries
import dash
from dash import dcc, html, Input, Output, State
from dash.exceptions import PreventUpdate
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
import io
import base64
from datetime import datetime, timedelta

app = dash.Dash(__name__)
app.title = "Mega Customer Segmentation Dashboard"

# Load data
df_path = "df_final_clustered.csv"  
df = pd.read_csv(df_path)

# Define cluster descriptions
cluster_descriptions = {
    0: "Low engagement - Low consumption, endorsement, and product count. More influenced by promotion.",
    1: "Low frequency but big orders - High money and product per order but not many orders. Increased breakfast activity.",
    2: "Diversity seekers - Frequent orders, repeated customers. High engagement with different cuisines.",
    3: "Frequent American - Chain restaurant preference, afternoon weekday activity. High American cuisine consumption.",
    4: "Expensive tastes - High spending and products ordered. Asian cuisine preference, not promotion dependent."
}

# Initialize model and scaler BEFORE the app layout
print("Initializing model and scaler...")

# Prepare features and target
features = ['monetary', 'total_orders', 'days_as_customer']
target = 'merged_labels'

# Get the data
X = df[features].copy()
y = df[target]

# Store the actual ranges from the data for reference
feature_ranges = {
    'monetary': (float(X['monetary'].min()), float(X['monetary'].max())),
    'total_orders': (float(X['total_orders'].min()), float(X['total_orders'].max())),
    'days_as_customer': (float(X['days_as_customer'].min()), float(X['days_as_customer'].max()))
}

print("Feature ranges:", feature_ranges)

# Initialize and fit the scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create and train the model
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Using a more complex decision tree for better predictions
dt_model = DecisionTreeClassifier(
    max_depth=10,  # Increased depth for more complex patterns
    min_samples_split=50,  # Prevent overfitting
    min_samples_leaf=20,  # Ensure robust leaf nodes
    random_state=42
)
dt_model.fit(X_train, y_train)

print("Model and scaler initialization complete!")


# Dimensionality Reduction
tsne = TSNE(n_components=2, random_state=42)
df['tsne_x'], df['tsne_y'] = zip(*tsne.fit_transform(X_scaled))
umap = UMAP(n_components=2, random_state=42)
df['umap_x'], df['umap_y'] = zip(*umap.fit_transform(X_scaled))

# Dimensionality Reduction setup
print("Performing dimensionality reduction...")

# Prepare features for dimensionality reduction
reduction_features = ['monetary', 'total_orders', 'days_as_customer', 
                     'products_per_order', 'money_per_order']
X_reduction = df[reduction_features].copy()

# Handle any missing values
X_reduction = X_reduction.fillna(X_reduction.mean())

# Scale the data
scaler_dr = StandardScaler()
X_scaled_dr = scaler_dr.fit_transform(X_reduction)

# Perform t-SNE
print("Computing t-SNE...")
tsne = TSNE(n_components=2, random_state=42)
tsne_coords = tsne.fit_transform(X_scaled_dr)
df['tsne_x'] = tsne_coords[:, 0]
df['tsne_y'] = tsne_coords[:, 1]

# Perform UMAP
print("Computing UMAP...")
umap_model = UMAP(n_components=2, random_state=42)
umap_coords = umap_model.fit_transform(X_scaled_dr)
df['umap_x'] = umap_coords[:, 0]
df['umap_y'] = umap_coords[:, 1]

# Define options for dropdowns
region_options = [
    {'label': f'Region {i}', 'value': i} 
    for i in [2360, 4660, 8670] 
]

payment_options = [
    {'label': 'Digital', 'value': 'DIGI'},
    {'label': 'Cash', 'value': 'CASH'},
    {'label': 'Card', 'value': 'CARD'}
]


# App Layout
app.layout = html.Div([
    # Header
    html.Div([
        html.H1('ABCDEats Customer Segmentation Analysis',
                style={'textAlign': 'center', 'color': '#2c3e50', 'padding': '20px'}),
        html.P('Interactive dashboard to explore customer segments and their characteristics',
               style={'textAlign': 'center', 'color': '#7f8c8d'})
    ]),
    
    # Main content
    html.Div([
        # Left panel - Filters and Controls
        html.Div([
            html.H3("Filters and Controls"),
            
            # Cluster selector
            html.Label("Select Cluster(s):"),
            dcc.Dropdown(
                id='cluster-selector',
                options=[
                    {'label': f'Cluster {i}: {desc[:30]}...', 'value': i}
                    for i, desc in cluster_descriptions.items()
                ],
                value=None,
                multi=True,
                style={'marginBottom': '20px'}
            ),
            
            # Age Range Filter
            html.Label("Age Range:"),
            dcc.RangeSlider(
                id='age-range',
                min=15,
                max=40,
                value=[15, 40],
                marks={i: str(i) for i in range(15, 41, 5)},
                step=1
            ),
            
            # Region Filter
            html.Label("Region:", style={'marginTop': '20px'}),
            dcc.Dropdown(
                id='region-selector',
                options=region_options,
                value=None,
                multi=True
            ),
            
            # Payment Method Filter
            html.Label("Payment Method:", style={'marginTop': '20px'}),
            dcc.Dropdown(
                id='payment-method-selector',
                options=payment_options,
                value=None,
                multi=True
            ),
            
            # Cluster Overview
            html.Div([
                html.H4('Cluster Overview', style={'marginTop': '30px'}),
                html.Div(id='cluster-overview-text', 
                        style={'padding': '10px', 'backgroundColor': '#f7f9fc', 'borderRadius': '5px'})
            ])
        ], style={'width': '25%', 'padding': '20px', 'backgroundColor': '#f8f9fa'}),
        
        # Right panel - Visualizations
        html.Div([
            dcc.Tabs([
                # Demographics Tab
                dcc.Tab(label='Demographics', children=[
                    html.Div([
                        dcc.Graph(id='cluster-distribution'),
                        dcc.Graph(id='age-distribution')
                    ], style={'display': 'flex'}),
                    html.Div([
                        dcc.Graph(id='region-distribution'),
                        dcc.Graph(id='payment-method-distribution')
                    ], style={'display': 'flex'})
                ]),
                
                # Spending Patterns Tab
                dcc.Tab(label='Spending Patterns', children=[
                    html.Div([
                        dcc.Graph(id='customer-value-scatter'),
                        dcc.Graph(id='order-metrics')
                    ], style={'display': 'flex'}),
                    html.Div([
                        dcc.Graph(id='cuisine-preferences'),
                        dcc.Graph(id='promotion-analysis')
                    ], style={'display': 'flex'})
                ]),
                
                # Temporal Patterns Tab
                dcc.Tab(label='Temporal Patterns', children=[
                    html.Div([
                        dcc.Graph(id='weekday-orders'),
                        dcc.Graph(id='hour-distribution')
                    ], style={'display': 'flex'})
                ]),
                
                # Dimensionality Reduction Tab
                dcc.Tab(label='Dimensionality Reduction', children=[
                    html.Div([
                        html.H3("t-SNE Visualization"),
                        dcc.Graph(id='tsne-chart'),
                        html.H3("UMAP Visualization"),
                        dcc.Graph(id='umap-chart'),
                    ])
                ]),
                
                # Prediction Tab
                dcc.Tab(label='Prediction', children=[
                    html.Div([
                        html.H3("Predict Customer Cluster"),
                        html.Div([
                            html.Label("Enter Customer Attributes:"),
                            html.Div([
                                html.Label("Monetary Value ($):"),
                                dcc.Input(
                                    id='input-monetary',
                                    type='number',
                                    placeholder=""
                                ),
                            ], className='mb-2'),
                            html.Div([
                                html.Label("Total Orders:"),
                                dcc.Input(
                                    id='input-orders',
                                    type='number',
                                    placeholder=""
                                ),
                            ], className='mb-2'),
                            html.Div([
                                html.Label("Days as Customer:"),
                                dcc.Input(
                                    id='input-days',
                                    type='number',
                                    placeholder=""
                                ),
                            ], className='mb-2'),
                            html.Button(
                                "Predict Cluster",
                                id='predict-button',
                                className='mt-4'
                            ),
                            html.Div(id='prediction-result', className='mt-4')
                        ], className='p-4')
                    ])
                ]),
                
                # Cluster Insights Tab
                dcc.Tab(label='Cluster Insights', children=[
                    html.Div([
                        html.H3("Cluster Recommendations"),
                        dcc.Graph(id='insights-chart'),
                    ])
                ])
            ])
        ], style={'width': '75%', 'padding': '20px'})
    ], style={'display': 'flex'}),
    
    # Customer Insights Panel
    html.Div([
        html.H3("Customer Insights Generator"),
        html.Div([
            html.Label("Enter Customer Attributes:"),
            dcc.Input(id='age-input', type='number', placeholder='Age'),
            dcc.Dropdown(
                id='region-input',
                options=region_options,
                placeholder='Select Region'
            ),
            dcc.Dropdown(
                id='payment-method-input',
                options=payment_options,
                placeholder='Payment Method'
            ),
            html.Button('Generate Insights', id='generate-insights-button'),
            html.Div(id='customer-insights-output')
        ], style={'padding': '20px'})
    ], style={'margin': '20px'}),

                # Anomaly Detection Tab
                dcc.Tab(label='Anomaly Detection', children=[
                    html.Div([
                        html.H3("Customer Behavior Anomalies"),
                        dcc.Graph(id='anomaly-chart'),
                        html.Div(id='anomaly-stats'),
                    ])
                ]),
                
                # Churn Analysis Tab
                dcc.Tab(label='Churn Analysis', children=[
                    html.Div([
                        html.H3("Churn Risk Analysis"),
                        dcc.Graph(id='churn-chart'),
                        html.Div(id='high-risk-customers'),
                    ])
                ]),
                
                # Export Tab
                dcc.Tab(label='Export', children=[
                    html.Div([
                        html.H3("Export Cluster Report"),
                        dcc.Dropdown(
                            id='export-cluster-selector',
                            options=[{'label': f'Cluster {i}', 'value': i} 
                                   for i in range(5)],
                            value=0
                        ),
                        html.Button('Generate Report', id='generate-report-btn'),
                        html.Div(id='report-output'),
                        html.A(
                            html.Button('Download Report'),
                            id='download-report',
                            style={'display': 'none'}
                        )
                    ])
                ])
])

# Utility functions
def get_peak_hours(df):
    """Get the peak ordering hours for a group of customers"""
    time_cols = ['hours_night', 'hours_breakfast', 'hours_lunch', 
                 'hours_afternoon', 'hours_dinner']
    peak_time = df[time_cols].mean().idxmax()
    return peak_time.replace('hours_', '').title()

def get_top_cuisines(df, n=3):
    """Get top n cuisines for a group of customers"""
    cuisine_cols = [col for col in df.columns if col.startswith('CUI_')]
    cuisine_means = df[cuisine_cols].mean()
    top_cuisines = cuisine_means.nlargest(n).index
    return [cuisine.replace('CUI_', '').title() for cuisine in top_cuisines]

def detect_anomalies(df):
    """Detect anomalous customer behavior using Isolation Forest"""
    features = ['monetary', 'total_orders', 'days_as_customer']
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    df['is_anomaly'] = iso_forest.fit_predict(df[features])
    return df['is_anomaly'] == -1

def calculate_churn_probability(df):
    """Calculate churn probability based on customer behavior"""

    df_copy = df.copy()
    
    
    if not pd.api.types.is_datetime64_any_dtype(df_copy['last_order']):

        min_date = datetime(2023, 1, 1)
        max_date = datetime(2024, 1, 1)
        date_range = (max_date - min_date).days
        df_copy['last_order'] = df_copy['last_order'].apply(
            lambda x: min_date + timedelta(days=int(x * date_range))
        )
    
    # Calculate days since last order
    df_copy['days_since_last_order'] = (datetime.now() - 
        pd.to_datetime(df_copy['last_order'])).dt.days
    
    # Define churn based on multiple factors
    df_copy['is_churned'] = (
        (df_copy['days_since_last_order'] > 30) & 
        (df_copy['total_orders'] < df_copy['total_orders'].median())
    )
    
    if len(df_copy['is_churned'].unique()) < 2:
        df_copy.loc[df_copy['monetary'] > df_copy['monetary'].median(), 'is_churned'] = False
    
    features = ['monetary', 'total_orders', 'days_as_customer', 
                'days_since_last_order']
    
    model = LogisticRegression(random_state=42)
    model.fit(df_copy[features], df_copy['is_churned'])
    return model.predict_proba(df_copy[features])[:, 1]

def generate_cluster_report(df, cluster):
    """Generate a detailed report for a specific cluster"""
    cluster_data = df[df['merged_labels'] == cluster]
    
    report = f"""Cluster {cluster} Analysis Report
    Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M')}
    
    Key Metrics:
    - Total Customers: {len(cluster_data):,}
    - Average Order Value: ${cluster_data['monetary'].mean():.2f}
    - Average Orders per Customer: {cluster_data['total_orders'].mean():.1f}
    
    Top Cuisines:
    {', '.join(get_top_cuisines(cluster_data))}
    
    Peak Activity Hours:
    {get_peak_hours(cluster_data)}
    
    Customer Profile:
    - Most Common Age Range: {cluster_data['age_range'].mode().iloc[0]}
    - Primary Region: {cluster_data['customer_region'].mode().iloc[0]}
    - Digital Payment Rate: {(cluster_data['payment_method'] == 'DIGI').mean()*100:.1f}%
    """
    return report



# Callbacks
@app.callback(
    [Output('cluster-distribution', 'figure'),
     Output('age-distribution', 'figure'),
     Output('region-distribution', 'figure'),
     Output('payment-method-distribution', 'figure'),
     Output('customer-value-scatter', 'figure'),
     Output('order-metrics', 'figure'),
     Output('cuisine-preferences', 'figure'),
     Output('weekday-orders', 'figure'),
     Output('hour-distribution', 'figure'),
     Output('cluster-overview-text', 'children')],
    [Input('cluster-selector', 'value'),
     Input('age-range', 'value'),
     Input('region-selector', 'value'),
     Input('payment-method-selector', 'value')]
)
def update_all_graphs(selected_clusters, age_range, regions, payment_methods):
    """Update all visualizations based on user selections"""
    filtered_df = df.copy()
    
    if selected_clusters:
        filtered_df = filtered_df[filtered_df['merged_labels'].isin(selected_clusters)]
    if regions:
        filtered_df = filtered_df[filtered_df['customer_region'].isin(regions)]
    if payment_methods:
        filtered_df = filtered_df[filtered_df['payment_method'].isin(payment_methods)]
    
    if len(filtered_df) == 0:
        return [go.Figure() for _ in range(9)] + [html.Div("No data available for selected filters")]
    
    overview_text = html.Div([
        html.P(f"Total Customers: {len(filtered_df):,}"),
        html.P(f"Average Order Value: ${filtered_df['money_per_order'].mean():.2f}"),
        html.P(f"Most Common Age Range: {filtered_df['age_range'].mode().iloc[0] if not filtered_df['age_range'].empty else 'N/A'}"),
        html.P(f"Most Common Region: {filtered_df['customer_region'].mode().iloc[0] if not filtered_df['customer_region'].empty else 'N/A'}"),
        html.P(f"Digital Payment Rate: {(filtered_df['payment_method'] == 'DIGI').mean()*100:.1f}%")
    ])
    
    # Generate all figures
    cluster_dist = px.pie(
        values=filtered_df['merged_labels'].value_counts().values,
        names=filtered_df['merged_labels'].value_counts().index.map(lambda x: f'Cluster {x}'),
        title='Distribution of Customers Across Clusters'
    )
    
    age_dist = px.histogram(
        filtered_df,
        x='age_range',
        title='Age Distribution'
    )
    
    region_counts = filtered_df['customer_region'].value_counts().reset_index()
    region_counts.columns = ['Region', 'Count']
    region_dist = px.bar(
        region_counts,
        x='Region',
        y='Count',
        title='Customer Distribution by Region'
    )
    
    payment_counts = filtered_df['payment_method'].value_counts().reset_index()
    payment_counts.columns = ['Method', 'Count']
    payment_dist = px.pie(
        payment_counts,
        values='Count',
        names='Method',
        title='Payment Method Distribution'
    )
    
    value_scatter = px.scatter(
        filtered_df,
        x='total_orders',
        y='monetary',
        color='merged_labels',
        title='Customer Value Analysis'
    )
    
    metrics_data = pd.DataFrame({
        'Metric': ['Avg Orders', 'Avg Products/Order', 'Avg Money/Order'],
        'Value': [
            filtered_df['total_orders'].mean(),
            filtered_df['products_per_order'].mean(),
            filtered_df['money_per_order'].mean()
        ]
    })
    metrics = px.bar(metrics_data, x='Metric', y='Value', title='Key Order Metrics')
    
    cuisine_cols = [col for col in filtered_df.columns if col.startswith('CUI_')]
    cuisine_means = filtered_df[cuisine_cols].mean().sort_values(ascending=False)
    cuisine_data = pd.DataFrame({
        'Cuisine': cuisine_means.index.str.replace('CUI_', ''),
        'Preference': cuisine_means.values
    })
    cuisine_prefs = px.bar(cuisine_data, x='Cuisine', y='Preference', title='Cuisine Preferences')
    
    weekday_data = pd.DataFrame({
        'Day': ['Weekday', 'Weekend'],
        'Orders': [1 - filtered_df['weekends'].mean(), filtered_df['weekends'].mean()]
    })
    weekday_pattern = px.bar(weekday_data, x='Day', y='Orders', title='Weekday vs Weekend Orders')
    
    time_cols = ['hours_night', 'hours_breakfast', 'hours_lunch', 
                 'hours_afternoon', 'hours_dinner']
    time_means = filtered_df[time_cols].mean()
    time_data = pd.DataFrame({
        'Time': time_means.index.str.replace('hours_', '').str.title(),
        'Orders': time_means.values
    })
    hour_pattern = px.line(time_data, x='Time', y='Orders', title='Order Distribution by Time of Day')
    
    overview_text = html.Div([
        html.P(f"Total Customers: {len(filtered_df):,}"),
        html.P(f"Average Order Value: ${filtered_df['money_per_order'].mean():.2f}"),
        html.P(f"Most Common Age Range: {filtered_df['age_range'].mode().iloc[0]}"),
        html.P(f"Most Common Region: {filtered_df['customer_region'].mode().iloc[0]}"),
        html.P(f"Digital Payment Rate: {(filtered_df['payment_method'] == 'DIGI').mean()*100:.1f}%")
    ])
    
    return (cluster_dist, age_dist, region_dist, payment_dist, value_scatter, 
            metrics, cuisine_prefs, weekday_pattern, hour_pattern, overview_text)

@app.callback(
    Output('prediction-result', 'children'),
    [Input('predict-button', 'n_clicks')],
    [State('input-monetary', 'value'),
     State('input-orders', 'value'),
     State('input-days', 'value')]
)
def predict_cluster(n_clicks, monetary, orders, days):
    if not n_clicks:
        raise PreventUpdate
    if None in [monetary, orders, days]:
        return "Please enter all required inputs."
    
    try:
        # Convert inputs to float
        input_data = np.array([[float(monetary), float(orders), float(days)]])
        
        # Check input ranges
        if monetary < 0 or orders < 0 or days < 0:
            return "Please enter positive values for all inputs."
            
        # Get actual data ranges for validation and display
        ranges = {
            'monetary': (df['monetary'].min(), df['monetary'].max()),
            'total_orders': (df['total_orders'].min(), df['total_orders'].max()),
            'days_as_customer': (df['days_as_customer'].min(), df['days_as_customer'].max())
        }
        
        # Validate inputs are within reasonable ranges
        if monetary > ranges['monetary'][1] * 2:  # Allow some flexibility above max
            return f"Monetary value seems too high. Typical range is ${ranges['monetary'][0]:.2f} - ${ranges['monetary'][1]:.2f}"
        if orders > ranges['total_orders'][1] * 2:
            return f"Order count seems too high. Typical range is {ranges['total_orders'][0]:.0f} - {ranges['total_orders'][1]:.0f}"
        if days > ranges['days_as_customer'][1] * 2:
            return f"Days as customer seems too high. Typical range is {ranges['days_as_customer'][0]:.0f} - {ranges['days_as_customer'][1]:.0f}"
        
        # Scale the input data using the same scaler used for training
        scaled_input = scaler.transform(input_data)
        
        # Get prediction and probabilities
        prediction = dt_model.predict(scaled_input)[0]
        probabilities = dt_model.predict_proba(scaled_input)[0]
        max_prob = max(probabilities) * 100
        
        # Get probability for each cluster
        cluster_probs = {i: f"{prob*100:.1f}%" for i, prob in enumerate(probabilities)}
        
        return html.Div([
            html.P(f"Predicted Cluster: {prediction}", style={'fontWeight': 'bold'}),
            html.P(f"Confidence: {max_prob:.1f}%"),
            html.P("Probability by cluster:"),
            html.Ul([
                html.Li(f"Cluster {cluster}: {prob}") 
                for cluster, prob in cluster_probs.items()
            ]),
            html.P("Your inputs:"),
            html.Ul([
                html.Li(f"Monetary: ${monetary:,.2f} (Typical range: ${ranges['monetary'][0]:,.2f} - ${ranges['monetary'][1]:,.2f})"),
                html.Li(f"Orders: {orders} (Typical range: {ranges['total_orders'][0]:.0f} - {ranges['total_orders'][1]:.0f})"),
                html.Li(f"Days as Customer: {days} (Typical range: {ranges['days_as_customer'][0]:.0f} - {ranges['days_as_customer'][1]:.0f})")
            ]),
            html.P(f"Cluster Description: {cluster_descriptions[prediction]}")
        ])
        
    except ValueError as e:
        return f"Error processing inputs: {str(e)}"
    except Exception as e:
        return f"An error occurred: {str(e)}"

@app.callback(
    Output('customer-insights-output', 'children'),
    [Input('generate-insights-button', 'n_clicks')],
    [State('age-input', 'value'),
     State('region-input', 'value'),
     State('payment-method-input', 'value')]
)
def generate_customer_insights(n_clicks, age, region, payment_method):
    if not n_clicks:
        raise PreventUpdate
        
    if not all([age, region, payment_method]):
        return html.Div("Please fill in all customer attributes", style={'color': 'red'})
    
    age_ranges = {
        (15, 20): '15-20',
        (21, 25): '20-25',
        (26, 30): '25-30',
        (31, 40): '30-40'
    }
    
    age_range = next((range_str for (start, end), range_str in age_ranges.items() 
                     if start <= age <= end), None)
    
    if not age_range:
        return html.Div("Age must be between 15 and 40", style={'color': 'red'})
    
    # Filter similar customers
    similar_customers = df[
        (df['age_range'] == age_range) &
        (df['customer_region'] == region) &
        (df['payment_method'] == payment_method)
    ]
    
    if len(similar_customers) == 0:
        return html.Div([
            html.P("No similar customers found in the database. Please try different criteria:", 
                  style={'color': 'orange'}),
            html.Ul([
                html.Li(f"Age Range: {age_range}"),
                html.Li(f"Region: {region}"),
                html.Li(f"Payment Method: {payment_method}")
            ])
        ])
    
    # Get most common cluster for similar customers
    predicted_cluster = similar_customers['merged_labels'].mode().iloc[0]
    
    # Calculate insights from the actual data
    insights = html.Div([
        html.H4("Customer Insights", className='mb-4'),
        html.P(f"Likely Cluster: Cluster {predicted_cluster}"),
        html.P(f"Cluster Description: {cluster_descriptions[predicted_cluster]}"),
        html.H5("Recommendations based on similar customers:"),
        html.Ul([
            html.Li(f"Number of Similar Customers Found: {len(similar_customers)}"),
            html.Li(f"Average Order Value: ${similar_customers['money_per_order'].mean():.2f}"),
            html.Li(f"Typical Order Frequency: {similar_customers['total_orders'].mean():.1f} orders"),
            html.Li(f"Most Popular Time: {get_peak_hours(similar_customers)}"),
            html.Li(f"Preferred Cuisines: {', '.join(get_top_cuisines(similar_customers))}")
        ])
    ])
    
    return insights

@app.callback(
    Output('insights-chart', 'figure'),
    [Input('cluster-selector', 'value')]
)
def update_insights_chart(selected_clusters):
    insights = {
        0: "Focus on low-budget campaigns for Cluster 0.",
        1: "Upsell premium meals to Cluster 1.",
        2: "Offer loyalty discounts to Cluster 2.",
        3: "Promote beverages to Cluster 3.",
        4: "Highlight health-conscious meals to Cluster 4."
    }
    filtered_insights = {k: v for k, v in insights.items() if k in selected_clusters} if selected_clusters else insights
    fig = go.Figure(data=[go.Bar(
        x=list(filtered_insights.keys()),
        y=list(filtered_insights.values()),
        text=list(filtered_insights.values()),
        textposition='auto',
    )])
    fig.update_layout(
        title="Business Recommendations",
        showlegend=False,
        height=400
    )
    return fig

@app.callback(
    [Output('tsne-chart', 'figure'),
     Output('umap-chart', 'figure')],
    [Input('cluster-selector', 'value')]
)
def update_dimension_reduction_charts(selected_clusters):
    filtered_df = df if not selected_clusters else df[df['merged_labels'].isin(selected_clusters)]
    
    # Create t-SNE visualization
    tsne_fig = px.scatter(
        filtered_df,
        x='tsne_x',
        y='tsne_y',
        color='merged_labels',
        title='t-SNE Visualization of Customer Segments',
        labels={
            'merged_labels': 'Cluster',
            'tsne_x': 't-SNE Component 1',
            'tsne_y': 't-SNE Component 2'
        },
        hover_data=['monetary', 'total_orders', 'days_as_customer']
    )
    tsne_fig.update_traces(marker=dict(size=8))
    tsne_fig.update_layout(
        height=600,
        legend_title="Cluster",
        hovermode='closest'
    )
    
    # Create UMAP visualization
    umap_fig = px.scatter(
        filtered_df,
        x='umap_x',
        y='umap_y',
        color='merged_labels',
        title='UMAP Visualization of Customer Segments',
        labels={
            'merged_labels': 'Cluster',
            'umap_x': 'UMAP Component 1',
            'umap_y': 'UMAP Component 2'
        },
        hover_data=['monetary', 'total_orders', 'days_as_customer']
    )
    umap_fig.update_traces(marker=dict(size=8))
    umap_fig.update_layout(
        height=600,
        legend_title="Cluster",
        hovermode='closest'
    )
    
    return tsne_fig, umap_fig

print("Dimensionality reduction complete!")

@app.callback(
    [Output('anomaly-chart', 'figure'),
     Output('anomaly-stats', 'children')],
    [Input('cluster-selector', 'value')]
)
def update_anomaly_detection(selected_clusters):
    filtered_df = df.copy()
    if selected_clusters:
        filtered_df = filtered_df[filtered_df['merged_labels'].isin(selected_clusters)]
    
    # Detect anomalies
    anomalies = detect_anomalies(filtered_df)
    
    # Create scatter plot
    fig = px.scatter(
        filtered_df,
        x='monetary',
        y='total_orders',
        color=anomalies.astype(str),
        title='Customer Behavior Anomalies',
        labels={'color': 'Is Anomaly'}
    )
    
    # Calculate statistics
    stats = html.Div([
        html.H4("Anomaly Statistics"),
        html.P(f"Total Anomalies Detected: {anomalies.sum()}"),
        html.P(f"Anomaly Rate: {(anomalies.sum() / len(filtered_df) * 100):.1f}%"),
    ])
    
    return fig, stats

@app.callback(
    [Output('churn-chart', 'figure'),
     Output('high-risk-customers', 'children')],
    [Input('cluster-selector', 'value')]
)
def update_churn_analysis(selected_clusters):
    filtered_df = df.copy()
    if selected_clusters:
        filtered_df = filtered_df[filtered_df['merged_labels'].isin(selected_clusters)]
    
    # Calculate churn probabilities
    filtered_df['churn_prob'] = calculate_churn_probability(filtered_df)
    
    # Create histogram
    fig = px.histogram(
        filtered_df,
        x='churn_prob',
        title='Distribution of Churn Probability',
        nbins=20
    )
    
    # Identify high-risk customers
    high_risk = filtered_df[filtered_df['churn_prob'] > 0.7]
    stats = html.Div([
        html.H4("High Risk Customer Statistics"),
        html.P(f"Number of High Risk Customers: {len(high_risk)}"),
        html.P(f"Percentage of Customer Base: {(len(high_risk) / len(filtered_df) * 100):.1f}%"),
    ])
    
    return fig, stats

@app.callback(
    [Output('download-report', 'href'),
     Output('download-report', 'style'),
     Output('report-output', 'children')],
    [Input('generate-report-btn', 'n_clicks')],
    [State('export-cluster-selector', 'value')]
)
def generate_report(n_clicks, selected_cluster):
    if not n_clicks:
        raise PreventUpdate
    
    # Generate report content
    report_content = generate_cluster_report(df, selected_cluster)
    
    # Create download link
    content_type = 'text/plain'
    b64 = base64.b64encode(report_content.encode()).decode()
    href = f'data:{content_type};base64,{b64}'
    
    # Show preview and download button
    return href, {'display': 'block'}, html.Pre(report_content)

if __name__ == '__main__':
    app.run_server(debug=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A

Initializing model and scaler...
Feature ranges: {'monetary': (0.0, 1.0), 'total_orders': (0.0, 1.0), 'days_as_customer': (0.0, 1.0)}
Model and scaler initialization complete!



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Performing dimensionality reduction...
Computing t-SNE...


Open any browser and go to: http://127.0.0.1:8050 or http://localhost:8050