In [1]:
!pip install gradio pandas numpy matplotlib seaborn scikit-learn xgboost plotly

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [2]:
# Colab-only step: open the browser upload dialog for the dataset.
# The recorded cell output shows the file being saved under its own name;
# run_analysis() later reads 'personal_finance_large.csv' by relative path.
# `uploaded` is not referenced again in the visible part of this notebook.
from google.colab import files
uploaded = files.upload()
# Select your personal_finance_large.csv file when prompted

Saving personal_finance_large.csv to personal_finance_large.csv


In [3]:
class FinancialAnalyzer:
    """Expense-prediction workflow over a personal-finance CSV.

    Covers loading and feature engineering, training three tree-based
    regressors plus a voting ensemble, writing charts to disk, and producing
    text recommendations.

    The input CSV is expected to provide the columns this class reads:
    ``Date``, ``Category``, ``Subcategory``, ``Income`` and ``Expenses``.

    NOTE(review): a later cell in this notebook defines another class with the
    same name; once that cell runs, it replaces this one.
    """

    def __init__(self):
        """Create the unfitted models, the ensemble, the scaler and the encoders."""
        # NOTE(review): imported locally because the import cell visible in
        # this notebook does not bring these names into scope.
        from sklearn.ensemble import VotingRegressor
        from sklearn.preprocessing import LabelEncoder, StandardScaler

        # Initialize base models
        self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        self.xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)

        # Create voting ensemble
        self.ensemble = VotingRegressor([
            ('rf', self.rf_model),
            ('gb', self.gb_model),
            ('xgb', self.xgb_model)
        ])

        self.feature_importance = None   # DataFrame set by train_model()
        self.model_performances = {}     # model name -> metrics dict from evaluate_model()
        self.data_stats = {}             # summary filled by load_and_prepare_data()
        self.scaler = StandardScaler()
        self.category_encoder = LabelEncoder()
        self.subcategory_encoder = LabelEncoder()

    def load_and_prepare_data(self, filename='data/personal_finance.csv'):
        """Load the CSV and add time, category and monthly-aggregate features.

        Args:
            filename: Path of the CSV file to read.

        Returns:
            The loaded DataFrame with the engineered columns added
            (``Month``, ``Year``, ``DayOfWeek``, ``DayOfMonth``, ``IsWeekend``,
            ``Season``, ``CategoryCode``, ``SubcategoryCode``,
            ``MonthlyIncome``, ``MonthlyExpenses``, ``MonthlySavings``,
            ``SavingsRate``, ``ExpenseRatio``, ``Income_MA7``, ``Expenses_MA7``).

        Side effects:
            Fits both label encoders and overwrites ``self.data_stats``.
        """
        # Load the dataset
        df = pd.read_csv(filename)

        # Basic preprocessing
        df['Date'] = pd.to_datetime(df['Date'])

        # Add time-based features
        df['Month'] = df['Date'].dt.month
        df['Year'] = df['Date'].dt.year
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['DayOfMonth'] = df['Date'].dt.day
        df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)
        # Calendar-quarter buckets: months 1-3 -> 'Winter', 4-6 -> 'Spring',
        # 7-9 -> 'Summer', 10-12 -> 'Fall'.
        df['Season'] = pd.cut(df['Month'],
                            bins=[0, 3, 6, 9, 12],
                            labels=['Winter', 'Spring', 'Summer', 'Fall'])

        # Encode categorical variables
        df['CategoryCode'] = self.category_encoder.fit_transform(df['Category'])
        df['SubcategoryCode'] = self.subcategory_encoder.fit_transform(df['Subcategory'])

        # Calculate financial metrics (per (Year, Month) totals).
        # Only rows with positive Income contribute to the monthly income sum.
        monthly_income = df[df['Income'] > 0].groupby(
            [df['Year'], df['Month']])['Income'].sum().reset_index()
        monthly_income.columns = ['Year', 'Month', 'MonthlyIncome']

        monthly_expenses = df.groupby(
            [df['Year'], df['Month']])['Expenses'].sum().reset_index()
        monthly_expenses.columns = ['Year', 'Month', 'MonthlyExpenses']

        # Merge monthly metrics back; months without any positive income get
        # NaN for MonthlyIncome because of the left join.
        df = pd.merge(df, monthly_income, on=['Year', 'Month'], how='left')
        df = pd.merge(df, monthly_expenses, on=['Year', 'Month'], how='left')

        # Calculate savings and ratios (NaN ratios are replaced by 0)
        df['MonthlySavings'] = df['MonthlyIncome'] - df['MonthlyExpenses']
        df['SavingsRate'] = (df['MonthlySavings'] / df['MonthlyIncome']).fillna(0)
        df['ExpenseRatio'] = (df['Expenses'] / df['MonthlyIncome']).fillna(0)

        # Calculate moving averages over 7 consecutive rows.
        # NOTE(review): rows are not sorted here, so this assumes the file is
        # already in chronological order - confirm against the dataset.
        df['Income_MA7'] = df['Income'].rolling(window=7).mean()
        df['Expenses_MA7'] = df['Expenses'].rolling(window=7).mean()

        # Store data statistics
        self.data_stats = {
            'num_records': len(df),
            'date_range': f"{df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}",
            'total_income': df['Income'].sum(),
            'total_expenses': df['Expenses'].sum(),
            'total_savings': df['Income'].sum() - df['Expenses'].sum(),
            'avg_monthly_income': df['MonthlyIncome'].mean(),
            'avg_monthly_expenses': df['MonthlyExpenses'].mean(),
            'avg_monthly_savings': df['MonthlySavings'].mean(),
            'categories': df['Category'].unique().tolist(),
            'subcategories': df['Subcategory'].unique().tolist()
        }

        return df

    def prepare_features(self, data):
        """Build the scaled feature matrix and the target vector.

        Args:
            data: DataFrame as returned by ``load_and_prepare_data``.

        Returns:
            ``(X_scaled, y)`` where ``X_scaled`` is a DataFrame of standardized
            features and ``y`` is the ``Expenses`` column; NaN is replaced by 0
            in both.

        Side effects:
            Re-fits ``self.scaler`` on the full feature matrix.
        """
        # Select features
        # NOTE(review): 'ExpenseRatio' and 'Expenses_MA7' are both computed
        # from the 'Expenses' column, which is also the target, and the scaler
        # is fit before train_model() splits the data - evaluation scores are
        # therefore optimistic.
        features = ['Month', 'DayOfMonth', 'DayOfWeek', 'IsWeekend',
                   'CategoryCode', 'SubcategoryCode', 'MonthlyIncome',
                   'Income_MA7', 'Expenses_MA7', 'ExpenseRatio']
        target = 'Expenses'

        X = data[features].copy()
        y = data[target].copy()

        # Handle missing values
        X = X.fillna(0)  # Replace NaN with 0 for financial features
        y = y.fillna(0)  # Replace NaN with 0 for target variable

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=features)

        return X_scaled, y

    def calculate_accuracy_percentage(self, y_true, y_pred):
        """Return ``100 - MAPE`` (in percent) over the rows with a non-zero actual.

        Rows whose actual value is exactly 0 are skipped: their percentage
        error is undefined, and including them turned the whole score into
        ``-inf``/NaN.

        Args:
            y_true: Actual values (array-like).
            y_pred: Predicted values (array-like, same length).

        Returns:
            The score as a float, or NaN when every actual value is 0.
        """
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)

        usable = actual != 0
        if not usable.any():
            return float('nan')

        percentage_error = np.abs(
            (actual[usable] - predicted[usable]) / actual[usable]) * 100
        accuracy = 100 - np.mean(percentage_error)
        return accuracy

    def evaluate_model(self, model, X_train, X_test, y_train, y_test, model_name):
        """Fit ``model`` and collect hold-out and cross-validation metrics.

        Args:
            model: Estimator exposing ``fit``/``predict``; it is fitted in place.
            X_train, y_train: Training split.
            X_test, y_test: Hold-out split.
            model_name: Label stored in the result.

        Returns:
            Dict with keys ``model_name``, ``mse``, ``rmse``, ``mae``, ``r2``,
            ``accuracy``, ``cv_accuracy``, ``cv_std`` and ``predictions``.
        """
        # NOTE(review): imported locally because the import cell visible in
        # this notebook does not provide mean_absolute_error.
        from sklearn.metrics import mean_absolute_error

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        accuracy = self.calculate_accuracy_percentage(y_test, y_pred)

        # Perform cross-validation. No `scoring` is passed, so the estimator's
        # default scorer is used (R^2 for scikit-learn regressors); the
        # "cv_accuracy" value is that mean score times 100.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        cv_accuracy = np.mean(cv_scores) * 100

        return {
            'model_name': model_name,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'accuracy': accuracy,
            'cv_accuracy': cv_accuracy,
            'cv_std': cv_scores.std() * 100,
            'predictions': y_pred
        }

    def train_model(self, data):
        """Train and evaluate every model, then compute feature importances.

        Args:
            data: DataFrame as returned by ``load_and_prepare_data``.

        Returns:
            ``(X_test, y_test, ensemble_predictions)`` for the 20% hold-out split.

        Side effects:
            Fills ``self.model_performances`` and ``self.feature_importance``;
            the random forest ends up fitted on the full data set.
        """
        # Prepare features
        X, y = self.prepare_features(data)

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Evaluate individual models
        models = {
            'Random Forest': self.rf_model,
            'Gradient Boosting': self.gb_model,
            'XGBoost': self.xgb_model,
            'Ensemble': self.ensemble
        }

        for name, model in models.items():
            self.model_performances[name] = self.evaluate_model(
                model, X_train, X_test, y_train, y_test, name
            )

        # Calculate feature importance from Random Forest (re-fit on all rows)
        self.rf_model.fit(X, y)
        self.feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.rf_model.feature_importances_
        }).sort_values('importance', ascending=False)

        return X_test, y_test, self.model_performances['Ensemble']['predictions']

    def visualize_results(self, data, X_test, y_test, y_pred):
        """Write the analysis charts to the current working directory.

        Produces four PNG files (``monthly_trends.png``,
        ``category_expenses.png``, ``feature_importance.png``,
        ``model_accuracy.png``) and three HTML files
        (``monthly_category_expenses.html``, ``expense_patterns.html``,
        ``prediction_analysis.html``).

        Args:
            data: DataFrame as returned by ``load_and_prepare_data``.
            X_test: Not used by this method.
            y_test: Actual hold-out targets for the actual-vs-predicted chart.
            y_pred: Not used; the ensemble predictions stored in
                ``self.model_performances`` are plotted instead.

        Requires ``train_model`` to have been called first.
        """
        # NOTE(review): imported locally because the import cell visible in
        # this notebook does not bring these names into scope.
        import calendar
        import plotly.express as px
        import plotly.graph_objects as go

        # Set style
        plt.style.use('default')

        # 1. Monthly Income, Expenses, and Savings Trends
        monthly_summary = data.groupby('Month').agg({
            'MonthlyIncome': 'mean',
            'MonthlyExpenses': 'mean',
            'MonthlySavings': 'mean'
        }).reset_index()

        plt.figure(figsize=(12, 6))
        plt.plot(monthly_summary['Month'], monthly_summary['MonthlyIncome'],
                label='Income', color='green', marker='o')
        plt.plot(monthly_summary['Month'], monthly_summary['MonthlyExpenses'],
                label='Expenses', color='red', marker='s')
        plt.plot(monthly_summary['Month'], monthly_summary['MonthlySavings'],
                label='Savings', color='blue', marker='^')
        plt.title('Monthly Financial Trends')
        plt.xlabel('Month')
        plt.ylabel('Amount ($)')
        # Label only the months that are present: passing all 12 names fails
        # when the data covers fewer than 12 calendar months.
        month_numbers = [int(m) for m in monthly_summary['Month']]
        plt.xticks(month_numbers,
                   [calendar.month_abbr[m] for m in month_numbers],
                   rotation=45)
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('monthly_trends.png')
        plt.close()

        # 2. Expense Distribution by Category
        plt.figure(figsize=(12, 6))
        category_expenses = data.groupby('Category')['Expenses'].sum().sort_values(ascending=True)
        plt.barh(category_expenses.index, category_expenses.values, color='skyblue')
        plt.title('Total Expenses by Category')
        plt.xlabel('Total Expenses ($)')
        plt.ylabel('Category')
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('category_expenses.png')
        plt.close()

        # 3. Feature Importance
        plt.figure(figsize=(12, 6))
        plt.barh(self.feature_importance['feature'],
                self.feature_importance['importance'], color='green')
        plt.title('Feature Importance (Random Forest)')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()

        # 4. Model Accuracy Comparison
        plt.figure(figsize=(10, 6))
        model_names = list(self.model_performances.keys())
        accuracies = [perf['accuracy'] for perf in self.model_performances.values()]
        cv_accuracies = [perf['cv_accuracy'] for perf in self.model_performances.values()]

        x = np.arange(len(model_names))
        width = 0.35

        plt.bar(x - width/2, accuracies, width, label='Test Accuracy', color='skyblue')
        plt.bar(x + width/2, cv_accuracies, width, label='Cross-Validation Accuracy', color='lightgreen')

        plt.title('Model Accuracy Comparison')
        plt.xlabel('Models')
        plt.ylabel('Accuracy (%)')
        plt.xticks(x, model_names, rotation=45)
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('model_accuracy.png')
        plt.close()

        # 5. Interactive Visualizations
        # Monthly Expenses by Category
        fig = px.bar(data.groupby(['Month', 'Category'])['Expenses'].sum().reset_index(),
                    x='Month', y='Expenses', color='Category', barmode='group',
                    title='Monthly Expenses by Category')
        fig.write_html('monthly_category_expenses.html')

        # Expense Patterns
        fig = px.scatter(data, x='MonthlyIncome', y='Expenses', color='Category',
                        size='Expenses', hover_data=['Subcategory', 'Date'],
                        title='Expense Patterns by Category')
        fig.write_html('expense_patterns.html')

        # Actual vs Predicted
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=y_test,
            y=self.model_performances['Ensemble']['predictions'],
            mode='markers',
            name='Predictions',
            marker=dict(color='purple', size=8, opacity=0.6)
        ))
        # Diagonal reference line: points on it are predicted exactly.
        fig.add_trace(go.Scatter(
            x=[y_test.min(), y_test.max()],
            y=[y_test.min(), y_test.max()],
            mode='lines',
            name='Perfect Prediction',
            line=dict(color='red', dash='dash')
        ))
        fig.update_layout(
            title='Actual vs Predicted Expenses (Ensemble Model)',
            xaxis_title='Actual Expenses',
            yaxis_title='Predicted Expenses',
            showlegend=True
        )
        fig.write_html('prediction_analysis.html')

    def generate_recommendations(self, data):
        """Build a list of text recommendations from the prepared data.

        Args:
            data: DataFrame as returned by ``load_and_prepare_data`` (reads
                ``SavingsRate``, ``ExpenseRatio``, ``Category``, ``Season``
                and ``Expenses``).

        Returns:
            List of recommendation strings; the last entries list the three
            categories with the largest total expenses.
        """
        recommendations = []

        # Calculate metrics
        avg_savings_rate = data['SavingsRate'].mean()
        avg_expense_ratio = data['ExpenseRatio'].mean()
        # Coefficient of variation (std / mean) of expenses per category
        expense_volatility = data.groupby('Category')['Expenses'].std() / data.groupby('Category')['Expenses'].mean()
        highest_expense_categories = data.groupby('Category')['Expenses'].sum().nlargest(3)
        seasonal_expenses = data.groupby(['Season', 'Category'])['Expenses'].mean()

        # Basic recommendations
        if avg_savings_rate < 0.2:
            recommendations.append("Consider increasing your savings rate to at least 20% of your income")

        if avg_expense_ratio > 0.7:
            recommendations.append("Your expenses are high relative to income. Look for areas to cut costs")

        # Category-specific recommendations
        for category, volatility in expense_volatility.items():
            if volatility > 0.5:
                recommendations.append(f"Your {category} expenses show high variability. Consider budgeting more consistently")

        # Seasonal recommendations
        for season in ['Winter', 'Spring', 'Summer', 'Fall']:
            if season in seasonal_expenses.index:
                # Grouping on the categorical 'Season' column can yield NaN
                # means for season/category pairs without rows; skip seasons
                # that have no data instead of reporting "$nan".
                season_means = seasonal_expenses[season].dropna()
                if season_means.empty:
                    continue
                high_season_expenses = season_means.nlargest(1)
                recommendations.append(f"Your highest expense in {season} is {high_season_expenses.index[0]} " +
                                    f"(${high_season_expenses.values[0]:.2f} on average)")

        # Top expense categories
        recommendations.append("\nTop 3 expense categories:")
        for category, amount in highest_expense_categories.items():
            recommendations.append(f"- {category}: ${amount:.2f}")

        return recommendations

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import os

class FinancialAnalyzer:
    """Savings-prediction workflow used by the Gradio app in this notebook.

    Loads a personal-finance CSV (expects ``Date``, ``Income``, ``Expenses``
    and ``Category`` columns), trains three regressors on it, writes four PNG
    charts to the working directory and produces text recommendations.
    """

    # Single source of truth for the model inputs/target, shared by
    # train_model() and the feature-importance chart.
    FEATURES = ['Income', 'Expenses', 'Month', 'Year']
    TARGET = 'Savings'

    def __init__(self):
        """Create the three unfitted regressors and an empty metrics store."""
        self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        self.xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
        self.model_performances = {}  # model name -> {'accuracy', 'cv_accuracy'}

    @staticmethod
    def _monthly_totals(df):
        """Sum Income/Expenses/Savings per (Year, Month); return df unchanged if those keys are absent."""
        if not {'Year', 'Month'}.issubset(df.columns):
            return df
        return df.groupby(['Year', 'Month'], as_index=False)[
            ['Income', 'Expenses', 'Savings']].sum()

    def load_and_prepare_data(self, filepath):
        """Read the CSV and derive ``Month``, ``Year`` and ``Savings``.

        Args:
            filepath: Path of the CSV file to read.

        Returns:
            DataFrame without rows containing any NaN, with ``Savings``
            computed as ``Income - Expenses``.
        """
        raw = pd.read_csv(filepath)
        raw['Date'] = pd.to_datetime(raw['Date'])
        raw['Month'] = raw['Date'].dt.month
        raw['Year'] = raw['Date'].dt.year
        # Build the derived column with .assign on the filtered frame so the
        # new column is never written onto the result of a filtering step.
        cleaned = raw.dropna()
        return cleaned.assign(Savings=cleaned['Income'] - cleaned['Expenses'])

    def train_model(self, df):
        """Fit and score every model on an 80/20 split.

        Args:
            df: DataFrame as returned by ``load_and_prepare_data``.

        Returns:
            ``(X_test, y_test, y_pred)`` where ``y_pred`` holds the hold-out
            predictions of the last model in the loop (XGBoost).

        Side effects:
            Fits all three models and fills ``self.model_performances`` with
            R^2 * 100 on the hold-out split (``'accuracy'``) and the mean
            5-fold cross-validation score * 100 (``'cv_accuracy'``).
        """
        # NOTE(review): the target is Income - Expenses and both of those are
        # features, so the models can reproduce the target almost exactly;
        # the reported scores say little about predictive skill.
        X = df[list(self.FEATURES)]
        y = df[self.TARGET]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        models = {
            'Random Forest': self.rf_model,
            'Gradient Boosting': self.gb_model,
            'XGBoost': self.xgb_model
        }

        y_pred = None
        for name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            test_r2 = r2_score(y_test, y_pred) * 100
            # No `scoring` argument: the estimator's default scorer is used
            # (R^2 for scikit-learn regressors). Runs on the full X, y.
            cv_scores = cross_val_score(model, X, y, cv=5)
            cv_r2 = np.mean(cv_scores) * 100
            self.model_performances[name] = {
                'accuracy': test_r2,
                'cv_accuracy': cv_r2
            }

        return X_test, y_test, y_pred

    def visualize_results(self, df, X_test, y_test, y_pred):
        """Write the four PNG charts consumed by the Gradio interface.

        Files written: ``monthly_trends.png``, ``category_expenses.png``,
        ``feature_importance.png`` and ``model_accuracy.png``.

        Args:
            df: DataFrame as returned by ``load_and_prepare_data``.
            X_test: Not used by this method.
            y_test: Actual hold-out savings.
            y_pred: Predicted hold-out savings (same length as ``y_test``).

        Requires ``train_model`` to have fitted the random forest.
        """
        # Average, per calendar month, of the month's TOTAL savings (rows are
        # first summed per (Year, Month)), matching the chart title.
        plt.figure(figsize=(8, 4))
        self._monthly_totals(df).groupby('Month')['Savings'].mean().plot(kind='line', marker='o')
        plt.title("Average Monthly Savings")
        plt.xlabel("Month")
        plt.ylabel("Savings")
        plt.tight_layout()
        plt.savefig("monthly_trends.png")
        plt.close()

        plt.figure(figsize=(8, 4))
        df.groupby('Category')['Expenses'].sum().plot(kind='bar')
        plt.title("Total Expenses by Category")
        plt.ylabel("Amount")
        plt.tight_layout()
        plt.savefig("category_expenses.png")
        plt.close()

        importances = self.rf_model.feature_importances_
        feature_names = list(self.FEATURES)
        plt.figure(figsize=(6, 4))
        sns.barplot(x=importances, y=feature_names)
        plt.title("Feature Importances (Random Forest)")
        plt.tight_layout()
        plt.savefig("feature_importance.png")
        plt.close()

        plt.figure(figsize=(6, 4))
        plt.scatter(y_test, y_pred, alpha=0.7)
        # Diagonal reference line: points on it are predicted exactly.
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
        plt.xlabel("Actual Savings")
        plt.ylabel("Predicted Savings")
        plt.title("Actual vs Predicted Savings")
        plt.tight_layout()
        plt.savefig("model_accuracy.png")
        plt.close()

    def generate_recommendations(self, df):
        """Return a list of recommendation strings based on monthly averages.

        Income, expenses and savings are first totalled per (Year, Month) so
        that the reported "average monthly" figures are really per month; when
        the frame has no ``Year``/``Month`` columns the rows are used as-is.

        Args:
            df: DataFrame with ``Income``, ``Expenses`` and ``Savings`` columns.

        Returns:
            List of strings (status line(s) followed by two summary lines).
        """
        monthly = self._monthly_totals(df)
        avg_savings = monthly['Savings'].mean()
        avg_expenses = monthly['Expenses'].mean()
        avg_income = monthly['Income'].mean()
        # NaN (empty data) is truthy, so test for it explicitly.
        has_income = pd.notna(avg_income) and avg_income != 0
        savings_rate = avg_savings / avg_income if has_income else 0

        recs = []
        if savings_rate < 0.2:
            recs.append("⚠️ Your savings rate is below 20%. Consider reducing discretionary expenses.")
        else:
            recs.append("✅ Good savings rate! Keep up the disciplined budgeting.")

        if avg_expenses > 0.8 * avg_income:
            recs.append("⚠️ Expenses exceed 80% of income. Review subscriptions, dining, and utilities.")

        recs.append(f"💰 Your average monthly savings is ₹{avg_savings:.2f}")
        recs.append(f"📈 Try to increase it to at least 30% of your income (₹{0.3*avg_income:.2f})")

        return recs

# Gradio function wrapper
# Module-level analyzer instance; run_analysis() below uses this same object
# (and therefore the same model objects) on every call.
analyzer = FinancialAnalyzer()

def run_analysis(filepath='personal_finance_large.csv'):
    """Run the full pipeline and return the values the Gradio outputs expect.

    Loads the CSV, trains the models, writes the charts and builds the
    recommendation text using the module-level ``analyzer``.

    Args:
        filepath: CSV file to analyze. Defaults to the uploaded data set, so
            the function can still be called without arguments.

    Returns:
        A 6-tuple: the four chart file names (monthly trends, category
        expenses, feature importance, model accuracy), a newline-joined
        per-model score summary, and the newline-joined recommendations.
        The "accuracy" figures are the values stored by
        ``analyzer.train_model`` (R^2 * 100).
    """
    data = analyzer.load_and_prepare_data(filepath)
    X_test, y_test, y_pred = analyzer.train_model(data)
    analyzer.visualize_results(data, X_test, y_test, y_pred)
    recommendations = analyzer.generate_recommendations(data)

    accuracy_summary = '\n'.join(
        f"{name}: Test Accuracy = {perf['accuracy']:.2f}%, CV Accuracy = {perf['cv_accuracy']:.2f}%"
        for name, perf in analyzer.model_performances.items()
    )

    return (
        'monthly_trends.png',
        'category_expenses.png',
        'feature_importance.png',
        'model_accuracy.png',
        accuracy_summary,
        '\n'.join(recommendations)
    )

# Gradio UI around run_analysis(). There are no input components
# (inputs=None); the six output components are listed in the same order as
# the tuple returned by run_analysis(): four chart images, then the model
# score summary, then the recommendations.
iface = gr.Interface(
    fn=run_analysis,
    inputs=None,
    outputs=[
        gr.Image(label="Monthly Trends"),
        gr.Image(label="Category Expenses"),
        gr.Image(label="Feature Importance"),
        gr.Image(label="Model Accuracy"),
        gr.Textbox(label="Model Accuracies", lines=6),
        gr.Textbox(label="Recommendations", lines=10)
    ],
    title="📊 Personal Financial Analyzer (Large Dataset)",
    description="Click the button below to analyze your large personal finance data and get insights, visualizations, model accuracies, and recommendations."
)

# Start the app. In the recorded run below, Gradio detected the hosted
# notebook and enabled sharing automatically, printing a public URL.
iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://58a0d53e8056a4185b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [13]:
# Launch the same interface again with sharing requested explicitly; the
# recorded output shows a new public URL, different from the earlier launch.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4b29a43a8e3875f684.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


