In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
# Show matplotlib figures inline in the notebook output.
%matplotlib inline 
# Use matplotlib's 'default' style sheet, then set seaborn's colour palette to "husl".
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# Location of the aggregated PhonePe transaction CSV.
# Set the PHONEPE_TRANSACTION_CSV environment variable to point at your own
# copy of the file; when it is unset, the original local path is used so the
# notebook behaves exactly as before.
TRANSACTION_CSV = os.environ.get(
    'PHONEPE_TRANSACTION_CSV',
    '/Users/nishant/Desktop/Datasets/Phonepe/aggregated/Phonepe/phonepe_transaction.csv',
)
df = pd.read_csv(TRANSACTION_CSV)

In [None]:
# Normalise every column label to lowercase so later cells can rely on one spelling
lowercase_labels = df.columns.str.lower()
df.columns = lowercase_labels
print("Column names converted to lowercase:", df.columns, sep="\n")

In [None]:
# Print the DataFrame's column labels.
print(df.columns)

In [None]:
# Preview the first rows of the DataFrame (head() shows 5 rows by default).
df.head()

In [None]:
# Size of the DataFrame as a (rows, columns) tuple.
df.shape

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column;
# by default describe() reports on the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

1. Top 10 States by Transaction Amount

In [None]:
# Keep only rows whose state is not the literal 'All States'
# (presumably a country-wide aggregate row -- confirm against the dataset).
is_individual_state = df['state'] != 'All States'
df_filtered = df[is_individual_state]

# Sum the transaction amount per state and keep the ten largest totals.
state_amount_totals = df_filtered.groupby('state')['transaction_amount'].sum()
top_10_states_amount = state_amount_totals.sort_values(ascending=False).head(10)

# Express the totals in crores (1 crore = 1e7), rounded to two decimals.
top_10_states_amount_crores = top_10_states_amount.div(1e7).round(2)
print("Top 10 States by Transaction Amount (in Crores):", top_10_states_amount_crores, sep="\n")


In [None]:
# 2. Visualize the top 10 states by transaction amount
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(
    top_10_states_amount_crores.index,
    top_10_states_amount_crores.values,
    color='skyblue',
    edgecolor='navy',
    alpha=0.7,
)
ax.set_title('Top 10 States by Transaction Amount', fontsize=16, fontweight='bold')
ax.set_xlabel('State', fontsize=12)
ax.set_ylabel('Total Transaction Amount (in Crores)', fontsize=12)
# Slant the state names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

# Write each bar's value just above it (offset by 1% of the bar height).
for rect in bars:
    bar_top = rect.get_height()
    bar_centre = rect.get_x() + rect.get_width()/2.
    ax.text(bar_centre, bar_top + bar_top*0.01,
            f'{bar_top:.1f}', ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Total transaction amount per quarter, flattened to a two-column frame for plotting
amount_by_quarter = df_filtered.groupby('quarter')['transaction_amount'].sum()
quarterly_amount = amount_by_quarter.reset_index()

# Bar chart of the quarterly totals, coloured by the 'quarter' column
quarterly_amount_labels = {
    'transaction_amount': 'Total Transaction Amount',
    'quarter': 'Quarter',
}
fig_quarterly_amount = px.bar(
    quarterly_amount,
    x='quarter',
    y='transaction_amount',
    title='Transaction Amount by Quarter',
    labels=quarterly_amount_labels,
    color='quarter',
)
fig_quarterly_amount.show()

In [None]:
# Total number of transactions per quarter, flattened to a two-column frame for plotting
count_by_quarter = df_filtered.groupby('quarter')['transaction_count'].sum()
quarterly_count = count_by_quarter.reset_index()

# Bar chart of the quarterly totals, coloured by the 'quarter' column
quarterly_count_labels = {
    'transaction_count': 'Total Transaction Count',
    'quarter': 'Quarter',
}
fig_quarterly_count = px.bar(
    quarterly_count,
    x='quarter',
    y='transaction_count',
    title='Transaction Count by Quarter',
    labels=quarterly_count_labels,
    color='quarter',
)
fig_quarterly_count.show()

4. Most Common Transaction Types


In [None]:
# Total transaction count for each transaction category
count_by_category = df_filtered.groupby('transaction_category')['transaction_count'].sum()
transaction_category_counts = count_by_category.reset_index()

# Pie chart (with a small centre hole) showing each category's share of transactions
fig_pie_categories = px.pie(
    transaction_category_counts,
    names='transaction_category',
    values='transaction_count',
    title='Distribution of Transaction Categories',
    hole=0.05,
)
# Put the percentage and the category name inside each slice.
fig_pie_categories.update_traces(textposition='inside', textinfo='percent+label')
fig_pie_categories.show()

5. Distribution of Transaction Types in a Specific State


In [None]:
# State to drill into; the comparison below is an exact, case-sensitive match
state_to_analyze = 'Bihar'

# Rows belonging to the selected state only
in_selected_state = df_filtered['state'] == state_to_analyze
state_df = df_filtered[in_selected_state]

# Total transaction count per 'transaction_category' within that state
state_count_by_category = state_df.groupby('transaction_category')['transaction_count'].sum()
state_transaction_counts = state_count_by_category.reset_index()

# Pie chart of the category mix for the selected state
fig_state_pie = px.pie(
    state_transaction_counts,
    names='transaction_category',
    values='transaction_count',
    title=f'Distribution of Transaction Types in {state_to_analyze}',
    hole=0.05,
)
# Put the percentage and the category name inside each slice.
fig_state_pie.update_traces(textposition='inside', textinfo='percent+label')
fig_state_pie.show()

6. Transaction Amount vs. Transaction Count by State

In [None]:
# One row per state holding the summed transaction amount and summed transaction count
summary_metrics = ['transaction_amount', 'transaction_count']
state_summary = df_filtered.groupby('state')[summary_metrics].sum().reset_index()

In [None]:
# Grouped bar chart comparing each state's total amount with its total count
comparison_metrics = ['transaction_amount', 'transaction_count']
fig_bar_comparison = px.bar(
    state_summary,
    x='state',
    y=comparison_metrics,
    title='Transaction Amount vs. Transaction Count by State',
    labels={'value': 'Transaction Amount / Count', 'variable': 'Metric'},
    barmode='group',  # draw the two metrics side by side for each state
    log_y=True,       # log scale so both metrics stay visible on one shared axis
)
# Order the states along the x-axis by their total value, largest first.
fig_bar_comparison.update_layout(xaxis=dict(categoryorder='total descending'))
fig_bar_comparison.show()

7. Average Transaction Value (ATV) by State

In [None]:
# Average transaction value (ATV) per state = total amount divided by total count
amount_per_state = state_summary['transaction_amount']
count_per_state = state_summary['transaction_count']
state_summary['average_transaction_value'] = amount_per_state.div(count_per_state)

# States ordered from highest to lowest ATV
atv_sorted = state_summary.sort_values('average_transaction_value', ascending=False)


In [None]:
# Get the top 10 and bottom 10 states by ATV (atv_sorted is ordered highest first)
top_10_atv = atv_sorted.head(10)
bottom_10_atv = atv_sorted.tail(10)

# Combine them into a single dataframe for plotting.
# When fewer than 20 states are present, the head and tail slices overlap and
# the same state would appear twice; px.bar would then stack the repeated rows
# and show a doubled bar. Keeping the first occurrence of each state avoids
# that and changes nothing when the two slices are disjoint.
atv_comparison = pd.concat([top_10_atv, bottom_10_atv]).drop_duplicates(subset='state')

# Create the bar chart, colouring each bar by its ATV on the 'Plasma' scale
fig_atv_bar = px.bar(atv_comparison,
                     x='state',
                     y='average_transaction_value',
                     color='average_transaction_value',
                     color_continuous_scale='Plasma',
                     title='Top and Bottom 10 States by Average Transaction Value (ATV)',
                     labels={'average_transaction_value': 'Average Transaction Value', 'state': 'State'})

# Order the states along the x-axis from highest to lowest ATV.
fig_atv_bar.update_layout(xaxis={'categoryorder':'total descending'})
fig_atv_bar.show()

8. Correlation Analysis of Key Metrics

In [None]:
# --- Correlation matrix of the per-state metrics ---

# Restrict the per-state summary to the three metrics being compared
metric_columns = ['transaction_amount', 'transaction_count', 'average_transaction_value']
correlation_data = state_summary[metric_columns]

# Pairwise correlation between the metrics (DataFrame.corr with its default method)
correlation_matrix = correlation_data.corr()

# Show the raw numbers as well as the chart
print("Correlation Matrix:", correlation_matrix, sep="\n")


# --- Heatmap of the correlation matrix ---

heatmap_options = dict(
    text_auto=True,                    # write each correlation value in its cell
    aspect="auto",
    color_continuous_scale='RdBu_r',   # diverging colour scale
    title='Correlation Matrix of Transaction Metrics',
)
fig_corr_heatmap = px.imshow(correlation_matrix, **heatmap_options)

fig_corr_heatmap.show()