# Bollywood Data Exploration
This notebook contains a small exploratory analysis and instructions to run the Streamlit dashboard '🎥 Balancing Fame and Talent in Bollywood'.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
# Read the actor ranking CSV; literal NULL/null tokens are parsed as missing values
df = pd.read_csv(
    'data/BollywoodActorRanking (2).csv',
    na_values=['NULL', 'null'],
)

# Standardise the actor column: rename `actorName` to `actor` unless `actor` already exists
if 'actor' not in df.columns and 'actorName' in df.columns:
    df = df.rename(columns={'actorName': 'actor'})

# Exact duplicate rows add nothing to the analysis
df = df.drop_duplicates()

# Coerce the count/score columns to numbers; unparseable entries become NaN
for c in ('movieCount', 'ratingSum', 'googleHits'):
    if c not in df.columns:
        continue
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Keep only rows that have an actor, both core metrics, and a positive movie count
df = (
    df.dropna(subset=['actor', 'movieCount', 'ratingSum'])
      .loc[lambda frame: frame['movieCount'] > 0]
      .copy()
)
# Average rating per movie: ratingSum spread over movieCount
df['avgRating'] = df['ratingSum'] / df['movieCount']

# The three 0-1 scaled columns that the KPI formulas consume
NORMALIZED_COLS = ['normalizedMovieRank', 'normalizedGoogleRank', 'normalizedRating']

# Raw column used as a stand-in when the dataset does not ship a normalized column
PROXY_SOURCE = {
    'normalizedMovieRank': 'movieCount',
    'normalizedGoogleRank': 'googleHits',
    'normalizedRating': 'avgRating',
}


def _min_max_scale(values):
    """Rescale a numeric Series to the 0-1 range using min-max scaling.

    Parameters
    ----------
    values : pandas.Series
        Numeric values; NaNs are allowed and stay NaN in the result.

    Returns
    -------
    pandas.Series
        ``(values - min) / (max - min)``. When the Series has no spread
        (all values equal) or no valid values at all, the division would be
        0/0, so every non-missing entry maps to 0.0 instead of NaN.
    """
    lo, hi = values.min(), values.max()
    span = hi - lo
    if pd.isna(span) or span == 0:
        # Constant or all-NaN input: avoid 0/0 and keep missing entries missing
        return pd.Series(0.0, index=values.index).where(values.notna())
    return (values - lo) / span


for col in NORMALIZED_COLS:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # A maximum above ~1 means the column is not on a 0-1 scale yet, so rescale it
        if df[col].max() > 1.1:
            df[col] = _min_max_scale(df[col])
    elif PROXY_SOURCE[col] in df.columns:
        # Column missing from the dataset: derive it from the matching raw metric
        df[col] = _min_max_scale(df[PROXY_SOURCE[col]])
    else:
        # No raw metric to derive from either; fall back to a neutral constant
        df[col] = 0.0

# Fill remaining NaNs with the column mean; if the mean itself is undefined
# (column entirely NaN), use 0.0 so the scores computed from it stay finite
for col in NORMALIZED_COLS:
    col_mean = df[col].mean()
    df[col] = df[col].fillna(0.0 if pd.isna(col_mean) else col_mean)
# Derived scores:
#   fameScore    - equal-weight blend of the Google-hits and movie-count ranks
#   talentScore  - the normalized rating, used as-is
#   balanceScore - 1 minus the absolute gap between fame and talent
df = df.assign(
    fameScore=lambda d: d['normalizedGoogleRank'].mul(0.5).add(d['normalizedMovieRank'].mul(0.5)),
    talentScore=lambda d: d['normalizedRating'],
    balanceScore=lambda d: d['fameScore'].sub(d['talentScore']).abs().rsub(1),
)
df.head()

Unnamed: 0,actorId,actor,movieCount,ratingSum,normalizedMovieRank,googleHits,normalizedGoogleRank,normalizedRating,avgRating,fameScore,talentScore,balanceScore
0,373,Aamir Khan,11,1170,0.938736,2460000,0.704759,1.0,106.363636,0.821747,1.0,0.821747
1,374,Shah Rukh Khan,23,2000,0.727788,2670000,0.764953,0.914082,86.956522,0.746371,0.914082,0.832288
2,375,Salman Khan,36,2340,0.48913,3490000,1.0,0.912023,65.0,0.744565,0.912023,0.832542
3,376,Katrina Kaif,17,1640,0.831202,2120000,0.6073,0.883169,96.470588,0.719251,0.883169,0.836082
4,377,Deepika Padukone,16,1080,0.516304,3000000,0.859546,0.847462,67.5,0.687925,0.847462,0.840463


In [2]:
# Sanity checks on the cleaned frame: shape, missing values, duplicate rows
missing_per_column = df.isna().sum()
duplicate_rows = df.duplicated().sum()

print('rows, cols:', df.shape)
print('nulls (per column):', missing_per_column, sep='\n')
print('duplicates:', duplicate_rows)

# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

rows, cols: (299, 12)
nulls (per column):
actorId                 0
actor                   0
movieCount              0
ratingSum               0
normalizedMovieRank     0
googleHits              0
normalizedGoogleRank    0
normalizedRating        0
avgRating               0
fameScore               0
talentScore             0
balanceScore            0
dtype: int64
duplicates: 0


Unnamed: 0,actorId,actor,movieCount,ratingSum,normalizedMovieRank,googleHits,normalizedGoogleRank,normalizedRating,avgRating,fameScore,talentScore,balanceScore
count,299.0,299,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
unique,,299,,,,,,,,,,
top,,Pankaj Berry,,,,,,,,,,
freq,,1,,,,,,,,,,
mean,522.0,,10.301003,421.939799,0.196552,376691.5,0.170437,0.21345,38.082806,0.183495,0.21345,0.91531
std,86.458082,,8.495737,447.144838,0.180774,666234.1,0.172225,0.194136,16.631251,0.149054,0.194136,0.022171
min,373.0,,4.0,80.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.821747
25%,447.5,,5.0,150.0,0.054348,0.0,0.051788,0.078664,25.0,0.085219,0.078664,0.912111
50%,522.0,,7.0,260.0,0.152174,76900.0,0.170437,0.164459,34.0,0.133599,0.164459,0.919357
75%,596.5,,12.5,500.0,0.291227,385500.0,0.170437,0.282362,46.792929,0.236412,0.282362,0.928264


In [3]:
# Leaderboards: the ten highest-scoring actors for each KPI.
# Each entry is (heading to print, column to rank by, columns to show).
leaderboards = [
    ('Top 10 by fameScore', 'fameScore',
     ['actor', 'fameScore', 'talentScore', 'avgRating']),
    ('Top 10 by talentScore', 'talentScore',
     ['actor', 'talentScore', 'fameScore', 'avgRating']),
    ('Top 10 by balanceScore (most balanced)', 'balanceScore',
     ['actor', 'balanceScore', 'fameScore', 'talentScore']),
]

for heading, rank_by, shown_cols in leaderboards:
    print(heading)
    ranked = df.sort_values(rank_by, ascending=False)
    display(ranked[shown_cols].head(10))

Top 10 by fameScore


Unnamed: 0,actor,fameScore,talentScore,avgRating
0,Aamir Khan,0.821747,1.0,106.363636
1,Shah Rukh Khan,0.746371,0.914082,86.956522
2,Salman Khan,0.744565,0.912023,65.0
3,Katrina Kaif,0.719251,0.883169,96.470588
4,Deepika Padukone,0.687925,0.847462,67.5
5,Hrithik Roshan,0.651349,0.805771,70.0
6,Priyanka Chopra,0.640132,0.792984,55.277778
7,Kareena Kapoor,0.635627,0.78785,59.459459
8,Ranbir Kapoor,0.597414,0.744293,80.909091
9,Kajol,0.553834,0.694618,75.0


Top 10 by talentScore


Unnamed: 0,actor,talentScore,fameScore,avgRating
0,Aamir Khan,1.0,0.821747,106.363636
1,Shah Rukh Khan,0.914082,0.746371,86.956522
2,Salman Khan,0.912023,0.744565,65.0
3,Katrina Kaif,0.883169,0.719251,96.470588
4,Deepika Padukone,0.847462,0.687925,67.5
5,Hrithik Roshan,0.805771,0.651349,70.0
6,Priyanka Chopra,0.792984,0.640132,55.277778
7,Kareena Kapoor,0.78785,0.635627,59.459459
8,Ranbir Kapoor,0.744293,0.597414,80.909091
9,Kajol,0.694618,0.553834,75.0


Top 10 by balanceScore (most balanced)


Unnamed: 0,actor,balanceScore,fameScore,talentScore
30,Jacqueline Fernandez,0.975589,0.520001,0.49559
44,Aditya Roy Kapoor,0.962287,0.424893,0.38718
60,Rajkummar Rao,0.954686,0.370545,0.325231
64,Supriya Pathak,0.952787,0.356958,0.309744
65,Nawazuddin Siddiqui,0.952787,0.356958,0.309744
80,Shraddha Kapoor,0.947085,0.316197,0.263282
82,Manish Chaudhary,0.946452,0.311668,0.25812
86,Pavan Malhotra,0.945186,0.30261,0.247796
95,Mohammed Zeeshan Ayyub,0.943285,0.289023,0.232308
97,Genelia D'Souza,0.942651,0.284494,0.227146


In [4]:
# Correlation between the fame and talent scores
from scipy.stats import pearsonr

corr, p = pearsonr(df['fameScore'], df['talentScore'])
print(f'Pearson r = {corr:.3f}, p = {p:.3e}')

# Scatter plot: one point per actor, colored by how balanced fame and talent are
fig = px.scatter(
    df,
    x='fameScore',
    y='talentScore',
    hover_data=['actor', 'avgRating'],
    color='balanceScore',
    color_continuous_scale='Inferno',
)
fig.update_layout(width=800, height=500, title='Fame vs Talent (colored by balance)')
fig.show()

# Bar charts: top 10 comparison.
# Plotly draws the first category of a horizontal bar chart at the bottom, so the
# y-axis is reversed to put the highest-ranked actor at the top of each chart.
fig1 = px.bar(
    df.sort_values('fameScore', ascending=False).head(10),
    x='fameScore',
    y='actor',
    orientation='h',
    title='Top 10 by Fame',
)
fig1.update_yaxes(autorange='reversed')
fig1.show()

fig2 = px.bar(
    df.sort_values('talentScore', ascending=False).head(10),
    x='talentScore',
    y='actor',
    orientation='h',
    title='Top 10 by Talent',
)
fig2.update_yaxes(autorange='reversed')
fig2.show()

Pearson r = 0.918, p = 3.410e-121


## Run the Streamlit App
To run the dashboard locally, install the dependencies listed in `requirements.txt` and then start the app:
```bash
pip install -r requirements.txt
streamlit run streamlit_app.py
```