In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="
    background-color: #f0f4f8; 
    padding: 30px; 
    border-radius: 15px; 
    font-family: Arial, sans-serif; 
    position: relative;
    overflow: hidden;
">
   

 <!-- Content -->
<div style="position: relative; z-index: 1;">
        <h1 style="color:#2c3e50; text-align:center;">🎬 Welcome to the Cinema Audience Forecasting Notebook</h1>
        <p style="font-size:16px; color:#34495e; text-align:center; margin-top:10px;">
            In this notebook, we explore and forecast <strong>daily cinema audience counts</strong> across multiple theatres using both online and point-of-sale booking data.
        </p>

<hr style="margin:25px 0; border:1px solid #dfe6e9;">

 <h2 style="color:#2c3e50; text-align:center;">📂 Challenge Overview</h2>
        <p style="color:#34495e; font-size:15px;">
            The goal of the <strong>Cinema Audience Forecasting Challenge</strong> is to accurately predict daily theatre audience counts using historical booking and visit data.
        </p>

<h2 style="color:#2c3e50; text-align:center;">🗂️ Provided Datasets</h2>
        <ul style="color:#34495e; font-size:15px;">
            <li><strong>cinePOS_theaters.csv</strong> – CinePOS theatre info</li>
            <li><strong>booknow_theaters.csv</strong> – BookNow theatre info</li>
            <li><strong>movie_theater_id_relation.csv</strong> – Mapping between BookNow and CinePOS theatres</li>
            <li><strong>cinePOS_booking.csv</strong> – CinePOS bookings</li>
            <li><strong>booknow_booking.csv</strong> – BookNow bookings</li>
            <li><strong>booknow_visits.csv</strong> – Daily audience counts</li>
            <li><strong>date_info.csv</strong> – Calendar information</li>
            <li><strong>sample_submission.csv</strong> – Submission format</li>
        </ul>
<h4 style="color:#2c3e50; text-align:center;">We will mostly use the booknow_visits dataset, as it provides the most complete information.</h4>
<h2 style="color:#2c3e50; text-align:center; margin-top:30px;">🚀 Let's start exploring and forecasting!</h2>
    </div>
</div>


In [None]:
# Root folder of the competition data. Every table lives at <DATA_DIR>/<name>/<name>.csv.
DATA_DIR = '/kaggle/input/Cinema_Audience_Forecasting_challenge'


def load_table(name, data_dir=DATA_DIR):
    """Read one competition table into a DataFrame.

    Parameters
    ----------
    name : str
        Table name; used as both the sub-folder and the CSV file stem.
    data_dir : str, optional
        Folder containing one sub-folder per table. Defaults to ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<data_dir>/<name>/<name>.csv``.
    """
    return pd.read_csv(os.path.join(data_dir, name, name + '.csv'))


bnbook = load_table('booknow_booking')
bntheater = load_table('booknow_theaters')
bnvisit = load_table('booknow_visits')
cpbook = load_table('cinePOS_booking')
cptheater = load_table('cinePOS_theaters')
dateinfo = load_table('date_info')
mtr = load_table('movie_theater_id_relation')
ss = load_table('sample_submission')

In [None]:
#libraries:

import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Registry of every loaded table, keyed by the short alias used in this notebook.
ds = dict(
    bnbook=bnbook,
    bntheater=bntheater,
    bnvisit=bnvisit,
    cpbook=cpbook,
    cptheater=cptheater,
    dateinfo=dateinfo,
    mtr=mtr,
    ss=ss,
)

In [None]:
# One-line takeaway per table, keyed by the same aliases as `ds`.
insights = dict(
    bnbook="Bookings dataset with show & booking timestamps. 68k rows.",
    bntheater="Theater metadata; missing book_theater_id for more than 50% rows.",
    bnvisit="Visit-level audience counts; 214k rows across show dates.",
    cpbook="CinePOS booking data; large dataset with 1.6M rows.",
    cptheater="CinePOS theater metadata; latitude/longitude missing in many rows.",
    dateinfo="Calendar mapping of show_date to day_of_week.",
    mtr="Mapping table linking book_theater_id and cine_theater_id.",
    ss="Synthetic dataset containing ID + audience_count for submission.",
)


In [None]:
def inspect_datasets(ds):
    """Print a structural overview of every DataFrame in ``ds``.

    For each ``name -> DataFrame`` pair this prints the shape and column list,
    renders the first rows, prints the ``DataFrame.info()`` summary and then
    the matching entry of the module-level ``insights`` dict.

    Parameters
    ----------
    ds : dict
        Mapping of dataset alias to DataFrame.

    Returns
    -------
    None
    """
    for name, df in ds.items():
        print("\n" + "="*120)
        print(f" Dataset: {name} {df.shape}")
        print(f" Columns: {list(df.columns)}\n")

        display(df.head())
        # info() prints its own report and returns None, so it must not be
        # passed to display(): that rendered a stray "None" and emitted the
        # report before the head.
        df.info()

        # Fall back to a placeholder so a table without a recorded insight
        # does not abort the whole inspection with a KeyError.
        print("\n📌 INSIGHT:", insights.get(name, "(no insight recorded)"))
        print("="*120)

  

In [None]:
# --- bnbook ---
# Both booking tables name the show timestamp 'show_datetime'; rename it to
# 'show_date' and parse both timestamp columns. The rename is done in place so
# the same DataFrame objects stored in `ds` pick up the change.
for booking in (bnbook, cpbook):
    booking.rename(columns={'show_datetime': 'show_date'}, inplace=True)
    for stamp_col in ('show_date', 'booking_datetime'):
        # errors='coerce' turns unparseable values into NaT instead of raising.
        booking[stamp_col] = pd.to_datetime(booking[stamp_col], errors='coerce')

# The visits and calendar tables only need their 'show_date' column parsed.
for dated in (bnvisit, dateinfo):
    dated['show_date'] = pd.to_datetime(dated['show_date'], errors='coerce')


In [None]:
# Print shape, columns, head, info() and the recorded insight for every table in `ds`.
inspect_datasets(ds)


<h1 style="color:#2D89C1; font-family:Arial, sans-serif; text-align:center;">
    📊 Analysing BOOKNOW VISIT
</h1>


In [None]:
# Number of visit records per theater, drawn as a bar chart (most frequent first).
visit_counts = bnvisit['book_theater_id'].value_counts()
ax = visit_counts.plot(kind='bar', figsize=(20, 5))
ax.set_title("Frequency of book_theater_id")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()


It can be seen that the number of data points is not consistent across theaters: some theaters have more than 400 records, about one third have more than 200, most have more than 100, and a few have fewer than that.


In [None]:
# Hand-picked theaters whose time series illustrate the patterns discussed
# in the observations that follow this cell.
theaters_to_plot = [
    'book_00001', 'book_00013', 'book_00014', 'book_00023',
    'book_00024', 'book_00044', 'book_00056', 'book_00070', 'book_00105',
    'book_00122', 'book_00131', 'book_00152', 'book_00156', 'book_00228'
]
for theater in theaters_to_plot:
    # A descriptive name avoids clobbering a generic global `df`.
    theater_visits = bnvisit[bnvisit['book_theater_id'] == theater].sort_values('show_date')
    if theater_visits.empty:
        # Nothing recorded for this id; skip rather than draw an empty chart.
        continue

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(theater_visits['show_date'], theater_visits['audience_count'], linewidth=1)
    ax.scatter(theater_visits['show_date'], theater_visits['audience_count'], s=12)
    ax.set_title(f"Audience Count Over Time — {theater}")
    ax.set_xlabel("Show Date")
    ax.set_ylabel("Audience Count")
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)


Initially, graphs were created for every booking theater ID. The following examples highlight the key patterns observed, which are representative of the broader dataset.

**Observations:**

1. The graphs reveal the structure and completeness of the data, while also highlighting several inconsistencies.
2. Theater IDs such as **00228, 00152, 00122, 00070, and 00044** show missing data points over certain periods, potentially indicating temporary closures or corrupted/missing records.
3. Some theaters, such as **00105**, contain only one or two data points across the entire timeline, limiting their analytical value.
4. Most graphs resemble **00001, 00013, and 00014**, with data concentrated around the midpoint and occasional extreme values—possibly reflecting weekend spikes.
5. A few theaters, including **00023 and 00131**, have data clustered at the lower range with 2–4 notable outliers. This may be due to mistyped entries, private bookings, or special events.
6. Certain patterns (clearly visible in **00024** and partially in other graphs) suggest an average mid-period dip in audience count, which could be attributed to seasonality, fewer film releases, increased competition, maintenance, or internal operational factors.



<h2 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    📊 Analysing BNVisit
</h2>


In [None]:
# -----------------------------
# Step 1: Feature Engineering
# -----------------------------

def _add_calendar_columns(frame):
    """Add day, month, year, day_of_week and weekend columns derived from ``frame['show_date']`` (in place)."""
    when = frame['show_date'].dt
    frame['day'] = when.day
    frame['month'] = when.month
    frame['year'] = when.year
    frame['day_of_week'] = when.dayofweek
    # dayofweek codes 5 and 6 are Saturday and Sunday.
    frame['weekend'] = frame['day_of_week'].isin([5, 6]).astype(int)


bnvisit['show_date'] = pd.to_datetime(bnvisit['show_date'])
_add_calendar_columns(bnvisit)

# Split each submission ID on underscores: the last token is parsed as the
# show date, everything before it is re-joined as the theater id.
id_parts = ss['ID'].apply(lambda raw: raw.split('_'))
ss['book_theater_id'] = id_parts.apply(lambda parts: '_'.join(parts[:-1]))
ss['show_date'] = pd.to_datetime(id_parts.apply(lambda parts: parts[-1]), errors='coerce')
_add_calendar_columns(ss)



<h2 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Day-of-week-wise Audience Count
</h2>


In [None]:
# Weekday labels in calendar order; the list position equals the dayofweek code (Monday = 0).
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
day_map = dict(enumerate(weekday_names))
bnvisit['day_name'] = bnvisit['day_of_week'].map(day_map)

# Mean audience per weekday, re-ordered from alphabetical to Monday..Sunday.
day_avg = (
    bnvisit.groupby('day_name')['audience_count']
    .mean()
    .reindex(weekday_names)
)

# Bar chart of the weekday means
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=day_avg.index, y=day_avg.values, palette='viridis', ax=ax)
ax.set_title('Average Audience Count by Day of Week')
ax.set_ylabel('Average Audience Count')
ax.set_xlabel('Day of Week')
plt.show()





1> It can be observed that the audience count on weekends is higher than on weekdays, with Sunday recording the highest footfall.
Speculations: Major film releases typically happen on Fridays. People are generally freer on weekends and may plan recreational activities like going to the movies. However, other events scheduled over the weekend could also act as competitors for cinema attendance.

2> Mondays also show a high audience count — even higher than Saturdays.
Speculations: This could be due to the trailing effect of new Friday releases. But why is it higher than Saturdays? Could cheaper ticket prices be a factor? Initial box office reports, Instagram buzz, or FOMO from those who watched the film on Sunday might also influence decisions.
Additionally, who is more likely to be free on Mondays — students or unemployed individuals?
The sharp drop from Monday to Tuesday is still an open question (possibly explained in point 3).

3> Thursdays show a slightly higher audience count.
Speculation: With new movies releasing on Fridays, theaters may begin clearing slots for upcoming films. This could create a “last chance to watch” effect for movies that are about to be removed.




<h2 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Month-wise Audience count
</h2>


In [None]:
# Average audience per calendar month (index = month number, ascending).
month_avg = bnvisit.groupby('month')['audience_count'].mean()

# Build tick labels from the months actually present. A fixed list of twelve
# labels would be shifted against the bars if any month had no data.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
tick_labels = [month_labels[int(m) - 1] for m in month_avg.index]

plt.figure(figsize=(10,6))
sns.barplot(x=month_avg.index, y=month_avg.values, palette='coolwarm')
plt.title('Average Audience Count by Month')
plt.ylabel('Average Audience Count')
plt.xlabel('Month')
plt.xticks(range(len(tick_labels)), tick_labels)
plt.show()

Average audience count rises gradually from March, declines through August, and reaches its peak in December.
December peak: Likely driven by the holiday season, with major film releases around Christmas and New Year attracting larger audiences.
Increase from March: No clear driver identified; further analysis is required to determine the underlying factors.


<h3 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Average audience per booking theater
</h3>


In [None]:
# Mean audience per theater, attached to both frames as the 'avg_audience' feature.
# NOTE(review): the mean is computed over every row of bnvisit, including rows
# that the later train/validation split assigns to validation, so validation
# scores are optimistic. Consider computing it from training rows only.
theater_avg = (
    bnvisit.groupby('book_theater_id')['audience_count']
    .mean()
    .rename('avg_audience')
    .reset_index()
)

# Drop any previous 'avg_audience' first so re-running this cell does not
# produce avg_audience_x / avg_audience_y columns.
bnvisit = (
    bnvisit.drop(columns='avg_audience', errors='ignore')
    .merge(theater_avg, on='book_theater_id', how='left')
)
ss = (
    ss.drop(columns='avg_audience', errors='ignore')
    .merge(theater_avg, on='book_theater_id', how='left')
)

# Theaters absent from bnvisit get the global mean. Assign the result back:
# fillna(inplace=True) on a selected column is chained assignment and does not
# update the frame under pandas Copy-on-Write.
ss['avg_audience'] = ss['avg_audience'].fillna(bnvisit['audience_count'].mean())


<h2 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Scaling & Encoding
</h2>


In [None]:
# Integer-code the theater ids found in the visits table.
le = LabelEncoder()
bnvisit['theatre_encoded'] = le.fit_transform(bnvisit['book_theater_id'])

# Apply the same id -> code mapping to the submission rows; ids that do not
# occur in the visits table are coded as -1.
theater_codes = dict(zip(le.classes_, le.transform(le.classes_)))
ss['theatre_encoded'] = ss['book_theater_id'].map(theater_codes).fillna(-1).astype(int)

# Feature matrix / target for training, and the matching submission features
features = ['theatre_encoded', 'day', 'month', 'year', 'day_of_week', 'weekend', 'avg_audience']
X, y = bnvisit[features], bnvisit['audience_count']
X_test = ss[features]

# Standardise with statistics learned from the visits features only
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)


<h2 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Initial Model Training & Comparison
</h2>


In [None]:
# Hold out the final 20% of rows for validation. shuffle=False keeps the
# existing row order, so the split is positional rather than random.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, shuffle=False
)

# Candidate regressors with default hyperparameters and a fixed seed.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'ExtraTrees': ExtraTreesRegressor(random_state=42),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42, verbose=-1, force_col_wise=True)
}

# Validation R² per model name; later cells rank the candidates with it.
initial_results = {}

for name, model in models.items():
    print(f"\n⏳ Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    initial_results[name] = r2
    print(f"✅ {name} | Validation R²: {r2:.4f} | RMSE: {rmse:.2f}")

<h3 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Hyperparameter Tuning

</h3>


In [None]:
# NOTE(review): this import sits outside the notebook's main import cell; consider moving it there.
import warnings
# Silences every warning category for the rest of the session, including any
# raised during the hyperparameter search that follows.
warnings.filterwarnings('ignore')

In [None]:
# Rank the initial models by validation R² and keep the three best.
top3 = sorted(initial_results, key=initial_results.get, reverse=True)[:3]
print("\n🏆 Top 3 models for hyperparameter tuning:", top3)

# -----------------------------
# Define grids
# -----------------------------
# Every model in `models` needs an entry here: any of the five can end up in
# `top3`, and a missing grid would raise a KeyError in the tuning loop.
param_grids = {
    'RandomForest': {
        'n_estimators': [200, 300, 400, 500],
        'max_depth': [5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 1.0 (all features) replaces 'auto', which scikit-learn removed;
        # for regressors 'auto' meant "use every feature".
        'max_features': [1.0, 'sqrt', 'log2'],
        'bootstrap': [True, False]
    },
    'ExtraTrees': {
        'n_estimators': [200, 300, 400, 500],
        'max_depth': [5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [1.0, 'sqrt', 'log2'],
        'bootstrap': [True, False]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
        'loss': ['linear', 'square', 'exponential']
    },
    'LightGBM': {
        'n_estimators': [200, 300, 500, 800],
        'learning_rate': [0.005, 0.01, 0.05, 0.1],
        'num_leaves': [31, 64, 128, 256],
        'max_depth': [-1, 5, 7, 9, 12],
        'subsample': [0.6, 0.7, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
        'min_child_samples': [20, 40, 80, 160],
        'reg_alpha': [0, 0.01, 0.1, 1],
        'reg_lambda': [0, 0.01, 0.1, 1]
    },
    'XGBoost': {
        'n_estimators': [200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0]
    }
}

# Hyperparameter tuning

# Best estimator found for each tuned model, keyed by model name.
best_models = {}

for name in top3:
    print(f"\n RandomizedSearchCV tuning {name}...")
    model = models[name]
    grid = param_grids[name]

    # 40 random draws from the grid, each scored by 2-fold cross-validated R².
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=grid,
        n_iter=40,
        scoring='r2',
        cv=2,
        n_jobs=-1,
        verbose=1,
        random_state=42
    )
    search.fit(X_train, y_train)

    best_models[name] = search.best_estimator_
    print(f"✅ Best {name} parameters:", search.best_params_)

<h3 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Validation scores after hyperparameter tuning
</h3>


In [None]:
# Validation R² of each tuned model, keyed by model name.
tuned_results = {}
print("\n📊 Validation scores after hyperparameter tuning:")
for name, model in best_models.items():
    y_val_pred = model.predict(X_valid)
    r2 = r2_score(y_valid, y_val_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_val_pred))
    tuned_results[name] = r2
    print(f"{name} | R²: {r2:.4f} | RMSE: {rmse:.2f}")


<h3 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
   Visual comparison of the models
</h3>


In [None]:
# Figure 1: validation R² of every initial model.
plt.figure(figsize=(12,6))
sns.barplot(x=list(initial_results.keys()), y=list(initial_results.values()), palette="viridis")
plt.title("Initial Model R² Scores (Before Hyperparameter Tuning)")
plt.ylabel("R² Score")
# R² can be negative; extend the lower limit when needed so such bars are
# not silently clipped at zero.
plt.ylim(min([0, *initial_results.values()]), 1)
plt.show()

# Figure 2: initial vs tuned R² for the models that were tuned.
plt.figure(figsize=(12,6))
x_labels = list(tuned_results.keys())
initial_vals = [initial_results[name] for name in x_labels]
tuned_vals = [tuned_results[name] for name in x_labels]
positions = np.arange(len(x_labels))

# Two 0.4-wide bars per model, centred on the model's tick position.
plt.bar(positions - 0.2, initial_vals, width=0.4, label="Initial", color="skyblue")
plt.bar(positions + 0.2, tuned_vals, width=0.4, label="Tuned", color="orange")
plt.xticks(positions, x_labels)
plt.ylabel("R² Score")
plt.title("Top 3 Models: Before vs After Hyperparameter Tuning")
plt.legend()
plt.show()

<h3 style="color:#2E86C1; font-family:Arial, sans-serif; text-align:center;">
    Predicting and Submission 
</h3>


In [None]:
# Pick the tuned model with the highest validation R²
best_model_name = max(tuned_results, key=tuned_results.get)
best_model = best_models[best_model_name]

print(f"\n🏆 Best Model Selected: {best_model_name}")


In [None]:
# Report the selected model's metrics on the held-out validation rows.
print("\n Evaluating best model performance...")
y_val_pred = best_model.predict(X_valid)

r2 = r2_score(y_valid, y_val_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_val_pred))

print(f"{best_model_name} | R²: {r2:.4f} | RMSE: {rmse:.2f}")


In [None]:
print("\n Predicting and saving submission file...")

# A regressor can output values below zero, but an audience count cannot be
# negative, so clip at 0 before rounding to whole people.
raw_predictions = best_model.predict(X_test_scaled)
ss['audience_count'] = np.clip(raw_predictions, 0, None).round().astype(int)

# The submission file keeps only the ID and the predicted count.
final = ss[['ID', 'audience_count']]
final.to_csv('best_model_scaled_predictions.csv', index=False)

print("✅ Saved submission to best_model_scaled_predictions.csv")

# Preview the first rows instead of rendering the whole submission frame.
final.head()
