In [123]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

In [124]:
data = pd.read_csv('generated_data.csv')

In [125]:
X = data.drop(['creator_id', 'actual_attendance', 'num_tickets_sold'], axis=1)
y = data['actual_attendance']

In [126]:
X

Unnamed: 0,event_id,event_type,ticket_price,num_tickets_available,host,is_online,tmp_mid,tmp_min,tmp_max,precipitation,wind_speed
0,960,Sports & Fitness,12,72,Host5,True,18.488613,15.025891,21.951336,38.791795,3.477851
1,666,Business,22,157,Host4,False,25.820405,14.121679,37.519131,6.584357,3.649633
2,188,Performing & Visual Art,29,100,Host4,False,23.295563,19.917428,26.673699,18.679263,19.054128
3,895,Music,10,136,Host5,False,21.350742,17.080426,25.621059,18.753444,10.417700
4,214,Food & Drink,48,181,Host4,True,20.835105,6.324648,35.345562,21.222962,0.599593
...,...,...,...,...,...,...,...,...,...,...,...
495,345,Sports & Fitness,50,194,Host4,False,26.007491,17.563921,34.451062,49.084900,13.589877
496,227,Hobbies,14,118,Host2,True,23.963973,17.637955,30.289990,22.408920,19.571770
497,76,Business,41,117,Host5,False,28.017283,18.008876,38.025690,25.687657,19.477845
498,883,Sports & Fitness,48,63,Host3,True,19.871407,17.453526,22.289288,0.590704,1.020513


In [127]:
label_encoder = LabelEncoder()
X["host"] = label_encoder.fit_transform(X["host"])
X["event_type"] = label_encoder.fit_transform(X["event_type"])

In [128]:
X

Unnamed: 0,event_id,event_type,ticket_price,num_tickets_available,host,is_online,tmp_mid,tmp_min,tmp_max,precipitation,wind_speed
0,960,7,12,72,4,True,18.488613,15.025891,21.951336,38.791795,3.477851
1,666,0,22,157,3,False,25.820405,14.121679,37.519131,6.584357,3.649633
2,188,5,29,100,3,False,23.295563,19.917428,26.673699,18.679263,19.054128
3,895,4,10,136,4,False,21.350742,17.080426,25.621059,18.753444,10.417700
4,214,1,48,181,3,True,20.835105,6.324648,35.345562,21.222962,0.599593
...,...,...,...,...,...,...,...,...,...,...,...
495,345,7,50,194,3,False,26.007491,17.563921,34.451062,49.084900,13.589877
496,227,3,14,118,1,True,23.963973,17.637955,30.289990,22.408920,19.571770
497,76,0,41,117,4,False,28.017283,18.008876,38.025690,25.687657,19.477845
498,883,7,48,63,2,True,19.871407,17.453526,22.289288,0.590704,1.020513


In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [130]:
xgb_model = xgb.XGBRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}


In [131]:
best_model.fit(X_train, y_train)

In [132]:
cols_when_model_builds = best_model.get_booster().feature_names

In [133]:
cols_when_model_builds

['event_id',
 'event_type',
 'ticket_price',
 'num_tickets_available',
 'host',
 'is_online',
 'tmp_mid',
 'tmp_min',
 'tmp_max',
 'precipitation',
 'wind_speed']

In [134]:
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

Mean Squared Error: 361.2699182840822
R-squared Score: 0.4237636831089011


In [135]:
best_model.save_model('attendance_prediction_model.xgb')

In [152]:
def predict_attendance(event_id, event_type, ticket_price, num_tickets_available,
                       host, is_online, tmp_mid, tmp_min, tmp_max, precipitation,
                       wind_speed):
    # Load the saved best model
    best_model = xgb.XGBRegressor()
    best_model.load_model('attendance_prediction_model.xgb')

    # Create a DataFrame with the input data
    input_data = pd.DataFrame({
        'event_id': [event_id],
        'event_type': [event_type],
        'ticket_price': [ticket_price],
        'num_tickets_available': [num_tickets_available],
        'host': [host],
        'is_online': [is_online],
        'tmp_mid': [tmp_mid],
        'tmp_min': [tmp_min],
        'tmp_max': [tmp_max],
        'precipitation': [precipitation],
        'wind_speed': [wind_speed],
    })
    # input_data = input_data[cols_when_model_builds]

    label_encoder = LabelEncoder()
    input_data["host"] = label_encoder.fit_transform(input_data["host"])
    input_data["event_type"] = label_encoder.fit_transform(input_data["event_type"])

    # Predict the actual attendance using the loaded model
    attendance_prediction = best_model.predict(input_data)
    if int(attendance_prediction[0]) > int(num_tickets_available):
        attendance_prediction[0] = int(num_tickets_available)

    return attendance_prediction[0]

In [153]:
creator_id = 'some_creator_id'
event_id = 12345
event_type = 'Music'
ticket_price = 10.0
num_tickets_available = 100
host = 'Host Company'
is_online = True
tmp_mid = 25.0
tmp_min = 20.0
tmp_max = 30.0
precipitation = 10
wind_speed = 10.0

attendance_prediction = predict_attendance(event_id, event_type, ticket_price, num_tickets_available,
                                           host, is_online, tmp_mid, tmp_min, tmp_max,
                                           precipitation, wind_speed)

print("Predicted Attendance:", attendance_prediction)

Predicted Attendance: 33.746403
