In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("Merged_DatasetV2.csv", dtype={'city': str, 'state': str, 'county': str, 'country': str})
df.head()


Unnamed: 0.1,Unnamed: 0,id,type,name,city,state,county,country,lifespan_years,status,services_offered
0,0,f9a1fd95-4c6d-43a6-bb31-4ee8d5a44a8b,Studio,Columbia (CBS) Studio ‘B’,Test Restaurant,New York,,US,,Active,
1,1,42e9fa7c-a981-44f9-943c-bf39a8c6ce8d,Venue,New Theatre Oxford,Victoria House Hotel,England,Oxfordshire,GB,,Active,
2,2,34e7351d-715d-4b20-a5d6-3c0bd717eb1d,Religious building,Santuario della Beata Vergine di Ariadello,Genivolta,,,IT,359.0,Active,
3,3,0ad32768-514b-410f-9d5a-40ebb0c3373c,Studio,ROKU-st,Tenso Jinja,,,JP,,Active,
4,4,0b050655-48da-4be7-b5de-7a3ee7370b9e,Venue,Rätschenmühle,Geislingen an der Steige,,,DE,,Active,


In [4]:
# Drop rows where 'services_offered' is null
df = df.dropna(subset=['services_offered'])

# Drop unnecessary columns
df.drop(columns=['Unnamed: 0', 'id', 'name', 'county'], inplace=True)

df.head()

Unnamed: 0,type,city,state,country,lifespan_years,status,services_offered
15520,Other,Los Jarales,,VE,,Active,recorded at
15521,Venue,Plasky,Brussels Capital,BE,62.0,Active,recorded at
15522,Other,Dunluce Castle,Northern Ireland,GB,,Active,recorded at
15524,Studio,Saint Johns Eastern Orthodox Church,Tennessee,US,5.0,Ended,engineered at
15525,Venue,Shimokitazawa Eki,,JP,34.0,Active,recorded at


In [8]:
# Handle missing numerical values (fill lifespan_years with median)
df.loc[:, 'lifespan_years'] = df['lifespan_years'].fillna(df['lifespan_years'].median())

# Split the services_offered column by commas and explode the dataframe
df['services_offered'] = df['services_offered'].str.split(', ')
df = df.explode('services_offered')

# Trim any leading or trailing spaces (just in case)
df['services_offered'] = df['services_offered'].str.strip()

# Drop any empty values that might have been created
df = df[df['services_offered'].notna() & (df['services_offered'] != '')]

# Reset index after exploding
df = df.reset_index(drop=True)

# Impute missing values
for column in df.columns:
    if df[column].dtype in ['int64', 'float64']:
        df[column].fillna(0, inplace=True)
    elif df[column].dtype == 'object':
        df[column].fillna('Missing', inplace=True)

# Display the updated dataframe
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna('Missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(0, inplace=True)


Unnamed: 0,type,city,state,country,lifespan_years,status,services_offered
0,Other,Los Jarales,Missing,VE,42.0,Active,recorded at
1,Venue,Plasky,Brussels Capital,BE,62.0,Active,recorded at
2,Other,Dunluce Castle,Northern Ireland,GB,42.0,Active,recorded at
3,Studio,Saint Johns Eastern Orthodox Church,Tennessee,US,5.0,Ended,engineered at
4,Venue,Shimokitazawa Eki,Missing,JP,34.0,Active,recorded at


In [9]:
# Encode services_offered, the target variable
label_encoder = LabelEncoder()
df['services_offered_encoded'] = label_encoder.fit_transform(df['services_offered'].fillna("Unknown"))

# Encode categorical features
categorical_columns = ['country', 'city', 'state']

# Convert categorical columns to string to avoid NaN issues
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str)) 


# Encode categorical features with dummy columns
df = pd.get_dummies(df, columns=['type', 'status'], drop_first=True)

df.head()

Unnamed: 0,city,state,country,lifespan_years,services_offered,services_offered_encoded,type_Club,type_Concert hall / Theatre,type_Educational institution,type_Festival stage,type_Indoor arena,type_Missing,type_Other,type_Park,type_Pressing plant,type_Religious building,type_Stadium,type_Studio,type_Venue,status_Ended
0,2716,123,80,42.0,recorded at,11,False,False,False,False,False,False,True,False,False,False,False,False,False,False
1,3630,25,6,62.0,recorded at,11,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,1261,144,28,42.0,recorded at,11,False,False,False,False,False,False,True,False,False,False,False,False,False,False
3,4025,195,77,5.0,engineered at,6,False,False,False,False,False,False,False,False,False,False,False,True,False,True
4,4286,123,41,34.0,recorded at,11,False,False,False,False,False,False,False,False,False,False,False,False,True,False


In [10]:
# Drop the target column
X = df.drop(columns=['services_offered', 'services_offered_encoded'])  
y = df['services_offered_encoded']

# Split data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [12]:
# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.6732


In [14]:
from sklearn.metrics import f1_score

# Compute Macro F1 Score to compare various supervised learning models
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.0545


In [15]:
## In order to try to improve accuracy, drop extraneous columns to get more specific
# Drop city and state to focus on country only for location
df.drop(columns=['city', 'state'], inplace=True)

# Drop target column
X = df.drop(columns=['services_offered', 'services_offered_encoded'])  
y = df['services_offered_encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [16]:
# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.6729


In [17]:
# Compute Macro F1 Score to compare various supervised learning methods
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.0544


In [18]:
# Drop all location columns to focus model on venue type
df.drop(columns=['country'], inplace=True)

# Drop target column
X = df.drop(columns=['services_offered', 'services_offered_encoded'])  
y = df['services_offered_encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [19]:
# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.6732


In [20]:
# Compute Macro F1 Score to compare various supervised learning methods
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.0542


In [21]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00        51
           6       0.00      0.00      0.00       243
           7       0.00      0.00      0.00         2
           8       0.43      0.00      0.01       659
           9       0.00      0.00      0.00        46
          10       0.00      0.00      0.00        48
          11       0.67      1.00      0.80      2321
          12       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        23
          15       0.00      0.00      0.00         9

    accuracy                           0.67      3442
   macro avg       0.07      0.07      0.05      3442
weighted avg       0.54   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

# Define the macro F1 scorer
macro_f1_scorer = make_scorer(f1_score, average='macro')

# Perform cross-validation
scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring=macro_f1_scorer)

# Calculate mean and standard deviation
mean_macro_f1 = scores.mean()
std_macro_f1 = scores.std()

print(f"Mean Macro F1 Score: {mean_macro_f1:.4f}")
print(f"Standard Deviation of Macro F1 Score: {std_macro_f1:.4f}")



Mean Macro F1 Score: 0.0564
Standard Deviation of Macro F1 Score: 0.0019
