In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("Merged_DatasetV2.csv", dtype={'city': str, 'state': str, 'county': str, 'country': str})

# Drop unnecessary columns
#df.drop(columns=['Unnamed: 0', 'id', 'name', 'city', 'state', 'county'], inplace=True)
df.drop(columns=['Unnamed: 0', 'id', 'name', 'county'], inplace=True)

# Handle missing values (fill lifespan_years with median)
df.loc[:, 'lifespan_years'] = df['lifespan_years'].fillna(df['lifespan_years'].median())

# Split the 'services_offered' column by commas and explode the dataframe
df['services_offered'] = df['services_offered'].str.split(', ')
df = df.explode('services_offered')

# Replace NaN in 'services_offered' with 'Unknown'
df.loc[:, 'services_offered'] = df['services_offered'].fillna('Unknown')

# Impute missing values
for column in df.columns:
    if df[column].dtype in ['int64', 'float64']:
        df[column].fillna(0, inplace=True)
    elif df[column].dtype == 'object':
        df[column].fillna('Missing', inplace=True)

# Display the updated dataframe
df.head()



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna('Missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(0, inplace=True)


Unnamed: 0,type,city,state,country,lifespan_years,status,services_offered
0,Studio,Test Restaurant,New York,US,39.0,Active,Unknown
1,Venue,Victoria House Hotel,England,GB,39.0,Active,Unknown
2,Religious building,Genivolta,Missing,IT,359.0,Active,Unknown
3,Studio,Tenso Jinja,Missing,JP,39.0,Active,Unknown
4,Venue,Geislingen an der Steige,Missing,DE,39.0,Active,Unknown


In [2]:
# Encode target variable (services_offered)
label_encoder = LabelEncoder()
df['services_offered_encoded'] = label_encoder.fit_transform(df['services_offered'].fillna("Unknown"))

# Encode categorical features
categorical_columns = ['country', 'city', 'state']  # Adjust if needed

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Convert to string to avoid NaN issues



# Encode categorical features
df = pd.get_dummies(df, columns=['type', 'status'], drop_first=True)

df.head()

Unnamed: 0,city,state,country,lifespan_years,services_offered,services_offered_encoded,type_Club,type_Concert hall / Theatre,type_Educational institution,type_Festival stage,type_Indoor arena,type_Missing,type_Other,type_Park,type_Pressing plant,type_Religious building,type_Stadium,type_Studio,type_Venue,status_Ended
0,16685,253,186,39.0,Unknown,0,False,False,False,False,False,False,False,False,False,False,False,True,False,False
1,17973,95,60,39.0,Unknown,0,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,5925,230,85,359.0,Unknown,0,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,16669,230,88,39.0,Unknown,0,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,5896,230,43,39.0,Unknown,0,False,False,False,False,False,False,False,False,False,False,False,False,True,False


In [3]:
# Drop unnecessary columns
X = df.drop(columns=['services_offered', 'services_offered_encoded'])  
y = df['services_offered_encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [4]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
#print(classification_report(y_test, target_names=label_encoder.classes_))

Accuracy: 0.7553


In [5]:
from sklearn.metrics import f1_score

# Compute Macro F1 Score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Print the result
print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.0574


In [6]:
# Drop even more unnecessary columns
df.drop(columns=['city', 'state'], inplace=True)

# Drop unnecessary columns
X = df.drop(columns=['services_offered', 'services_offered_encoded'])  
y = df['services_offered_encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [7]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
#print(classification_report(y_test, target_names=label_encoder.classes_))

Accuracy: 0.7553


In [8]:
from sklearn.metrics import f1_score

# Compute Macro F1 Score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Print the result
print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.0574


In [9]:
# Drop even more unnecessary columns
df.drop(columns=['country'], inplace=True)

# Drop unnecessary columns
X = df.drop(columns=['services_offered', 'services_offered_encoded'])  
y = df['services_offered_encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [10]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
#print(classification_report(y_test, target_names=label_encoder.classes_))

Accuracy: 0.7553


In [11]:
from sklearn.metrics import f1_score

# Compute Macro F1 Score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Print the result
print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.0574


In [12]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86     10558
           1       0.00      0.00      0.00         8
           3       0.00      0.00      0.00         9
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00        69
           7       0.00      0.00      0.00       185
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00       637
          10       0.00      0.00      0.00        54
          11       0.00      0.00      0.00        52
          12       0.00      0.00      0.00      2351
          13       0.00      0.00      0.00         5
          15       0.00      0.00      0.00        23
          16       0.00      0.00      0.00        10

    accuracy                           0.76     13979
   macro avg       0.05      0.07      0.06     13979
weighted avg       0.57   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

# Define the macro F1 scorer
macro_f1_scorer = make_scorer(f1_score, average='macro')

# Perform cross-validation
scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring=macro_f1_scorer)

# Calculate mean and standard deviation
mean_macro_f1 = scores.mean()
std_macro_f1 = scores.std()

# Print results
print(f"Mean Macro F1 Score: {mean_macro_f1:.4f}")
print(f"Standard Deviation of Macro F1 Score: {std_macro_f1:.4f}")



Mean Macro F1 Score: 0.0545
Standard Deviation of Macro F1 Score: 0.0015
