In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the merged dataset and clean it in a single pass.
# The location columns are read as strings via `dtype`; every one of them
# except 'country' is discarded again in the `drop` step below.
df = (
    pd.read_csv(
        "Merged_DatasetV2.csv",
        dtype={'city': str, 'state': str, 'county': str, 'country': str},
    )
    # Index artefact, identifiers and fine-grained location fields are not used as features.
    .drop(columns=['Unnamed: 0', 'id', 'name', 'city', 'state', 'county'])
    .assign(
        # Missing lifespans are imputed with the column median.
        lifespan_years=lambda frame: frame['lifespan_years'].fillna(
            frame['lifespan_years'].median()
        ),
        # Missing service labels become their own 'Unknown' category.
        services_offered=lambda frame: frame['services_offered'].fillna('Unknown'),
    )
    # Any row that still contains a null in another column is removed.
    .dropna()
)


In [2]:
# Integer-encode the target ('services_offered') into a new column and
# one-hot encode the categorical predictors in one expression.
# `label_encoder` is kept so the integer codes can be mapped back to labels later.
label_encoder = LabelEncoder()
df = pd.get_dummies(
    df.assign(
        services_offered_encoded=label_encoder.fit_transform(
            df['services_offered'].fillna("Unknown")
        )
    ),
    columns=['type', 'status', 'country'],
    # drop_first removes one dummy level per encoded column.
    drop_first=True,
)

In [5]:
# Target is the integer-encoded label; features are every other column
# except the raw and encoded versions of the target.
y = df['services_offered_encoded']
X = df.drop(columns=['services_offered', 'services_offered_encoded'])

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression on the scaled training data and predict the held-out rows.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [6]:

# Evaluation on the held-out split: overall accuracy first.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Per-class precision / recall / F1.
# `labels` is passed explicitly so that `target_names` always lines up with the
# encoder's classes, even when a class does not occur in the test split;
# `zero_division=0` reports 0 for such classes instead of emitting warnings.
print(
    classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

Accuracy: 0.7769
