In [3]:
import pandas as pd

# Location of the raw CSV; the doubled ".csv.csv" suffix is kept exactly as given.
RAW_CSV_PATH = "/content/drive/MyDrive/digiculture_raw_utf8.csv.csv"

# Read the whole file into one DataFrame and preview its first rows.
df = pd.read_csv(RAW_CSV_PATH)
df.head()

Unnamed: 0,id,lga,town_wards,food,clothing,dance,religion,festivals,music_instruments,language,...,leadership_label,lga_label,town_wards_label,food_label,clothing_label,dance_label,festivals_label,music_instruments_label,latitude,longitude
0,1,odeda,odeda,"pounded yam, amala, gari, efo riro, assorted s...","aso-oke, agbada (men), iro & buba and gele (wo...",regberegbe/age-grade style parades for major e...,"christianity, islam, and yoruba traditional pr...",local town/farm harvest days and participation...,"talking drum, bata, sekere, local praise-singi...",yoruba,...,88,12,193,97,13,113,81,112,7.216667,3.516667
1,2,odeda,osiele,"yam, cassava (gari), stews and soups typical o...",(aso-oke/ankara) for weddings/funerals; wester...,"community drum ensembles, funeral and wedding ...","christianity, islam, and yoruba traditional pr...","local town days, harvest gatherings; ties into...",talk/drum ensembles and recorded yoruba music ...,yoruba,...,86,12,239,131,0,31,76,100,7.216667,3.516667
2,3,odeda,oluga,"agrarian diet (yams, cassava, vegetables)  ma...",yoruba ceremonial attire is used for important...,community festivals and ceremonies feature loc...,"christianity, islam and indigenous practices p...","local kingdom/town events (e.g., oluga kingdom...","local drumming, praise singers; modern recorde...",yoruba,...,22,12,229,0,74,35,69,71,7.216667,3.516667
3,4,odeda,olugbo,"typical rural yoruba staples (yam, amala, soups)",aso-oke/iro & buba at ceremonies; modern wear ...,standard local drum dances and age-grade perfo...,"churches, mosques, traditional worship",local observances and participation in regiona...,"talking drum, bata and vocal praise tradition.",yoruba,...,117,12,230,127,17,120,71,109,7.216667,3.516667
4,5,odeda,baale ogun,standard egba/yoruba staples,typical yoruba ceremonial dress under local ch...,chieftaincy installation and age-grade dances ...,"mixed (christianity, islam, traditional).","baálè installation days, market days and harve...",drumming ensembles for rites and praise-singin...,yoruba,...,0,12,61,110,64,21,6,24,7.216667,3.516667


In [4]:
def _count_items(value):
    """Return how many non-empty comma-separated entries ``value`` contains.

    ``None`` and float NaN count as zero entries instead of being stringified
    (``str(nan)`` is ``'nan'``, which would otherwise be counted as one item).
    Blank fragments such as those produced by an empty string or a trailing
    comma are ignored as well.
    """
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return 0
    return sum(1 for item in str(value).split(',') if item.strip())


def assign_cultural_focus(row):
    """Return the cultural category with the most listed items for one row.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``food``, ``dance``, ``music_instruments``,
        ``festivals`` and ``religion``; each value is treated as a
        comma-separated list (missing values contribute zero items).

    Returns
    -------
    str
        One of "Food Heritage", "Music & Dance", "Festival & Ritual" or
        "Religious Diversity". "Music & Dance" sums the counts of two columns.
        Ties go to the category listed first in the score table below.
    """
    score = {
        "Food Heritage": _count_items(row['food']),
        "Music & Dance": _count_items(row['dance']) + _count_items(row['music_instruments']),
        "Festival & Ritual": _count_items(row['festivals']),
        "Religious Diversity": _count_items(row['religion']),
    }
    return max(score, key=score.get)

# Label every row: axis=1 hands each row to assign_cultural_focus and the
# returned category name is stored in the new 'cultural_focus' column.
df['cultural_focus'] = df.apply(assign_cultural_focus, axis=1)


In [5]:
def advisory_action(row):
    """Pick an advisory label from keyword matches in a row's text fields.

    The checks are case-insensitive and applied in this order:

    1. "festival" appears in ``row['festivals']``  -> "Promote"
    2. "traditional" appears in ``row['religion']`` -> "Preserve"
    3. neither                                      -> "Document"

    Values are passed through ``str()`` first, so non-string cells are
    matched against their string form.
    """
    festivals_text = str(row['festivals']).lower()
    if "festival" in festivals_text:
        return "Promote"

    religion_text = str(row['religion']).lower()
    if "traditional" in religion_text:
        return "Preserve"

    return "Document"

# Apply the keyword rules row by row and store the resulting label
# ("Promote" / "Preserve" / "Document") in the new 'advisory_action' column.
df['advisory_action'] = df.apply(advisory_action, axis=1)


In [6]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode both target columns. Each encoder is kept in its own variable
# so encoded predictions can be mapped back to their string labels later.
le_focus = LabelEncoder()
le_focus.fit(df['cultural_focus'])
df['cultural_focus_encoded'] = le_focus.transform(df['cultural_focus'])

le_action = LabelEncoder()
le_action.fit(df['advisory_action'])
df['advisory_action_encoded'] = le_action.transform(df['advisory_action'])


In [7]:
# Free-text columns that are merged into a single document per row.
text_columns = ['food', 'clothing', 'dance', 'religion', 'festivals', 'music_instruments']

# Missing cells become empty strings so the join never sees NaN; the columns
# are then concatenated left to right with single spaces.
text_frame = df[text_columns].fillna('')
df['combined_text'] = text_frame.agg(' '.join, axis=1)


In [8]:
from sklearn.model_selection import train_test_split

# Model inputs: the merged text plus the encoded location and coordinate columns.
feature_columns = ['combined_text', 'lga_label', 'town_wards_label', 'latitude', 'longitude']
X = df[feature_columns]

# Encoded targets for the two classifiers.
y_focus = df['cultural_focus_encoded']
y_action = df['advisory_action_encoded']

# 80/20 split, stratified on the cultural-focus target with a fixed seed.
# Only y_focus is split here.
X_train, X_test, y_focus_train, y_focus_test = train_test_split(
    X,
    y_focus,
    stratify=y_focus,
    test_size=0.2,
    random_state=42,
)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF over the merged text, capped at 500 features and
# using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=500,
)

# The vocabulary is learned from the training rows only; the test rows are
# projected onto that same vocabulary.
train_corpus = X_train['combined_text']
test_corpus = X_test['combined_text']
X_text_train = tfidf.fit_transform(train_corpus)
X_text_test = tfidf.transform(test_corpus)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): this cell repeats the previous cell line for line. It rebuilds
# `tfidf` with the same settings and refits it on the same training text, so
# one of the two cells can be removed.
tfidf = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 2),
    stop_words='english'
)

# Fit on the training text only, then reuse the vocabulary for the test text.
X_text_train = tfidf.fit_transform(X_train['combined_text'])
X_text_test = tfidf.transform(X_test['combined_text'])


In [11]:
from scipy.sparse import hstack

# Take the four numeric columns as plain arrays (no imputation at this point).
# NOTE(review): the missing-value check further down reports NaNs in latitude
# and longitude, and the later imputation cells rebuild X_*_numeric and
# X_*_final, so the matrices built here are overwritten before any model is fit.
X_train_numeric = X_train[['lga_label', 'town_wards_label', 'latitude', 'longitude']].values
X_test_numeric = X_test[['lga_label', 'town_wards_label', 'latitude', 'longitude']].values

# Append the numeric columns to the right of the sparse TF-IDF matrices.
X_train_final = hstack([X_text_train, X_train_numeric])
X_test_final = hstack([X_text_test, X_test_numeric])


In [13]:
# Count missing values per numeric feature across the full (unsplit) feature frame.
X[['lga_label', 'town_wards_label', 'latitude', 'longitude']].isna().sum()


Unnamed: 0,0
lga_label,0
town_wards_label,0
latitude,79
longitude,79


In [14]:
from sklearn.impute import SimpleImputer


In [15]:
numeric_features = ['lga_label', 'town_wards_label', 'latitude', 'longitude']

# Median imputation: the per-column medians are learned from the training rows
# only and then reused unchanged for the test rows.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train[numeric_features])

X_train_numeric = imputer.transform(X_train[numeric_features])
X_test_numeric = imputer.transform(X_test[numeric_features])


In [16]:
from scipy.sparse import hstack

# Rebuild the model matrices from the imputed numeric arrays: TF-IDF columns
# first, then the four numeric columns.
X_train_final = hstack([X_text_train, X_train_numeric])
X_test_final = hstack([X_text_test, X_test_numeric])


In [17]:
# `focus_model` is not created in any cell shown before this point, so on a
# fresh kernel this line would raise NameError. Create it here when it does not
# exist yet so the notebook runs top to bottom.
# NOTE(review): the estimator that produced the recorded metrics below is not
# visible. LogisticRegression is chosen to match `action_model` and because the
# app calls `predict_proba` on this model. Confirm, or swap in the intended one.
from sklearn.linear_model import LogisticRegression

if "focus_model" not in globals():
    focus_model = LogisticRegression(max_iter=1000)

focus_model.fit(X_train_final, y_focus_train)




In [18]:
from sklearn.metrics import classification_report, accuracy_score

# Score the cultural-focus classifier on the held-out rows.
y_focus_pred = focus_model.predict(X_test_final)

print("Accuracy:", accuracy_score(y_focus_test, y_focus_pred))
# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting the undefined-metric warning, matching the action-model report.
print(classification_report(y_focus_test, y_focus_pred, zero_division=0))


Accuracy: 0.7777777777777778
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.83      0.38      0.53        13
           2       0.76      0.97      0.85        32
           3       0.86      0.75      0.80         8

    accuracy                           0.78        54
   macro avg       0.61      0.53      0.54        54
weighted avg       0.78      0.78      0.75        54



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
from sklearn.linear_model import LogisticRegression

# Advisory-action classifier, trained on the same feature matrices as the
# cultural-focus model.
# NOTE(review): the 'advisory_action' labels are produced by keyword rules over
# the festivals/religion text, and that text is also part of 'combined_text',
# so the score below partly reflects recovering those rules. Confirm this is intended.
action_model = LogisticRegression(max_iter=1000, n_jobs=-1)

# The train/test split was made on X and y_focus only, so the matching action
# labels are looked up by the row index of each split.
train_idx, test_idx = X_train.index, X_test.index
y_action_train = df.loc[train_idx, 'advisory_action_encoded']
y_action_test = df.loc[test_idx, 'advisory_action_encoded']

action_model.fit(X_train_final, y_action_train)
y_action_pred = action_model.predict(X_test_final)

from sklearn.metrics import classification_report, accuracy_score
action_accuracy = accuracy_score(y_action_test, y_action_pred)
print("Action Model Accuracy:", action_accuracy)
print(classification_report(y_action_test, y_action_pred, zero_division=0))


Action Model Accuracy: 0.9074074074074074
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        10
           1       0.86      0.60      0.71        10
           2       0.89      1.00      0.94        34

    accuracy                           0.91        54
   macro avg       0.92      0.83      0.87        54
weighted avg       0.91      0.91      0.90        54



In [21]:
import joblib

# Persist every fitted object, one file per artifact, written to the current
# working directory in the order listed.
artifacts = {
    "cultural_focus_model.pkl": focus_model,
    "advisory_action_model.pkl": action_model,
    "tfidf_vectorizer.pkl": tfidf,
    "numeric_imputer.pkl": imputer,
    "focus_encoder.pkl": le_focus,
    "action_encoder.pkl": le_action,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("All models and preprocessing objects saved successfully!")


All models and preprocessing objects saved successfully!


In [23]:
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.52.2-py3-none-any.whl.metadata (9.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.52.2


In [24]:
import streamlit as st
# Confirm which Streamlit release the kernel actually imported.
print(f"Streamlit version: {st.__version__}")


Streamlit version: 1.52.2


In [26]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from scipy.sparse import hstack

# Page configuration is issued before any other Streamlit call.
st.set_page_config(page_title="Cultural Advisory System", layout="wide")


@st.cache_resource
def load_artifacts():
    """Load the saved models and preprocessing objects once per server process.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects avoids unpickling all six files on each rerun. The cached
    objects are shared between reruns, so they must only be read, never mutated.

    Returns
    -------
    tuple
        ``(focus_model, action_model, tfidf, imputer, le_focus, le_action)``.
    """
    # joblib.load unpickles arbitrary objects: only point these paths at
    # artifact files you produced yourself.
    return (
        joblib.load("cultural_focus_model.pkl"),
        joblib.load("advisory_action_model.pkl"),
        joblib.load("tfidf_vectorizer.pkl"),
        joblib.load("numeric_imputer.pkl"),
        joblib.load("focus_encoder.pkl"),
        joblib.load("action_encoder.pkl"),
    )


# Load saved models & preprocessing
focus_model, action_model, tfidf, imputer, le_focus, le_action = load_artifacts()

st.title("AI-Based Cultural Advisory System")
st.write("Predict **Cultural Focus** and **Advisory Action** based on input data.")

# Input Section
st.header("Enter Cultural & Geographic Data")

# One free-text box per cultural attribute, keyed by its column name and
# rendered in the order listed.
text_inputs = {
    col: st.text_area(f"{col.capitalize()} (comma-separated)", "")
    for col in ['food', 'clothing', 'dance', 'religion', 'festivals', 'music_instruments']
}

# Encoded location identifiers (non-negative) and coordinates shown to six decimals.
lga_label = st.number_input("LGA Label", min_value=0)
town_wards_label = st.number_input("Town/Wards Label", min_value=0)
latitude = st.number_input("Latitude", format="%.6f")
longitude = st.number_input("Longitude", format="%.6f")

def preprocess_input(text_inputs, lga, ward, lat, lon):
    """Turn one set of form values into a single-row feature matrix.

    Parameters
    ----------
    text_inputs : mapping
        Text per attribute; must contain the keys 'food', 'clothing', 'dance',
        'religion', 'festivals' and 'music_instruments'.
    lga, ward, lat, lon : number or None
        Numeric inputs, in the column order the imputer receives them.
        ``None`` becomes NaN and is filled in by the imputer.

    Returns
    -------
    scipy sparse matrix
        One row: the TF-IDF features followed by the four numeric features.
    """
    text_columns = ['food', 'clothing', 'dance', 'religion', 'festivals', 'music_instruments']
    combined_text = " ".join(text_inputs[c] for c in text_columns)
    text_features = tfidf.transform([combined_text])

    # Pass a DataFrame carrying the numeric column names used elsewhere in this
    # project. A bare ndarray makes scikit-learn warn about missing feature
    # names when the imputer was fitted on a named DataFrame.
    numeric_frame = pd.DataFrame(
        [[lga, ward, lat, lon]],
        columns=['lga_label', 'town_wards_label', 'latitude', 'longitude'],
        dtype=float,
    )
    numeric_features = imputer.transform(numeric_frame)
    return hstack([text_features, numeric_features])

def _predict_with_confidence(model, encoder, features):
    """Return ``(decoded label, highest class probability)`` for one input row."""
    encoded = model.predict(features)[0]
    confidence = model.predict_proba(features)[0].max()
    label = encoder.inverse_transform([encoded])[0]
    return label, confidence


if st.button("Get Advisory"):
    input_features = preprocess_input(text_inputs, lga_label, town_wards_label, latitude, longitude)

    # Both classifiers score the same feature row: cultural focus first, then advisory action.
    focus_pred, focus_pred_proba = _predict_with_confidence(focus_model, le_focus, input_features)
    action_pred, action_pred_proba = _predict_with_confidence(action_model, le_action, input_features)

    st.subheader("Prediction Results")
    st.markdown(f"**Cultural Focus:** {focus_pred} (Confidence: {focus_pred_proba:.2f})")
    st.markdown(f"**Advisory Action:** {action_pred} (Confidence: {action_pred_proba:.2f})")


Writing app.py
