In [2]:
# Install required dependencies
!pip install -q spacy gensim gradio scikit-learn imbalanced-learn lightgbm
!python -m spacy download en_core_web_md -q
!python -m nltk.downloader stopwords wordnet vader_lexicon -q

import pandas as pd
import numpy as np
import re
import spacy
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, make_scorer, f1_score # Import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import joblib
import gradio as gr

# Initialize components
nlp = spacy.load("en_core_web_md")

# ================== ADVANCED TEXT PROCESSING ==================
def contextual_text_processing(text):
    """Enhanced text cleaning with negation handling"""
    # Remove social media artifacts
    text = re.sub(r'http\S+|pic\.twitter\.com/\S+|@\w+|#\w+', '', text)

    # Process with spaCy
    doc = nlp(text.lower())

    # Context-aware token processing
    processed_tokens = []
    negation = False
    for token in doc:
        if token.dep_ == 'neg':
            negation = True
            continue
        if token.is_stop or not token.is_alpha or len(token) < 3:
            continue
        lemma = token.lemma_
        if negation:
            lemma = f"NOT_{lemma}"
            negation = False
        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

# ================== STRATEGIC FEATURE ENGINEERING ==================
def create_discriminative_features(df):
    """Features designed to reduce false positives"""
    df = df.copy()

    # 1. Disaster Context Indicators
    disaster_context = {
        'response_terms': r'\b(evacuate|emergency|alert|warning|rescue|aid|shelter)\b',
        'damage_terms': r'\b(damage|destroyed|collapsed|burning|flooded|crash)\b',
        'temporal_terms': r'\b(now|immediately|urgently|today|current)\b'
    }

    for name, pattern in disaster_context.items():
        df[name] = df['text'].str.contains(pattern, flags=re.IGNORECASE).astype(int)

    # 2. Location Correlation
    df['location_match'] = df.apply(
        lambda x: int(any(ent.text.lower() in str(x['location']).lower()  # Convert x['location'] to string
            for ent in nlp(x['text']).ents if ent.label_ == 'GPE')), axis=1)

    # 3. Social Media Patterns
    df['has_media'] = df['text'].str.contains(r'pic\.twitter\.com|imgur|\[video\]', case=False).astype(int)
    df['hashtag_count'] = df['text'].str.count(r'#\w+')

    return df

# ================== OPTIMIZED MODEL PIPELINE ==================
def create_balanced_model():
    """LightGBM model with optimized feature handling"""
    preprocessor = ColumnTransformer([ # This line was line 81, changed ( to [
        ('text_tfidf', TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 2),
            analyzer='word' # Added analyzer to avoid warning
         ),
         'clean_text'), # This line was previously inside the TfidfVectorizer call
        ('numeric_features', StandardScaler(), [
            'response_terms', 'damage_terms', 'temporal_terms',
            'location_match', 'has_media', 'hashtag_count'
        ])
    ]) # This line was line 89

    return make_pipeline(
        preprocessor,
        SMOTE(sampling_strategy=0.8),
        lgb.LGBMClassifier(
            objective='binary',
            class_weight='balanced',
            n_estimators=300,
            learning_rate=0.05,
            max_depth=5,
            reg_alpha=0.1,
            reg_lambda=0.1
        )
    )

# ================== MAIN EXECUTION ==================
if __name__ == "__main__":
    # Load and preprocess data
    train_df = pd.read_csv('/content/train.csv')
    train_df['clean_text'] = train_df['text'].apply(contextual_text_processing)
    train_df = create_discriminative_features(train_df)

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(
        train_df,
        train_df['target'],
        test_size=0.2,
        stratify=train_df['target'],
        random_state=42
    )

    # Create and tune model
    model = create_balanced_model()

    # Custom scoring metric
    f1_scorer = make_scorer(f1_score, average='weighted')

    # Optimized parameter grid
    param_grid = {
        'lgbmclassifier__learning_rate': [0.05, 0.1],
        'lgbmclassifier__num_leaves': [31, 63],
        'lgbmclassifier__reg_alpha': [0, 0.1]
    }

    # Stratified cross-validation
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=StratifiedKFold(n_splits=5),
        scoring=f1_scorer,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Validation
    best_model = grid_search.best_estimator_
    val_preds = best_model.predict(X_val)
    print("\n🔥 Optimized Validation Report:")
    print(classification_report(y_val, val_preds))

    # Save model
    joblib.dump(best_model, 'optimized_disaster_model.pkl')

    # Gradio Interface with Confidence Threshold
    def predict_disaster(text, location):
        clean_text = contextual_text_processing(text)
        features = pd.DataFrame([[text, location, clean_text]],
                               columns=['text', 'location', 'clean_text'])
        features = create_discriminative_features(features)
        proba = best_model.predict_proba(features)[0][1]

        # Threshold adjustment
        decision = "Disaster Alert!" if proba > 0.6 else "Non-disaster"
        return {
            "Prediction": decision,
            "Confidence": f"{proba:.2%}",
            "Key Indicators": [
                "Contains emergency terms" if features['response_terms'].iloc[0] else "",
                "Damage reported" if features['damage_terms'].iloc[0] else "",
                "Location verified" if features['location_match'].iloc[0] else ""
            ]
        }

    gr.Interface(
        fn=predict_disaster,
        inputs=[gr.Textbox(label="Tweet"), gr.Textbox(label="Location")],
        outputs=gr.JSON(),
        title="Advanced Disaster Detection",
        examples=[
            ["Forest fire spreading rapidly! Evacuation orders issued", "California"],
            ["Beautiful sunset over the mountains #nature", "Colorado"]
        ]
    ).launch(share=True)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


  df[name] = df['text'].str.contains(pattern, flags=re.IGNORECASE).astype(int)
  df[name] = df['text'].str.contains(pattern, flags=re.IGNORECASE).astype(int)
  df[name] = df['text'].str.contains(pattern, flags=re.IGNORECASE).astype(int)


Fitting 5 folds for each of 8 candidates, totalling 40 fits




[LightGBM] [Info] Number of positive: 2778, number of negative: 3473
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038592 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7525
[LightGBM] [Info] Number of data points in the train set: 6251, number of used features: 553
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000





🔥 Optimized Validation Report:
              precision    recall  f1-score   support

           0       0.79      0.87      0.83       869
           1       0.80      0.70      0.74       654

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://305d571a26c002a444.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
