In [1]:
import os
import re
import warnings
from functools import lru_cache

import joblib
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# =============================================
# NLTK DATA SETUP WITH ROBUST ERROR HANDLING
# =============================================
def _nltk_resource_present(resource):
    """Return True if an NLTK resource is found, unpacked or as a .zip archive.

    Some packages (wordnet in particular) may exist on disk only as
    ``<name>.zip``; looking up the bare name can then raise LookupError even
    though the data is installed, so both spellings are tried.
    """
    for candidate in (resource, f'{resource}.zip'):
        try:
            nltk.data.find(candidate)
            return True
        except LookupError:
            continue
    return False


def setup_nltk():
    """Make sure the NLTK data used by this notebook is available.

    Ensures ``~/nltk_data`` exists and is on ``nltk.data.path``, then checks
    the ``punkt`` tokenizer and the ``stopwords`` / ``wordnet`` corpora and
    downloads whichever is missing. Progress is printed; nothing is returned,
    and a failed download is reported instead of raised.
    """
    # Set the NLTK data path explicitly
    nltk_data_path = os.path.join(os.path.expanduser('~'), 'nltk_data')
    os.makedirs(nltk_data_path, exist_ok=True)

    # Guard the append so re-running this cell does not add duplicate entries.
    if nltk_data_path not in nltk.data.path:
        nltk.data.path.append(nltk_data_path)

    # List of required NLTK packages
    # NOTE(review): recent NLTK releases appear to look up 'punkt_tab' for
    # word_tokenize -- confirm against the installed NLTK version.
    required_nltk = ['punkt', 'stopwords', 'wordnet']

    for package in required_nltk:
        category = 'tokenizers' if package == 'punkt' else 'corpora'
        if _nltk_resource_present(f'{category}/{package}'):
            print(f"NLTK {package} already installed")
            continue

        print(f"Downloading NLTK {package}...")
        # nltk.download() reports most failures by returning False rather than
        # raising, so the return value decides whether it really succeeded.
        try:
            downloaded = nltk.download(package, download_dir=nltk_data_path)
        except Exception as e:
            print(f"Failed to download {package}: {str(e)}")
            downloaded = False

        if downloaded:
            print(f"Successfully downloaded {package}")
            continue

        # Fallback: Try downloading without specifying directory
        try:
            fallback_ok = nltk.download(package)
        except Exception as e:
            print(f"Fallback download of {package} failed: {str(e)}")
            fallback_ok = False

        if fallback_ok:
            print(f"Used fallback method for {package}")
        else:
            print(f"Could not download NLTK {package}; preprocessing will fall back where possible")

# Run the setup so the tokenizer and corpus data are checked (and fetched if
# missing) before any NLTK component is used.
setup_nltk()

# Now import the NLTK components that preprocess_text relies on:
# stop-word list, tokenizer and lemmatizer.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


NLTK punkt already installed
NLTK stopwords already installed
Downloading NLTK wordnet...
Successfully downloaded wordnet


[nltk_data] Downloading package wordnet to C:\Users\USER\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load both comment datasets and stack them into a single frame.
try:
    df1 = pd.read_csv('data/reddit_comments_clean.csv')
    df2 = pd.read_csv('data/self_collected_war_subreddit_comments.csv')
    df = pd.concat([df1, df2], ignore_index=True)
    print("CSV loaded successfully")
except Exception as e:
    print(f"Error loading CSV: {str(e)}")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which throws away all state and hides the traceback.
    raise

# Verify required columns exist
required_columns = ['text', 'label']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Raising stops "Run All" at this cell with a clear message while keeping
    # the kernel (and the loaded frames) alive for inspection.
    raise ValueError(
        f"Missing required columns. Needed: {required_columns}, Found: {df.columns.tolist()}"
    )

CSV loaded successfully


In [4]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset (empty if unavailable).

    Cached so the corpus is read once instead of once per preprocessed text.
    """
    try:
        return frozenset(stopwords.words('english'))
    except Exception as e:
        warnings.warn(f"Stop words unavailable, keeping all tokens: {e}")
        return frozenset()


def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase and strip, remove every character that is not an ASCII
    letter or whitespace, tokenize, drop English stop words, lemmatize.
    Tokenization, stop-word lookup and lemmatization each degrade gracefully
    (whitespace split / no filtering / no lemmatization) and emit a warning
    when their NLTK component fails.

    Args:
        text: The raw text. Anything that is not a non-blank ``str`` yields "".

    Returns:
        The cleaned text, or "" for blank/non-string input or when an
        unexpected error occurs (the error is printed).
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        # Basic cleaning
        text = text.lower().strip()
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize with fallback. `except Exception` (not a bare except) so
        # KeyboardInterrupt can still stop a long-running .apply().
        try:
            tokens = word_tokenize(text)
        except Exception as e:
            warnings.warn(f"word_tokenize failed, using whitespace split: {e}")
            tokens = text.split()  # Simple fallback

        # Filter stopwords (cached set; empty if the corpus is unavailable)
        stop_words = _english_stop_words()
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatization with fallback: on failure the unlemmatized tokens are kept
        try:
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
        except Exception as e:
            warnings.warn(f"Lemmatization skipped: {e}")

        return ' '.join(tokens)
    except Exception as e:
        print(f"Error processing text: {str(e)}")
        return ""

# Clean every raw comment and store the result in a new column
print("Preprocessing text...")
df['processed_text'] = df['text'].map(preprocess_text)

Preprocessing text...


In [5]:
# Map existing labels to our target categories
label_mapping = {
    'with israel': 'pro-israel',
    'with palestine': 'pro-palestine',
    'neutral': 'neutral',
    'inquisitive': 'neutral',  # Assuming inquisitive is neutral
    'indifferent': 'neutral'   # Assuming indifferent is neutral
}
df['sentiment'] = df['label'].map(label_mapping)

# Labels missing from label_mapping come back as NaN. Report them before
# dropping so unexpected labels do not vanish from the data unnoticed.
unmapped_mask = df['sentiment'].isna()
if unmapped_mask.any():
    print(f"Dropping {int(unmapped_mask.sum())} rows with unmapped labels:")
    print(df.loc[unmapped_mask, 'label'].value_counts(dropna=False))

# Drop rows where mapping resulted in NaN (if any unexpected labels exist)
df = df.dropna(subset=['sentiment'])

# Check class distribution
print("\nClass distribution:")
print(df['sentiment'].value_counts())


Class distribution:
sentiment
pro-israel       15309
pro-palestine    11998
neutral           8136
Name: count, dtype: int64


In [6]:
# Features are the cleaned comments; targets are the mapped sentiment classes
X, y = df['processed_text'], df['sentiment']

# Hold out 20% for testing with a fixed seed; stratifying on y maintains the
# class distribution in both parts
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")


Training samples: 28354
Test samples: 7089


In [7]:
# Create a pipeline with TF-IDF and Logistic Regression
print("\nCreating model pipeline...")
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),   # unigrams and bigrams
        max_features=5000,    # cap the vocabulary size
        min_df=5,             # ignore terms seen in fewer than 5 documents
        max_df=0.7            # ignore terms seen in more than 70% of documents
    )),
    ('clf', LogisticRegression(
        # multi_class is deliberately not passed: the parameter is deprecated
        # since scikit-learn 1.5, and with the lbfgs solver a multiclass target
        # is already fitted as a multinomial model by default.
        solver='lbfgs',
        max_iter=1000,
        class_weight='balanced'  # Handle class imbalance
    ))
])


Creating model pipeline...


In [8]:
# Fit the whole pipeline on the training split only; the test split stays
# untouched until evaluation.
print("Training model...")
pipeline.fit(X_train, y_train)
print("Training completed!")

Training model...




Training completed!


In [9]:
# Score the held-out split
print("\nEvaluating model...")
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

# Rows are the actual classes, columns the predicted ones
print("\nConfusion Matrix:")
confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion)


Evaluating model...

Classification Report:
               precision    recall  f1-score   support

      neutral       0.50      0.57      0.53      1627
   pro-israel       0.61      0.69      0.65      3062
pro-palestine       0.56      0.41      0.48      2400

     accuracy                           0.57      7089
    macro avg       0.56      0.56      0.55      7089
 weighted avg       0.57      0.57      0.56      7089


Confusion Matrix:
Predicted      neutral  pro-israel  pro-palestine
Actual                                           
neutral            935         381            311
pro-israel         486        2100            476
pro-palestine      449         956            995


In [10]:
# Persist the fitted pipeline to disk
model_filename = 'israel_palestine_sentiment_model.pkl'
print("\nSaving model...")
joblib.dump(pipeline, model_filename)
print(f"Model saved as '{model_filename}'")


Saving model...
Model saved as 'israel_palestine_sentiment_model.pkl'


In [11]:
def predict_sentiment(text, model=None):
    """Predict the stance label of a piece of text.

    Args:
        text: Raw text; it is cleaned with ``preprocess_text`` first.
        model: Optional fitted estimator exposing ``predict``. When omitted,
            the model is loaded from 'israel_palestine_sentiment_model.pkl'
            on every call, falling back to the in-memory ``pipeline`` if
            loading fails. Pass an already-loaded model when predicting many
            texts to avoid the repeated disk read.

    Returns:
        The predicted label, or "neutral" when the text is empty after
        preprocessing or when prediction fails (the failure is printed).
    """
    # Preprocess first: empty input needs no model, so skip the load entirely.
    processed_text = preprocess_text(text)
    if not processed_text.strip():
        return "neutral"  # Default for empty text

    if model is None:
        try:
            # NOTE: joblib.load unpickles the file, which can execute arbitrary
            # code -- only load model files from a trusted source.
            model = joblib.load('israel_palestine_sentiment_model.pkl')
        except Exception as e:
            print(f"Error loading model ({e}). Using pipeline from memory.")
            model = pipeline

    try:
        return model.predict([processed_text])[0]
    except Exception as e:
        # Keep the best-effort fallback, but surface why it was needed.
        print(f"Prediction failed ({e}); falling back to 'neutral'.")
        return "neutral"  # Fallback prediction

In [12]:
# Load the saved pipeline back from disk into `model`.
# NOTE: joblib.load unpickles the file; only load model files you trust.
model = joblib.load('israel_palestine_sentiment_model.pkl')

In [13]:
# Hand-written sanity-check sentences, grouped by the label each is expected
# to receive.

# --- Pro-Israel Texts ---
pro_israel_texts = [
    "I stand with Israel in their right to defend themselves",
    "The IDF is protecting their citizens",
    "Israel has the right to exist as a Jewish state",
    "Israel has the right to defend its borders.",
    "Hamas is a terrorist organization targeting civilians.",
    "Supporting Israel is supporting democracy in the Middle East.",
    "The Iron Dome saves countless Israeli lives.",
    "Criticism of Israel often masks antisemitism.",
    "Israeli citizens live under constant rocket threat.",
    "The Jewish people have a historical right to this land.",
    "IDF operations aim to eliminate terrorist threats.",
    "Israel withdrew from Gaza, yet rockets still fly.",
    "The UN is biased against Israel in its resolutions."
]

# --- Pro-Palestine Texts ---
pro_palestine_texts = [
    "Free Palestine from occupation",
    "Palestinians deserve equal rights and freedom",
    "End the occupation of Palestinian territories",
    "Israel's blockade has devastated Gaza's economy.",
    "The occupation must end for peace to begin.",
    "Palestinian families are being evicted from their homes.",
    "The West Bank is under illegal military control.",
    "Free Gaza from siege and suffering.",
    "Palestinian children deserve safety and education.",
    "The wall separates families and stifles lives.",
    "The Nakba is an ongoing tragedy for Palestinians.",
    "Settlements violate international law.",
    "We must stand against apartheid policies."
]

# --- Neutral Texts ---
neutral_texts = [
    "This is a neutral comment about the situation",
    "The conflict is complex with valid arguments on both sides",
    "Both sides have suffered greatly in this conflict.",
    "Dialogue and understanding are essential for peace.",
    "The conflict has a long and complex history.",
    "Civilians on both sides deserve protection.",
    "International law should guide the resolution process.",
    "It's important to listen to all voices in this debate.",
    "War affects everyone, not just combatants.",
    "Social media often simplifies complex issues.",
    "Peace will require compromise from both parties.",
    "We must seek truth before taking sides."
]

# Expected labels, built in the same order as the concatenated texts
sample_texts = pro_israel_texts + pro_palestine_texts + neutral_texts
true_labels = (
    ["pro-israel"] * len(pro_israel_texts)
    + ["pro-palestine"] * len(pro_palestine_texts)
    + ["neutral"] * len(neutral_texts)
)

# --- Predict and evaluate ---
predicted_labels = []

print("\nSample Predictions:")
for text in sample_texts:
    # Pass the already-loaded `model`: without it predict_sentiment re-reads
    # the pickle from disk for every single text.
    prediction = predict_sentiment(text, model=model)
    predicted_labels.append(prediction)
    print(f"Text: '{text[:50]}...' -> {prediction}")

# --- Accuracy ---
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Sample Predictions:
Text: 'I stand with Israel in their right to defend thems...' -> pro-israel
Text: 'The IDF is protecting their citizens...' -> pro-palestine
Text: 'Israel has the right to exist as a Jewish state...' -> pro-israel
Text: 'Israel has the right to defend its borders....' -> pro-israel
Text: 'Hamas is a terrorist organization targeting civili...' -> pro-israel
Text: 'Supporting Israel is supporting democracy in the M...' -> pro-israel
Text: 'The Iron Dome saves countless Israeli lives....' -> pro-israel
Text: 'Criticism of Israel often masks antisemitism....' -> pro-palestine
Text: 'Israeli citizens live under constant rocket threat...' -> pro-israel
Text: 'The Jewish people have a historical right to this ...' -> pro-palestine
Text: 'IDF operations aim to eliminate terrorist threats....' -> pro-israel
Text: 'Israel withdrew from Gaza, yet rockets still fly....' -> pro-israel
Text: 'The UN is biased against Israel in its resolutions...' -> pro-israel
Text: 'Free Palest

In [14]:
# Edge case: an empty string has nothing to classify, so predict_sentiment
# returns its "neutral" default.
test_case = ""
print(f"Text: '{test_case[:50]}...' -> {predict_sentiment(test_case)}")

Text: '...' -> neutral
