### Research questions
1. prediction problem (classification) and feature weights - simply predict the next sentiment based on generated features, and analyse which feature contributes the most. examples: user-id (same thing) date (time), text (key words).
2. incoperate with LLM to give explanations of why the text is classified as given sentiment.
3. efficient forecasting over large datasets, create a basic model, and compared two ways of processing data. 1, deploy locally and use naive python packages. 2, utilize

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import multiprocessing
import time
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michelletong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michelletong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/michelletong/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
def load_and_preprocess_data(file_path):
    """
    Load and preprocess the dataset
    """
    # Load the dataset
    df = pd.read_csv(file_path, encoding='latin-1', header=None)
    df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
    
    # Convert sentiment to binary (0: negative, 1: positive)
    # Assuming sentiment values are 0 and 4 in the original dataset
    df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})
    
    # Convert date to datetime
    df['date'] = pd.to_datetime(df['date'], format='%a %b %d %H:%M:%S PDT %Y')
    
    # Extract basic features from text
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()
    df['hashtag_count'] = df['text'].str.count(r'#')
    df['mention_count'] = df['text'].str.count(r'@')
    df['url_count'] = df['text'].str.count(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\\\\\\\(\\\\\\\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    
    # Extract time-based features
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    
    return df

def clean_text(text):
    """
    Clean the text data by removing @mentions, URLs, hashtags, punctuation
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs - more comprehensive pattern
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove @mentions - more comprehensive pattern
    text = re.sub(r'@[\w_]+', '', text)
    
    # Remove hashtags (but keep the text after #)
    text = re.sub(r'#(\w+)', r'\1', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove digits (optional - uncomment if needed)
    # text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace (including newlines)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


In [3]:
def debug_text_cleaning(text):
    """
    Function to debug text cleaning process
    """
    print("Original:", text)
    
    # Test URL removal
    text_no_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    print("After URL removal:", text_no_urls)
    
    # Test @mention removal
    text_no_mentions = re.sub(r'@[\w_]+', '', text_no_urls)
    print("After @mention removal:", text_no_mentions)
    
    # Test hashtag conversion
    text_no_hashtags = re.sub(r'#(\w+)', r'\1', text_no_mentions)
    print("After hashtag conversion:", text_no_hashtags)
    
    # Test punctuation removal
    text_no_punct = re.sub(r'[^\w\s]', '', text_no_hashtags)
    print("After punctuation removal:", text_no_punct)
    
    # Test whitespace cleaning
    text_clean = re.sub(r'\s+', ' ', text_no_punct).strip()
    print("Final cleaned:", text_clean)
    
    return text_clean


In [4]:
def create_visualizations(df):
    # 1. Sentiment Distribution
    plt.figure(figsize=(8, 6))
    sns.countplot(x='sentiment', data=df)
    plt.title('Distribution of Sentiments')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.xticks([0, 1], ['Negative', 'Positive'])
    plt.savefig('fig/sentiment_distribution.png')
    plt.close()
    
    # 2. Text Length Distribution by Sentiment
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='sentiment', y='text_length', data=df)
    plt.title('Text Length Distribution by Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Text Length')
    plt.xticks([0, 1], ['Negative', 'Positive'])
    plt.savefig('fig/text_length_distribution.png')
    plt.close()
    
    # 3. Time-based Analysis
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    
    # Hourly distribution
    sns.countplot(x='hour', hue='sentiment', data=df, ax=axes[0])
    axes[0].set_title('Tweets by Hour of Day')
    axes[0].set_xlabel('Hour')
    axes[0].set_ylabel('Count')
    axes[0].legend(title='Sentiment', labels=['Negative', 'Positive'])
    
    # Day of week distribution
    sns.countplot(x='day_of_week', hue='sentiment', data=df, ax=axes[1])
    axes[1].set_title('Tweets by Day of Week')
    axes[1].set_xlabel('Day of Week (0=Monday)')
    axes[1].set_ylabel('Count')
    axes[1].legend(title='Sentiment', labels=['Negative', 'Positive'])
    
    # Monthly distribution
    sns.countplot(x='month', hue='sentiment', data=df, ax=axes[2])
    axes[2].set_title('Tweets by Month')
    axes[2].set_xlabel('Month')
    axes[2].set_ylabel('Count')
    axes[2].legend(title='Sentiment', labels=['Negative', 'Positive'])
    
    plt.tight_layout()
    plt.savefig('fig/time_based_analysis.png')
    plt.close()
    
    # 4. Feature Correlation Analysis
    plt.figure(figsize=(10, 8))
    correlation_matrix = df[['sentiment', 'text_length', 'word_count', 'hashtag_count', 'mention_count', 'url_count']].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    plt.savefig('fig/feature_correlation.png')
    plt.close()
    
    # 5. Word Clouds for Positive and Negative Tweets
    def generate_wordcloud(text, title, filename):
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.savefig(filename)
        plt.close()
    
    # Generate word clouds for positive and negative tweets
    positive_text = ' '.join(df[df['sentiment'] == 1]['text'])
    negative_text = ' '.join(df[df['sentiment'] == 0]['text'])
    
    generate_wordcloud(positive_text, 'Word Cloud for Positive Tweets', 'positive_wordcloud.png')
    generate_wordcloud(negative_text, 'Word Cloud for Negative Tweets', 'negative_wordcloud.png')


In [39]:
def engineer_features(df):
    """
    Engineer features based on the project outline
    """
    # 1. User-based Features
    
    # Group by user and calculate statistics
    user_stats = df.groupby('user')['sentiment'].agg(['mean', 'count', 'std']).reset_index()
    
    # Calculate correct std with n-1 denominator
    def adjusted_std(group):
        if len(group) <= 1:
            return 0
        return np.std(group, ddof=1)  # ddof=1 uses n-1 denominator
    
    user_sentiment_std = df.groupby('user')['sentiment'].apply(adjusted_std)
    user_stats['std'] = user_stats['user'].map(user_sentiment_std)
    
    # Handle case where a user has only one tweet (std is NaN)
    user_stats['std'] = user_stats['std'].fillna(0)
    
    user_stats.columns = ['user', 'user_avg_sentiment', 'user_tweet_count', 'user_sentiment_std']
    
    # Merge user stats back to main dataframe
    df = pd.merge(df, user_stats, on='user', how='left')
    
    # Calculate average posting gap time for each user
    df = df.sort_values(['user', 'date'])
    
    # Function to calculate average time between posts
    def calc_avg_gap(group):
        if len(group) <= 1:
            return pd.Timedelta(0)
        gaps = group['date'].diff().dropna()
        return gaps.mean()
    
    # Calculate average gap for each user
    avg_gaps = df.groupby('user').apply(calc_avg_gap)
    avg_gaps_seconds = avg_gaps.dt.total_seconds()
    avg_gaps_df = pd.DataFrame({
        'user': avg_gaps.index, 
        'avg_posting_gap_seconds': avg_gaps_seconds.values
    })
    
    # Merge gaps back to main dataframe
    df = pd.merge(df, avg_gaps_df, on='user', how='left')
    df['avg_posting_gap_seconds'] = df['avg_posting_gap_seconds'].fillna(0)
    
    # 2. Text Processing Features
    
    # Apply text cleaning
    df['clean_text'] = df['text'].apply(clean_text)
    
    # Create tokenized text for Word2Vec
    df['tokens'] = df['clean_text'].apply(word_tokenize)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
    
    # These will be one-hot encoded later
    df['hour_cat'] = df['hour'].astype('category')
    df['day_of_week_cat'] = df['day_of_week'].astype('category') 
    df['month_cat'] = df['month'].astype('category')

    return df

def prepare_features(df, w2v_df):
    """
    Prepare features with proper preprocessing
    """
    # Identify categorical and numerical features
    categorical_features = ['hour_cat', 'day_of_week_cat', 'month_cat']
    numerical_features = ['text_length', 'word_count', 'hashtag_count', 
                         'mention_count', 'url_count', 'user_tweet_count', 
                         'user_sentiment_std', 'avg_posting_gap_seconds']
    
    # Create column transformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(drop='first'), categorical_features)
        ])
    
    # Extract features from DataFrame
    X = df[numerical_features + categorical_features]
    
    # Preprocess the features
    X_preprocessed = preprocessor.fit_transform(X)
    
    # Combine with Word2Vec features (which may need their own scaling)
    w2v_scaled = StandardScaler().fit_transform(w2v_df)
    
    # Convert to sparse matrices if needed for efficiency
    from scipy import sparse
    if sparse.issparse(X_preprocessed):
        X_combined = sparse.hstack([X_preprocessed, w2v_scaled])
    else:
        X_combined = np.hstack([X_preprocessed, w2v_scaled])
    
    return X_combined, preprocessor


def extract_word2vec_features(df, vector_size=50, min_count=2):
    """
    Extract Word2Vec features
    """
    # Train Word2Vec model
    all_tokens = df['tokens'].tolist()

      # Train with more context window and more training iterations
    w2v_model = Word2Vec(
        sentences=all_tokens,
        vector_size=vector_size,
        window=8,          # Larger context window
        min_count=min_count, # Ignore rare words
        workers=4,
        sg=1,              # Use skip-gram
        epochs=20          # More training iterations
    )
    
    # Function to get document vectors by averaging word vectors
    def get_doc_vector(tokens):
        vec = np.zeros(vector_size)
        count = 0
        for word in tokens:
            try:
                vec += w2v_model.wv[word]
                count += 1
            except KeyError:
                # Word not in vocabulary
                continue
        if count > 0:
            vec /= count
        return vec
    
    # Get document vectors
    
    doc_vectors = np.array(df['tokens'].apply(get_doc_vector).tolist())
    w2v_df = pd.DataFrame(
        doc_vectors,
        columns=[f'w2v_{i}' for i in range(vector_size)]
    )
    # doc_vectors = df['tokens'].apply(get_doc_vector)
    # w2v_df = pd.DataFrame(doc_vectors, columns=['word2vec_embedding'])

    return w2v_df, w2v_model

def select_features(features_df, n_components=20):
    """
    Perform PCA for feature selection
    """
    # Initialize PCA
    pca = PCA(n_components=n_components)
    
    # Fit and transform
    pca_features = pca.fit_transform(features_df)
    
    # Convert to DataFrame
    pca_df = pd.DataFrame(
        pca_features, 
        columns=[f'pca_{i}' for i in range(n_components)]
    )
    
    # Calculate explained variance ratio
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    
    # Print variance explanation
    print(f"Top 10 components explain {cumulative_variance[9]:.2%} of variance")
    print(f"All {n_components} components explain {cumulative_variance[-1]:.2%} of variance")
    
    return pca_df, pca

def train_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate classification models
    """
    # Define base models
    # Best SVM parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
    # Best Random Forest parameters: {'max_depth': 20, 'n_estimators': 200}
    # Best XGBoost parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}

    svm = SVC(probability=False, kernel='rbf', random_state=42, C=10, gamma='auto')
    rf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)
    xgb = GradientBoostingClassifier(n_estimators=200, random_state=42, learning_rate=0.1, max_depth=3) 
    
    # Train individual models
    models = {
        'SVM': svm,
        'Random Forest': rf,
        'XGBoost': xgb
    }
    
    results = {}
    
    for name, model in models.items():
        print(f"Training {name}...")
        t0 = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        
        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'confusion_matrix': conf_matrix,
            'classification_report': class_report
        }
        
        print(f"Training {name} took {time.time() - t0:.2f} seconds")
        print(f"{name} Accuracy: {accuracy:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")
        print(f"Classification Report:\n{class_report}")
        print("="*50)
    
    # Create Voting Ensemble (majority voting)
    voting_clf = VotingClassifier(
        estimators=[('svm', svm), ('rf', rf), ('xgb', xgb)],
        voting='hard'  # Majority voting
    )
    
    print("Training Ensemble (Majority Voting)...")
    t0 = time.time()
    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    results['Ensemble'] = {
        'model': voting_clf,
        'accuracy': accuracy,
        'confusion_matrix': conf_matrix,
        'classification_report': class_report
    }
    
    print("Training Ensemble took %0.2f seconds" % (time.time() - t0))
    print(f"Ensemble Accuracy: {accuracy:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{class_report}")
    
    return results

def perform_cross_validation(X, y, df):
    """
    Perform time-based and user-based cross-validation
    """
    # Time-based Cross Validation
    print("Performing Time-based Cross Validation")
    t0 = time.time()
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Define base models, use voting ensemble for cross validation
    svm = SVC(probability=False, kernel='rbf', random_state=42, C=10, gamma='auto')
    rf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)
    xgb = GradientBoostingClassifier(n_estimators=200, random_state=42, learning_rate=0.1, max_depth=3) 
    # Create Voting Ensemble (majority voting)
    voting_clf = VotingClassifier(
        estimators=[('svm', svm), ('rf', rf), ('xgb', xgb)],
        voting='hard'  # Majority voting
    )
    
    time_scores = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        voting_clf.fit(X_train, y_train)
        y_pred = voting_clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        time_scores.append(score)
    
    print(f"Time-based CV took {time.time() - t0:.2f} seconds")
    print(f"Time-based CV Scores: {time_scores}")
    print(f"Mean Time-based CV Score: {np.mean(time_scores):.4f}")
    
    # User-based Cross Validation
    print("\nPerforming User-based Cross Validation")
    t0 = time.time()
    user_groups = df['user'].astype('category').cat.codes.values
    gkf = GroupKFold(n_splits=5)
    
    user_scores = []
    for train_index, test_index in gkf.split(X, y, groups=user_groups):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        voting_clf.fit(X_train, y_train)
        y_pred = voting_clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        user_scores.append(score)
    
    print(f"User-based CV took {time.time() - t0:.2f} seconds")
    print(f"User-based CV Scores: {user_scores}")
    print(f"Mean User-based CV Score: {np.mean(user_scores):.4f}")
    
    return time_scores, user_scores

def analyze_misclassified_examples(df, X_test, y_test, model, idx_test):
    """
    Analyze misclassified examples
    """
    y_pred = model.predict(X_test)
    misclassified_idx = idx_test[y_pred != y_test]
    
    misclassified_df = df.iloc[misclassified_idx].copy()
    misclassified_df['predicted_sentiment'] = y_pred[y_pred != y_test]
    
    print(f"Number of misclassified examples: {len(misclassified_df)}")
    
    # Analyze by features
    print("\nMisclassification Analysis by Features:")
    
    # By text length
    print("\nBy Text Length:")
    bins = [0, 50, 100, 150, 200, np.inf]
    labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
    misclassified_df['text_length_bin'] = pd.cut(misclassified_df['text_length'], bins=bins, labels=labels)
    print(misclassified_df['text_length_bin'].value_counts(normalize=True).sort_index())
    
    # By user tweet count
    print("\nBy User Tweet Count:")
    bins = [0, 5, 10, 20, 50, np.inf]
    labels = ['Very Few', 'Few', 'Average', 'Many', 'Very Many']
    misclassified_df['user_tweet_count_bin'] = pd.cut(misclassified_df['user_tweet_count'], bins=bins, labels=labels)
    print(misclassified_df['user_tweet_count_bin'].value_counts(normalize=True).sort_index())
    
    # By time of day
    print("\nBy Hour of Day:")
    hour_bins = [0, 6, 12, 18, 24]
    hour_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
    misclassified_df['hour_bin'] = pd.cut(misclassified_df['hour'], bins=hour_bins, labels=hour_labels)
    print(misclassified_df['hour_bin'].value_counts(normalize=True).sort_index())
    
    # Sample of misclassified examples
    print("\nSample of Misclassified Examples:")
    sample = misclassified_df.sample(min(5, len(misclassified_df)))
    for _, row in sample.iterrows():
        print(f"Text: {row['text']}")
        print(f"True Sentiment: {row['sentiment']}, Predicted: {row['predicted_sentiment']}")
        print("-" * 50)
    
    return misclassified_df

def visualize_results(df, results, pca, feature_names):
    """
    Create visualizations for the analysis
    """
    # 1. PCA Explained Variance
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_)
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.title('Explained Variance by Principal Component')
    plt.tight_layout()
    plt.savefig('fig/pca_variance.png')
    plt.close()
    
    # 2. Feature Importance from PCA loadings
    plt.figure(figsize=(12, 8))
    # Get most important features from first component
    component = 0
    loadings = pd.Series(abs(pca.components_[component]), index=feature_names)
    top_features = loadings.nlargest(15)
    
    sns.barplot(x=top_features.values, y=top_features.index)
    plt.title(f'Top 15 Feature Importances (PC {component+1})')
    plt.xlabel('Absolute Loading Value')
    plt.tight_layout()
    plt.savefig('fig/feature_importance.png')
    plt.close()
    
    # 3. Sentiment Distribution by Time of Day
    plt.figure(figsize=(10, 6))
    hour_counts = df.groupby(['hour', 'sentiment']).size().unstack()
    hour_counts.plot(kind='bar', stacked=True)
    plt.title('Sentiment Distribution by Hour of Day')
    plt.xlabel('Hour')
    plt.ylabel('Count')
    plt.legend(['Negative', 'Positive'])
    plt.tight_layout()
    plt.savefig('fig/sentiment_by_hour.png')
    plt.close()
    
    # 4. Sentiment Distribution by Day of Week
    plt.figure(figsize=(10, 6))
    day_counts = df.groupby(['day_of_week', 'sentiment']).size().unstack()
    day_counts.plot(kind='bar', stacked=True)
    plt.title('Sentiment Distribution by Day of Week')
    plt.xlabel('Day of Week (0=Monday)')
    plt.ylabel('Count')
    plt.legend(['Negative', 'Positive'])
    plt.tight_layout()
    plt.savefig('fig/sentiment_by_day.png')
    plt.close()
    
    # 5. User Sentiment Patterns (Top 10 users by tweet count)
    top_users = df['user'].value_counts().head(10).index
    user_df = df[df['user'].isin(top_users)]
    
    plt.figure(figsize=(12, 8))
    user_sentiment = user_df.groupby('user')['sentiment'].mean().sort_values()
    sns.barplot(x=user_sentiment.index, y=user_sentiment.values)
    plt.title('Average Sentiment for Top 10 Users')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.savefig('fig/user_sentiment.png')
    plt.close()
    
    # 6. Word Clouds by Sentiment
    for sentiment, label in [(0, 'Negative'), (1, 'Positive')]:
        text = ' '.join(df[df['sentiment'] == sentiment]['clean_text'])
        
        wordcloud = WordCloud(
            width=800, height=400,
            background_color='white',
            max_words=200
        ).generate(text)
        
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Word Cloud for {label} Sentiment')
        plt.tight_layout()
        plt.savefig(f'fig/wordcloud_sentiment_{sentiment}.png')
        plt.close()
    
    # 7. Model Comparison
    accuracies = {name: info['accuracy'] for name, info in results.items()}
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()))
    plt.title('Model Accuracy Comparison')
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('fig/model_comparison.png')
    plt.close()
    
    # 8. Confusion Matrix Visualization
    for name, info in results.items():
        plt.figure(figsize=(8, 6))
        cm = info['confusion_matrix']
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                   xticklabels=['Negative', 'Positive'],
                   yticklabels=['Negative', 'Positive'])
        plt.title(f'Confusion Matrix - {name}')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.savefig(f'fig/confusion_matrix_{name}.png')
        plt.close()

def compare_processing_methods(df, test_size=1000):
    """
    Compare local vs distributed processing performance
    """
    # Subset data for testing
    test_df = df.sample(test_size, random_state=42)
    
    # 1. Local Python Implementation
    print("Testing Local Python Implementation...")
    start_time = time.time()
    
    # Simulate local processing
    tokens = test_df['text'].apply(clean_text).apply(word_tokenize).tolist()
    local_time = time.time() - start_time
    print(f"Local processing time: {local_time:.2f} seconds")
    
    # 2. Simulated Distributed Processing
    try:
        print("\nTesting Parallel Processing Implementation...")
        start_time = time.time()
        
        # Determine number of cores
        num_cores = multiprocessing.cpu_count()
        print(f"Using {num_cores} cores")
        
        # Split data into chunks
        chunks = np.array_split(test_df['text'], num_cores)
        
        # Define processing function
        def process_chunk(chunk):
            return [word_tokenize(clean_text(text)) for text in chunk]
        
        # Create a pool and process in parallel
        with multiprocessing.Pool(num_cores) as pool:
            results = pool.map(process_chunk, chunks)
            
        # Flatten results
        parallel_tokens = [item for sublist in results for item in sublist]
        
        parallel_time = time.time() - start_time
        print(f"Parallel processing time: {parallel_time:.2f} seconds")
        print(f"Speedup: {local_time / parallel_time:.2f}x")
        
    except Exception as e:
        print(f"Error in parallel processing: {e}")
        print("Please set up a proper distributed environment for actual testing")
        parallel_time = None
    
    return {'local_time': local_time, 'parallel_time': parallel_time}


In [6]:
def prepare_features_with_selective_pca(df, w2v_df, num_pca_components=10):
    """
    Apply PCA selectively to numerical features only, then combine with
    categorical features and Word2Vec embeddings
    """
    # Define feature groups
    numerical_features = ['word_count', 'hashtag_count', 
                         'mention_count', 'url_count', 'user_tweet_count', 
                         'user_sentiment_std', 'avg_posting_gap_seconds']
    
    categorical_features = ['hour', 'day_of_week', 'month']
    
    # 1. Extract and standardize numerical features
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    numerical_scaled = scaler.fit_transform(df[numerical_features])
    
    # 2. Apply PCA to numerical features only
    from sklearn.decomposition import PCA
    pca = PCA(n_components=min(num_pca_components, len(numerical_features)))
    numerical_pca = pca.fit_transform(numerical_scaled)
    
    # Print variance explained by numerical PCA
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    print(f"Numerical PCA: {pca.n_components_} components explain {cumulative_variance[-1]:.2%} of variance")
    
    # 3. One-hot encode categorical features
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    categorical_encoded = encoder.fit_transform(df[categorical_features])
    
    # 4. Combine all features: numerical_pca + categorical_encoded + w2v_df
    
    # Convert to numeric arrays if needed
    numerical_pca_array = numerical_pca
    w2v_array = w2v_df.values
    
    # Combine all features
    combined_features = np.hstack([
        numerical_pca_array,   # PCA-reduced numerical features
        categorical_encoded,   # One-hot encoded categorical features
        w2v_array              # Word2Vec embeddings
    ])
    
    # Create feature names for interpretability
    feature_names = (
        [f'num_pca_{i}' for i in range(pca.n_components_)] +
        [f'{feat}_{cat}' for feat, cats in zip(encoder.feature_names_in_, 
                                              encoder.categories_) 
                          for cat in cats[1:]] +
        list(w2v_df.columns)
    )
    
    # Return as DataFrame for convenience
    combined_df = pd.DataFrame(combined_features, columns=feature_names)
    
    return combined_df, pca, encoder


def analyze_pca_components(pca, feature_names, n_components=4, n_top_features=4):
    """
    Analyze PCA components and print the top features for each component
    
    Parameters:
    -----------
    pca : PCA
        Fitted PCA model
    feature_names : list
        Names of the features used in PCA
    n_components : int
        Number of principal components to analyze
    n_top_features : int
        Number of top features to display for each component
    """
    # Check if we have fewer components than requested
    n_components = min(n_components, pca.n_components_)
    
    print(f"\nTop {n_top_features} features for each of the first {n_components} principal components:")
    
    # For each component
    for i in range(n_components):
        # Get loadings (weights) for this component
        loadings = pca.components_[i]
        
        # Get indices of top features (highest absolute loadings)
        top_indices = np.argsort(np.abs(loadings))[-n_top_features:]
        
        # Reverse to get highest first
        top_indices = top_indices[::-1]
        
        # Print component number and variance explained
        variance = pca.explained_variance_ratio_[i]
        print(f"\nComponent {i+1} (explains {variance:.2%} of variance):")
        
        # Print top features
        for idx in top_indices:
            # Get feature name and loading
            feature = feature_names[idx]
            loading = loadings[idx]
            sign = "+" if loading > 0 else "-"
            
            # Print feature and its loading
            print(f"  {sign} {feature}: {abs(loading):.4f}")
    
    # Print cumulative variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    print(f"\nCumulative variance explained by these {n_components} components: {cumulative_variance[n_components-1]:.2%}")
    
    return

In [7]:
print("Loading and preprocessing data...")
df_all = load_and_preprocess_data('sentiment140.csv')
df_all.head()

Loading and preprocessing data...


Unnamed: 0,sentiment,id,date,query,user,text,text_length,word_count,hashtag_count,mention_count,url_count,hour,day_of_week,month
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",115,19,0,1,1,22,0,4
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,111,21,0,0,0,22,0,4
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,89,18,0,1,0,22,0,4
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,47,10,0,0,0,22,0,4
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",111,21,0,1,0,22,0,4


In [54]:
df = df_all.sample(50000, replace=False, random_state=42)
print("\nBasic statistics:")
print(f"Dataset shape: {df.shape}")
print(f"Sentiment distribution: {df['sentiment'].value_counts(normalize=True)}")

# After feature engineering
df = engineer_features(df)

# Extract Word2Vec features normally (without PCA)
print("\nExtracting Word2Vec features...")
w2v_df, w2v_model = extract_word2vec_features(df, vector_size=25)  # Use smaller vector size

# Apply selective PCA approach
print("\nPreparing features with selective PCA...")
features_df, numerical_pca, categorical_encoder = prepare_features_with_selective_pca(
    df, w2v_df, num_pca_components=4
)

# Define feature groups
numerical_features = ['word_count', 'hashtag_count', 
                        'mention_count', 'url_count', 'user_tweet_count', 
                        'user_sentiment_std', 'avg_posting_gap_seconds']

# Analyze the PCA components
analyze_pca_components(
    pca=numerical_pca,
    feature_names=df[numerical_features].columns,  # Original feature names
    n_components=4,  # Show top 4 components
    n_top_features=4  # Show top 4 features per component
)



Basic statistics:
Dataset shape: (50000, 14)
Sentiment distribution: sentiment
1    0.50028
0    0.49972
Name: proportion, dtype: float64

Extracting Word2Vec features...

Preparing features with selective PCA...
Numerical PCA: 4 components explain 70.51% of variance

Top 4 features for each of the first 4 principal components:

Component 1 (explains 26.75% of variance):
  + user_sentiment_std: 0.5918
  + user_tweet_count: 0.5613
  + avg_posting_gap_seconds: 0.5430
  + mention_count: 0.1921

Component 2 (explains 15.47% of variance):
  + word_count: 0.6287
  + mention_count: 0.5913
  - url_count: 0.4509
  + hashtag_count: 0.1565

Component 3 (explains 14.69% of variance):
  + hashtag_count: 0.7655
  + url_count: 0.5845
  + word_count: 0.2657
  - mention_count: 0.0410

Component 4 (explains 13.61% of variance):
  + hashtag_count: 0.6222
  - url_count: 0.5818
  - word_count: 0.5201
  - mention_count: 0.0503

Cumulative variance explained by these 4 components: 70.51%


In [55]:
# Split the data
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    features_df, df['sentiment'], df.index, test_size=0.2, random_state=42, stratify=df['sentiment']
)

# Train models
results = train_evaluate_models(X_train, X_test, y_train, y_test)

Training SVM...
Training SVM took 127.90 seconds
SVM Accuracy: 0.7451
Confusion Matrix:
[[3715 1282]
 [1267 3736]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      4997
           1       0.74      0.75      0.75      5003

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

Training Random Forest...
Training Random Forest took 39.03 seconds
Random Forest Accuracy: 0.7306
Confusion Matrix:
[[3623 1374]
 [1320 3683]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.73      0.73      4997
           1       0.73      0.74      0.73      5003

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000

Training XGBoost...
Training XGBoost took

In [33]:
# from sklearn.model_selection import GridSearchCV

# # SVM - Focus on kernel, C, and gamma (most impactful for SVM)
# svm_param_grid = {
#     'C': [0.1, 1, 10],
#     'kernel': ['linear', 'rbf'],  # Removed 'poly' to simplify
#     'gamma': ['scale', 'auto']
# }

# svm_grid = GridSearchCV(SVC(random_state=42), svm_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# svm_grid.fit(X_train, y_train)
# print(f"Best SVM parameters: {svm_grid.best_params_}")
# best_svm = svm_grid.best_estimator_

# # Random Forest - Focus on n_estimators and max_depth
# rf_param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [None, 20]
# }

# rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# rf_grid.fit(X_train, y_train)
# print(f"Best Random Forest parameters: {rf_grid.best_params_}")
# best_rf = rf_grid.best_estimator_

# # XGBoost - Focus on n_estimators, learning_rate and max_depth
# xgb_param_grid = {
#     'n_estimators': [100, 200],
#     'learning_rate': [0.01, 0.1],
#     'max_depth': [3, 5]
# }

# xgb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# xgb_grid.fit(X_train, y_train)
# print(f"Best XGBoost parameters: {xgb_grid.best_params_}")
# best_xgb = xgb_grid.best_estimator_

Best SVM parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Best Random Forest parameters: {'max_depth': 20, 'n_estimators': 200}
Best XGBoost parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


In [56]:
print("\nPerforming cross-validation...")
y = df['sentiment']
time_scores, user_scores = perform_cross_validation(features_df, y, df)


Performing cross-validation...
Performing Time-based Cross Validation
Time-based CV took 809.85 seconds
Time-based CV Scores: [0.7395895835833434, 0.7353894155766231, 0.7389895595823833, 0.7451098043921757, 0.750030001200048]
Mean Time-based CV Score: 0.7418

Performing User-based Cross Validation
User-based CV took 1443.48 seconds
User-based CV Scores: [0.7465, 0.7482, 0.7482, 0.7419, 0.7423]
Mean User-based CV Score: 0.7454


In [57]:
print("\nAnalyzing misclassified examples...")
best_model_name = max(results.items(), key=lambda x: x[1]['accuracy'])[0]
best_model = results[best_model_name]['model']
misclassified_df = analyze_misclassified_examples(df, X_test, y_test, best_model, idx_test)


Analyzing misclassified examples...
Number of misclassified examples: 2549

Misclassification Analysis by Features:

By Text Length:
text_length_bin
Very Short    0.302079
Short         0.407611
Medium        0.289525
Long          0.000785
Very Long     0.000000
Name: proportion, dtype: float64

By User Tweet Count:
user_tweet_count_bin
Very Few     0.988231
Few          0.010985
Average      0.000785
Many         0.000000
Very Many    0.000000
Name: proportion, dtype: float64

By Hour of Day:
hour_bin
Night        0.314179
Morning      0.274080
Afternoon    0.199669
Evening      0.212071
Name: proportion, dtype: float64

Sample of Misclassified Examples:
Text: @jkldesign I promise to keep you pepped up if you do the same for me...as I have a million things to do also lol. Sorry about the tubes 
True Sentiment: 0, Predicted: 1
--------------------------------------------------
Text: @MissKeriBaby couldn't make it 2 the show  afterparty???
True Sentiment: 0, Predicted: 1
-------------

In [58]:
print("\nCreating visualizations...")
visualize_results(df, results, numerical_pca, numerical_features)

print("\nComparing processing methods...")
performance_results = compare_processing_methods(df)

print("\nDone!")


Creating visualizations...

Comparing processing methods...
Testing Local Python Implementation...
Local processing time: 0.09 seconds

Testing Parallel Processing Implementation...
Using 8 cores
Error in parallel processing: Can't pickle local object 'compare_processing_methods.<locals>.process_chunk'
Please set up a proper distributed environment for actual testing

Done!


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>