### Research questions
1. prediction problem (classification) and feature weights - simply predict the next sentiment based on generated features, and analyse which feature contributes the most. examples: user-id (same thing) date (time), text (key words).
2. incoperate with LLM to give explanations of why the text is classified as given sentiment.
3. efficient forecasting over large datasets, create a basic model, and compared two ways of processing data. 1, deploy locally and use naive python packages. 2, utilize

In [98]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import multiprocessing
import time
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michelletong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michelletong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/michelletong/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [99]:
def load_and_preprocess_data(file_path):
    """
    Load and preprocess the dataset
    """
    # Load the dataset
    df = pd.read_csv(file_path, encoding='latin-1', header=None)
    df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
    
    # Convert sentiment to binary (0: negative, 1: positive)
    # Assuming sentiment values are 0 and 4 in the original dataset
    df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})
    
    # Convert date to datetime
    df['date'] = pd.to_datetime(df['date'], format='%a %b %d %H:%M:%S PDT %Y')
    
    # Extract basic features from text
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()
    df['hashtag_count'] = df['text'].str.count(r'#')
    df['mention_count'] = df['text'].str.count(r'@')
    df['url_count'] = df['text'].str.count(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\\\\\\\(\\\\\\\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    
    # Extract time-based features
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    
    return df

def clean_text(text):
    """
    Clean the text data by removing @mentions, URLs, hashtags, punctuation
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs - more comprehensive pattern
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove @mentions - more comprehensive pattern
    text = re.sub(r'@[\w_]+', '', text)
    
    # Remove hashtags (but keep the text after #)
    text = re.sub(r'#(\w+)', r'\1', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove digits (optional - uncomment if needed)
    # text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace (including newlines)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


In [100]:
def debug_text_cleaning(text):
    """
    Function to debug text cleaning process
    """
    print("Original:", text)
    
    # Test URL removal
    text_no_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    print("After URL removal:", text_no_urls)
    
    # Test @mention removal
    text_no_mentions = re.sub(r'@[\w_]+', '', text_no_urls)
    print("After @mention removal:", text_no_mentions)
    
    # Test hashtag conversion
    text_no_hashtags = re.sub(r'#(\w+)', r'\1', text_no_mentions)
    print("After hashtag conversion:", text_no_hashtags)
    
    # Test punctuation removal
    text_no_punct = re.sub(r'[^\w\s]', '', text_no_hashtags)
    print("After punctuation removal:", text_no_punct)
    
    # Test whitespace cleaning
    text_clean = re.sub(r'\s+', ' ', text_no_punct).strip()
    print("Final cleaned:", text_clean)
    
    return text_clean


In [101]:
def create_visualizations(df):
    # 1. Sentiment Distribution
    plt.figure(figsize=(8, 6))
    sns.countplot(x='sentiment', data=df)
    plt.title('Distribution of Sentiments')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.xticks([0, 1], ['Negative', 'Positive'])
    plt.savefig('fig/sentiment_distribution.png')
    plt.close()
    
    # 2. Text Length Distribution by Sentiment
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='sentiment', y='text_length', data=df)
    plt.title('Text Length Distribution by Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Text Length')
    plt.xticks([0, 1], ['Negative', 'Positive'])
    plt.savefig('fig/text_length_distribution.png')
    plt.close()
    
    # 3. Time-based Analysis
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    
    # Hourly distribution
    sns.countplot(x='hour', hue='sentiment', data=df, ax=axes[0])
    axes[0].set_title('Tweets by Hour of Day')
    axes[0].set_xlabel('Hour')
    axes[0].set_ylabel('Count')
    axes[0].legend(title='Sentiment', labels=['Negative', 'Positive'])
    
    # Day of week distribution
    sns.countplot(x='day_of_week', hue='sentiment', data=df, ax=axes[1])
    axes[1].set_title('Tweets by Day of Week')
    axes[1].set_xlabel('Day of Week (0=Monday)')
    axes[1].set_ylabel('Count')
    axes[1].legend(title='Sentiment', labels=['Negative', 'Positive'])
    
    # Monthly distribution
    sns.countplot(x='month', hue='sentiment', data=df, ax=axes[2])
    axes[2].set_title('Tweets by Month')
    axes[2].set_xlabel('Month')
    axes[2].set_ylabel('Count')
    axes[2].legend(title='Sentiment', labels=['Negative', 'Positive'])
    
    plt.tight_layout()
    plt.savefig('fig/time_based_analysis.png')
    plt.close()
    
    # 4. Feature Correlation Analysis
    plt.figure(figsize=(10, 8))
    correlation_matrix = df[['sentiment', 'text_length', 'word_count', 'hashtag_count', 'mention_count', 'url_count']].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    plt.savefig('fig/feature_correlation.png')
    plt.close()
    
    # 5. Word Clouds for Positive and Negative Tweets
    def generate_wordcloud(text, title, filename):
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.savefig(filename)
        plt.close()
    
    # Generate word clouds for positive and negative tweets
    positive_text = ' '.join(df[df['sentiment'] == 1]['text'])
    negative_text = ' '.join(df[df['sentiment'] == 0]['text'])
    
    generate_wordcloud(positive_text, 'Word Cloud for Positive Tweets', 'positive_wordcloud.png')
    generate_wordcloud(negative_text, 'Word Cloud for Negative Tweets', 'negative_wordcloud.png')


In [102]:
def engineer_features(df):
    """
    Engineer features based on the project outline
    """
    # 1. User-based Features
    
    # Group by user and calculate statistics
    user_stats = df.groupby('user')['sentiment'].agg(['mean', 'count', 'std']).reset_index()
    
    # Calculate correct std with n-1 denominator
    def adjusted_std(group):
        if len(group) <= 1:
            return 0
        return np.std(group, ddof=1)  # ddof=1 uses n-1 denominator
    
    user_sentiment_std = df.groupby('user')['sentiment'].apply(adjusted_std)
    user_stats['std'] = user_stats['user'].map(user_sentiment_std)
    
    # Handle case where a user has only one tweet (std is NaN)
    user_stats['std'] = user_stats['std'].fillna(0)
    
    user_stats.columns = ['user', 'user_avg_sentiment', 'user_tweet_count', 'user_sentiment_std']
    
    # Merge user stats back to main dataframe
    df = pd.merge(df, user_stats, on='user', how='left')
    
    # Calculate average posting gap time for each user
    df = df.sort_values(['user', 'date'])
    
    # Function to calculate average time between posts
    def calc_avg_gap(group):
        if len(group) <= 1:
            return pd.Timedelta(0)
        gaps = group['date'].diff().dropna()
        return gaps.mean()
    
    # Calculate average gap for each user
    avg_gaps = df.groupby('user').apply(calc_avg_gap)
    avg_gaps_seconds = avg_gaps.dt.total_seconds()
    avg_gaps_df = pd.DataFrame({
        'user': avg_gaps.index, 
        'avg_posting_gap_seconds': avg_gaps_seconds.values
    })
    
    # Merge gaps back to main dataframe
    df = pd.merge(df, avg_gaps_df, on='user', how='left')
    df['avg_posting_gap_seconds'] = df['avg_posting_gap_seconds'].fillna(0)
    
    # 2. Text Processing Features
    
    # Apply text cleaning
    df['clean_text'] = df['text'].apply(clean_text)
    
    # Create tokenized text for Word2Vec
    df['tokens'] = df['clean_text'].apply(word_tokenize)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
    
    return df

def extract_word2vec_features(df, vector_size=100):
    """
    Extract Word2Vec features
    """
    # Train Word2Vec model
    all_tokens = df['tokens'].tolist()
    w2v_model = Word2Vec(
        sentences=all_tokens,
        vector_size=vector_size,
        window=5,
        min_count=1,
        workers=4
    )
    
    # Function to get document vectors by averaging word vectors
    def get_doc_vector(tokens):
        vec = np.zeros(vector_size)
        count = 0
        for word in tokens:
            try:
                vec += w2v_model.wv[word]
                count += 1
            except KeyError:
                # Word not in vocabulary
                continue
        if count > 0:
            vec /= count
        return vec
    
    # Get document vectors
    doc_vectors = np.array(df['tokens'].apply(get_doc_vector).tolist())
    w2v_df = pd.DataFrame(
        doc_vectors,
        columns=[f'w2v_{i}' for i in range(vector_size)]
    )
    
    return w2v_df, w2v_model

def select_features(features_df, n_components=50):
    """
    Perform PCA for feature selection
    """
    # Initialize PCA
    pca = PCA(n_components=n_components)
    
    # Fit and transform
    pca_features = pca.fit_transform(features_df)
    
    # Convert to DataFrame
    pca_df = pd.DataFrame(
        pca_features, 
        columns=[f'pca_{i}' for i in range(n_components)]
    )
    
    # Calculate explained variance ratio
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    
    # Print variance explanation
    print(f"Top 10 components explain {cumulative_variance[9]:.2%} of variance")
    print(f"All {n_components} components explain {cumulative_variance[-1]:.2%} of variance")
    
    return pca_df, pca

def train_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate classification models
    """
    # Define base models
    svm = SVC(probability=False, kernel='rbf')
    rf = RandomForestClassifier(n_estimators=100)
    xgb = GradientBoostingClassifier(n_estimators=100)
    
    # Train individual models
    models = {
        'SVM': svm,
        'Random Forest': rf,
        'XGBoost': xgb
    }
    
    results = {}
    
    for name, model in models.items():
        print(f"Training {name}...")
        t0 = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        
        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'confusion_matrix': conf_matrix,
            'classification_report': class_report
        }
        
        print(f"Training {name} took {time.time() - t0:.2f} seconds")
        print(f"{name} Accuracy: {accuracy:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")
        print(f"Classification Report:\n{class_report}")
        print("="*50)
    
    # Create Voting Ensemble (majority voting)
    voting_clf = VotingClassifier(
        estimators=[('svm', svm), ('rf', rf), ('xgb', xgb)],
        voting='hard'  # Majority voting
    )
    
    print("Training Ensemble (Majority Voting)...")
    t0 = time.time()
    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    results['Ensemble'] = {
        'model': voting_clf,
        'accuracy': accuracy,
        'confusion_matrix': conf_matrix,
        'classification_report': class_report
    }
    
    print("Training Ensemble took %0.2f seconds" % (time.time() - t0))
    print(f"Ensemble Accuracy: {accuracy:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{class_report}")
    
    return results

def perform_cross_validation(X, y, df):
    """
    Perform time-based and user-based cross-validation
    """
    # Time-based Cross Validation
    print("Performing Time-based Cross Validation")
    t0 = time.time()
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Use SVM as model for validation
    model = SVC(kernel='rbf')
    
    time_scores = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        time_scores.append(score)
    
    print(f"Time-based CV took {time.time() - t0:.2f} seconds")
    print(f"Time-based CV Scores: {time_scores}")
    print(f"Mean Time-based CV Score: {np.mean(time_scores):.4f}")
    
    # User-based Cross Validation
    print("\nPerforming User-based Cross Validation")
    t0 = time.time()
    user_groups = df['user'].astype('category').cat.codes.values
    gkf = GroupKFold(n_splits=5)
    
    user_scores = []
    for train_index, test_index in gkf.split(X, y, groups=user_groups):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        user_scores.append(score)
    
    print(f"User-based CV took {time.time() - t0:.2f} seconds")
    print(f"User-based CV Scores: {user_scores}")
    print(f"Mean User-based CV Score: {np.mean(user_scores):.4f}")
    
    return time_scores, user_scores

def analyze_misclassified_examples(df, X_test, y_test, model, idx_test):
    """
    Analyze misclassified examples
    """
    y_pred = model.predict(X_test)
    misclassified_idx = idx_test[y_pred != y_test]
    
    misclassified_df = df.iloc[misclassified_idx].copy()
    misclassified_df['predicted_sentiment'] = y_pred[y_pred != y_test]
    
    print(f"Number of misclassified examples: {len(misclassified_df)}")
    
    # Analyze by features
    print("\nMisclassification Analysis by Features:")
    
    # By text length
    print("\nBy Text Length:")
    bins = [0, 50, 100, 150, 200, np.inf]
    labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
    misclassified_df['text_length_bin'] = pd.cut(misclassified_df['text_length'], bins=bins, labels=labels)
    print(misclassified_df['text_length_bin'].value_counts(normalize=True).sort_index())
    
    # By user tweet count
    print("\nBy User Tweet Count:")
    bins = [0, 5, 10, 20, 50, np.inf]
    labels = ['Very Few', 'Few', 'Average', 'Many', 'Very Many']
    misclassified_df['user_tweet_count_bin'] = pd.cut(misclassified_df['user_tweet_count'], bins=bins, labels=labels)
    print(misclassified_df['user_tweet_count_bin'].value_counts(normalize=True).sort_index())
    
    # By time of day
    print("\nBy Hour of Day:")
    hour_bins = [0, 6, 12, 18, 24]
    hour_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
    misclassified_df['hour_bin'] = pd.cut(misclassified_df['hour'], bins=hour_bins, labels=hour_labels)
    print(misclassified_df['hour_bin'].value_counts(normalize=True).sort_index())
    
    # Sample of misclassified examples
    print("\nSample of Misclassified Examples:")
    sample = misclassified_df.sample(min(5, len(misclassified_df)))
    for _, row in sample.iterrows():
        print(f"Text: {row['text']}")
        print(f"True Sentiment: {row['sentiment']}, Predicted: {row['predicted_sentiment']}")
        print("-" * 50)
    
    return misclassified_df

def visualize_results(df, results, pca, feature_names):
    """
    Create visualizations for the analysis
    """
    # 1. PCA Explained Variance
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_)
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.title('Explained Variance by Principal Component')
    plt.tight_layout()
    plt.savefig('fig/pca_variance.png')
    plt.close()
    
    # 2. Feature Importance from PCA loadings
    plt.figure(figsize=(12, 8))
    # Get most important features from first component
    component = 0
    loadings = pd.Series(abs(pca.components_[component]), index=feature_names)
    top_features = loadings.nlargest(15)
    
    sns.barplot(x=top_features.values, y=top_features.index)
    plt.title(f'Top 15 Feature Importances (PC {component+1})')
    plt.xlabel('Absolute Loading Value')
    plt.tight_layout()
    plt.savefig('fig/feature_importance.png')
    plt.close()
    
    # 3. Sentiment Distribution by Time of Day
    plt.figure(figsize=(10, 6))
    hour_counts = df.groupby(['hour', 'sentiment']).size().unstack()
    hour_counts.plot(kind='bar', stacked=True)
    plt.title('Sentiment Distribution by Hour of Day')
    plt.xlabel('Hour')
    plt.ylabel('Count')
    plt.legend(['Negative', 'Positive'])
    plt.tight_layout()
    plt.savefig('fig/sentiment_by_hour.png')
    plt.close()
    
    # 4. Sentiment Distribution by Day of Week
    plt.figure(figsize=(10, 6))
    day_counts = df.groupby(['day_of_week', 'sentiment']).size().unstack()
    day_counts.plot(kind='bar', stacked=True)
    plt.title('Sentiment Distribution by Day of Week')
    plt.xlabel('Day of Week (0=Monday)')
    plt.ylabel('Count')
    plt.legend(['Negative', 'Positive'])
    plt.tight_layout()
    plt.savefig('fig/sentiment_by_day.png')
    plt.close()
    
    # 5. User Sentiment Patterns (Top 10 users by tweet count)
    top_users = df['user'].value_counts().head(10).index
    user_df = df[df['user'].isin(top_users)]
    
    plt.figure(figsize=(12, 8))
    user_sentiment = user_df.groupby('user')['sentiment'].mean().sort_values()
    sns.barplot(x=user_sentiment.index, y=user_sentiment.values)
    plt.title('Average Sentiment for Top 10 Users')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.savefig('fig/user_sentiment.png')
    plt.close()
    
    # 6. Word Clouds by Sentiment
    for sentiment, label in [(0, 'Negative'), (1, 'Positive')]:
        text = ' '.join(df[df['sentiment'] == sentiment]['clean_text'])
        
        wordcloud = WordCloud(
            width=800, height=400,
            background_color='white',
            max_words=200
        ).generate(text)
        
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Word Cloud for {label} Sentiment')
        plt.tight_layout()
        plt.savefig(f'fig/wordcloud_sentiment_{sentiment}.png')
        plt.close()
    
    # 7. Model Comparison
    accuracies = {name: info['accuracy'] for name, info in results.items()}
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()))
    plt.title('Model Accuracy Comparison')
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('fig/model_comparison.png')
    plt.close()
    
    # 8. Confusion Matrix Visualization
    for name, info in results.items():
        plt.figure(figsize=(8, 6))
        cm = info['confusion_matrix']
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                   xticklabels=['Negative', 'Positive'],
                   yticklabels=['Negative', 'Positive'])
        plt.title(f'Confusion Matrix - {name}')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.savefig(f'fig/confusion_matrix_{name}.png')
        plt.close()

def compare_processing_methods(df, test_size=1000):
    """
    Compare local vs distributed processing performance
    """
    # Subset data for testing
    test_df = df.sample(test_size, random_state=42)
    
    # 1. Local Python Implementation
    print("Testing Local Python Implementation...")
    start_time = time.time()
    
    # Simulate local processing
    tokens = test_df['text'].apply(clean_text).apply(word_tokenize).tolist()
    local_time = time.time() - start_time
    print(f"Local processing time: {local_time:.2f} seconds")
    
    # 2. Simulated Distributed Processing
    try:
        print("\nTesting Parallel Processing Implementation...")
        start_time = time.time()
        
        # Determine number of cores
        num_cores = multiprocessing.cpu_count()
        print(f"Using {num_cores} cores")
        
        # Split data into chunks
        chunks = np.array_split(test_df['text'], num_cores)
        
        # Define processing function
        def process_chunk(chunk):
            return [word_tokenize(clean_text(text)) for text in chunk]
        
        # Create a pool and process in parallel
        with multiprocessing.Pool(num_cores) as pool:
            results = pool.map(process_chunk, chunks)
            
        # Flatten results
        parallel_tokens = [item for sublist in results for item in sublist]
        
        parallel_time = time.time() - start_time
        print(f"Parallel processing time: {parallel_time:.2f} seconds")
        print(f"Speedup: {local_time / parallel_time:.2f}x")
        
    except Exception as e:
        print(f"Error in parallel processing: {e}")
        print("Please set up a proper distributed environment for actual testing")
        parallel_time = None
    
    return {'local_time': local_time, 'parallel_time': parallel_time}


In [103]:
print("Loading and preprocessing data...")
df_all = load_and_preprocess_data('sentiment140.csv')
df_all.head()

Loading and preprocessing data...


Unnamed: 0,sentiment,id,date,query,user,text,text_length,word_count,hashtag_count,mention_count,url_count,hour,day_of_week,month
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",115,19,0,1,1,22,0,4
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,111,21,0,0,0,22,0,4
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,89,18,0,1,0,22,0,4
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,47,10,0,0,0,22,0,4
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",111,21,0,1,0,22,0,4


In [104]:
df = df_all.sample(10000, replace=False, random_state=42)

In [105]:
print("\nBasic statistics:")
print(f"Dataset shape: {df.shape}")
print(f"Sentiment distribution: {df['sentiment'].value_counts(normalize=True)}")



Basic statistics:
Dataset shape: (10000, 14)
Sentiment distribution: sentiment
0    0.5004
1    0.4996
Name: proportion, dtype: float64


In [106]:
print("\nEngineering features...")
df = engineer_features(df)
df.head()


Engineering features...


Unnamed: 0,sentiment,id,date,query,user,text,text_length,word_count,hashtag_count,mention_count,url_count,hour,day_of_week,month,user_avg_sentiment,user_tweet_count,user_sentiment_std,avg_posting_gap_seconds,clean_text,tokens
0,0,1573020777,2009-04-20 23:24:38,NO_QUERY,00kate00,"thats was the fastest shower of my life, someb...",98,19,0,0,0,23,0,4,0.0,1,0.0,0.0,thats was the fastest shower of my life somebo...,"[thats, fastest, shower, life, somebody, kept,..."
1,1,2016177197,2009-06-03 06:18:47,NO_QUERY,01johnn,in engli8sh... doing CPT again &gt;_&gt; last ...,73,14,0,0,0,6,2,6,1.0,1,0.0,0.0,in engli8sh doing cpt again gt_gt last day cel...,"[engli8sh, cpt, gt_gt, last, day, cell, hours]"
2,0,2047773429,2009-06-05 14:09:32,NO_QUERY,10butterflys,@heavenunaware ouch! did they give anything 4 ...,111,18,0,1,0,14,4,6,0.0,1,0.0,0.0,ouch did they give anything 4 painwhy couldnt ...,"[ouch, give, anything, 4, painwhy, couldnt, to..."
3,0,1695537093,2009-05-04 06:22:40,NO_QUERY,12gaugecows,ahh im gonna go to bed and sleep wats left of ...,65,15,0,0,0,6,0,5,0.0,1,0.0,0.0,ahh im gonna go to bed and sleep wats left of ...,"[ahh, im, gon, na, go, bed, sleep, wats, left,..."
4,1,2177133119,2009-06-15 05:31:02,NO_QUERY,12thCenturyFox,"Tehehe, so many Supernatural fans rather peeve...",123,21,0,0,0,5,0,6,1.0,1,0.0,0.0,tehehe so many supernatural fans rather peeved...,"[tehehe, many, supernatural, fans, rather, pee..."


In [107]:
print("\nExtracting Word2Vec features...")
w2v_df, w2v_model = extract_word2vec_features(df)


Extracting Word2Vec features...


In [108]:
# Combine numeric features with text features, remove highly correlated features (user_avg_sentiment)
numeric_features = df[['text_length', 'word_count', 'hashtag_count', 
                        'mention_count', 'url_count', 'hour', 'day_of_week', 
                        'month', 'user_tweet_count', 
                        'user_sentiment_std', 'avg_posting_gap_seconds']]


In [109]:
correlation_matrix = numeric_features.corr()
high_correlations = (correlation_matrix.abs() > 0.8) & (correlation_matrix.abs() < 1.0)
print(f"Number of highly correlated feature pairs: {high_correlations.sum().sum() // 2}")
correlation_matrix

Number of highly correlated feature pairs: 1


Unnamed: 0,text_length,word_count,hashtag_count,mention_count,url_count,hour,day_of_week,month,user_tweet_count,user_sentiment_std,avg_posting_gap_seconds
text_length,1.0,0.954755,0.084414,0.169021,0.085556,-0.005803,-0.010663,0.00164,0.027241,0.008381,0.022404
word_count,0.954755,1.0,0.050085,0.088909,-0.024535,-0.002258,-0.005395,0.011387,0.017461,0.004821,0.012672
hashtag_count,0.084414,0.050085,1.0,0.006434,-0.002095,-0.030075,0.007988,0.003241,0.007361,-0.004383,0.018234
mention_count,0.169021,0.088909,0.006434,1.0,-0.056854,0.010705,-0.018483,-0.025903,0.093536,0.047612,0.048601
url_count,0.085556,-0.024535,-0.002095,-0.056854,1.0,-0.021421,-0.018023,-0.013231,-0.000214,0.00979,0.028787
hour,-0.005803,-0.002258,-0.030075,0.010705,-0.021421,1.0,0.050252,0.096377,-0.050718,-0.025361,-0.050066
day_of_week,-0.010663,-0.005395,0.007988,-0.018483,-0.018023,0.050252,1.0,-0.260821,0.005455,0.019667,0.013942
month,0.00164,0.011387,0.003241,-0.025903,-0.013231,0.096377,-0.260821,1.0,-0.009186,-0.015644,-0.05263
user_tweet_count,0.027241,0.017461,0.007361,0.093536,-0.000214,-0.050718,0.005455,-0.009186,1.0,0.473877,0.48783
user_sentiment_std,0.008381,0.004821,-0.004383,0.047612,0.00979,-0.025361,0.019667,-0.015644,0.473877,1.0,0.452354


In [110]:

# Reset index to ensure proper concatenation
numeric_features = numeric_features.reset_index(drop=True)
w2v_df = w2v_df.reset_index(drop=True)

# Combine all features
all_features = pd.concat([numeric_features, w2v_df], axis=1)
all_features


Unnamed: 0,text_length,word_count,hashtag_count,mention_count,url_count,hour,day_of_week,month,user_tweet_count,user_sentiment_std,...,w2v_90,w2v_91,w2v_92,w2v_93,w2v_94,w2v_95,w2v_96,w2v_97,w2v_98,w2v_99
0,98,19,0,0,0,23,0,4,1,0.0,...,0.256080,0.095325,-0.005306,0.022826,0.292922,0.182167,0.124928,-0.194928,0.052775,-0.011758
1,73,14,0,0,0,6,2,6,1,0.0,...,0.276297,0.099100,-0.001931,0.026111,0.316984,0.196012,0.133682,-0.203115,0.055438,-0.015974
2,111,18,0,1,0,14,4,6,1,0.0,...,0.154568,0.053209,-0.007994,0.010790,0.172207,0.110940,0.074244,-0.115177,0.033252,-0.007060
3,65,15,0,0,0,6,0,5,1,0.0,...,0.417099,0.146936,-0.006537,0.033354,0.469339,0.289272,0.208701,-0.314249,0.088994,-0.031013
4,123,21,0,0,0,5,0,6,1,0.0,...,0.101580,0.038096,-0.003880,0.007097,0.112110,0.071939,0.049285,-0.079152,0.023295,-0.001952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,86,14,0,0,0,9,5,4,1,0.0,...,0.313657,0.114965,-0.006700,0.027011,0.350745,0.223468,0.150851,-0.231760,0.064528,-0.016540
9996,112,20,0,1,0,22,3,5,1,0.0,...,0.283202,0.101469,-0.002345,0.023188,0.314071,0.202886,0.137725,-0.208752,0.056263,-0.013447
9997,41,7,0,1,0,17,4,6,1,0.0,...,0.280665,0.097638,-0.005766,0.028798,0.318183,0.195324,0.136576,-0.206319,0.061062,-0.012510
9998,58,10,0,1,0,0,4,5,1,0.0,...,0.146276,0.057975,-0.008761,0.009294,0.164181,0.102883,0.070963,-0.110034,0.031665,-0.004307


In [111]:
print("\nPerforming feature selection with PCA...")
feature_names = all_features.columns
pca_df, pca = select_features(all_features)

# Define target variable
y = df['sentiment']

# Split data (keeping track of original indices)
df_index = df.index
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    pca_df, y, df_index, test_size=0.2, random_state=42, stratify=y
)



Performing feature selection with PCA...
Top 10 components explain 100.00% of variance
All 50 components explain 100.00% of variance


In [112]:
print("\nTraining and evaluating models...")
results = train_evaluate_models(X_train, X_test, y_train, y_test)


Training and evaluating models...
Training SVM...
Training SVM took 5.61 seconds
SVM Accuracy: 0.5020
Confusion Matrix:
[[970  31]
 [965  34]]
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.97      0.66      1001
           1       0.52      0.03      0.06       999

    accuracy                           0.50      2000
   macro avg       0.51      0.50      0.36      2000
weighted avg       0.51      0.50      0.36      2000

Training Random Forest...
Training Random Forest took 6.34 seconds
Random Forest Accuracy: 0.6780
Confusion Matrix:
[[676 325]
 [319 680]]
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      1001
           1       0.68      0.68      0.68       999

    accuracy                           0.68      2000
   macro avg       0.68      0.68      0.68      2000
weighted avg       0.68      0.68      0.68      2000

Training XGBoost.

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 0.01]
}

grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)

print(f"Best parameters: {grid.best_params_}")
best_svm = grid.best_estimator_

In [113]:
print("\nPerforming cross-validation...")
time_scores, user_scores = perform_cross_validation(pca_df, y, df)


Performing cross-validation...
Performing Time-based Cross Validation
Time-based CV took 14.15 seconds
Time-based CV Scores: [0.514405762304922, 0.503001200480192, 0.4891956782713085, 0.5132052821128451, 0.5054021608643458]
Mean Time-based CV Score: 0.5050

Performing User-based Cross Validation
User-based CV took 28.79 seconds
User-based CV Scores: [0.516, 0.485, 0.511, 0.502, 0.508]
Mean User-based CV Score: 0.5044


In [114]:
print("\nAnalyzing misclassified examples...")
best_model_name = max(results.items(), key=lambda x: x[1]['accuracy'])[0]
best_model = results[best_model_name]['model']
misclassified_df = analyze_misclassified_examples(df, X_test, y_test, best_model, idx_test)


Analyzing misclassified examples...
Number of misclassified examples: 644

Misclassification Analysis by Features:

By Text Length:
text_length_bin
Very Short    0.324534
Short         0.372671
Medium        0.302795
Long          0.000000
Very Long     0.000000
Name: proportion, dtype: float64

By User Tweet Count:
user_tweet_count_bin
Very Few     1.0
Few          0.0
Average      0.0
Many         0.0
Very Many    0.0
Name: proportion, dtype: float64

By Hour of Day:
hour_bin
Night        0.296173
Morning      0.286190
Afternoon    0.214642
Evening      0.202995
Name: proportion, dtype: float64

Sample of Misclassified Examples:
Text: thinks today will be long, but rewarding  2 more days until A.J.'s wedding!
True Sentiment: 1, Predicted: 0
--------------------------------------------------
Text: just got out....not as bad as I was expecting 
True Sentiment: 1, Predicted: 0
--------------------------------------------------
Text: is still waiting for my number 2 transfer onto my new

In [115]:
print("\nCreating visualizations...")
visualize_results(df, results, pca, feature_names)


Creating visualizations...


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [116]:
print("\nComparing processing methods...")
performance_results = compare_processing_methods(df)


Comparing processing methods...
Testing Local Python Implementation...
Local processing time: 0.08 seconds

Testing Parallel Processing Implementation...
Using 8 cores
Error in parallel processing: Can't pickle local object 'compare_processing_methods.<locals>.process_chunk'
Please set up a proper distributed environment for actual testing


In [117]:
print("\nDone!")


Done!
