#0- Imports

In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
import nltk
import string

In [7]:
# Ensure the VADER lexicon is available locally: look it up first and
# only trigger a download when NLTK reports that it cannot find it.

VADER_RESOURCE_PATH = 'sentiment/vader_lexicon'
try:
    nltk.data.find(VADER_RESOURCE_PATH)
except LookupError:
    nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


#1- counts

In [8]:
def count_mentions(text):
    """Return how many @mentions appear in ``text``.

    A mention is an ``@`` immediately followed by one or more word
    characters (letters, digits or underscore).
    """
    return sum(1 for _ in re.finditer(r'@\w+', text))

def count_hashtags(text):
    """Return how many #hashtags appear in ``text``.

    A hashtag is a ``#`` immediately followed by one or more word
    characters (letters, digits or underscore).
    """
    hashtags = re.findall(r'#\w+', text)
    return len(hashtags)

def count_punctuation(text):
    """Return the number of characters of ``text`` found in ``string.punctuation``.

    Only the ASCII punctuation listed in ``string.punctuation`` is counted;
    other symbols (e.g. a typographic apostrophe) are not.
    """
    ascii_punctuation = set(string.punctuation)
    return len([ch for ch in text if ch in ascii_punctuation])

def count_capital_letters(text):
    """Return the number of uppercase characters in ``text``."""
    # Each character contributes True (1) or False (0) to the total.
    return sum(map(str.isupper, text))

def text_length(text):
    """Return the length of ``text`` as reported by ``len``."""
    n_chars = len(text)
    return n_chars

def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def average_word_length(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Returns 0 when ``text`` contains no tokens (empty or whitespace only).
    """
    tokens = text.split()
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens) if tokens else 0

#2- Sentiment analysis

In [9]:
# Shared analyzer instance; created lazily so that defining this cell does
# not by itself require the lexicon to be loaded.
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def sentiment_analysis(text):
    """Perform sentiment analysis on text.

    Args:
        text: The string to score.

    Returns:
        Whatever ``SentimentIntensityAnalyzer.polarity_scores`` returns for
        ``text``; callers in this notebook read the ``'neg'``, ``'neu'``,
        ``'pos'`` and ``'compound'`` keys of that result.

    The analyzer is constructed once and reused across calls: this function
    is applied row by row over a DataFrame column, and building a new
    analyzer for every row repeats the same setup work each time.
    """
    return _get_sentiment_analyzer().polarity_scores(text)

#3- Boolean indicator features (has_*)

In [10]:
def has_scientific_keywords(text):
    """Check if text contains scientific keywords.

    Args:
        text: The string to inspect.

    Returns:
        True if any keyword from the built-in list occurs in ``text``,
        ignoring case; False otherwise.

    Note:
        Matching is by plain substring, not by whole word, so a keyword
        also matches inside a longer word (e.g. 'science' inside
        'conscience').
    """
    scientific_keywords = ['study', 'research', 'science', 'scientist', 'evidence', 'experiment',
                          'data', 'analysis', 'hypothesis', 'theory', 'clinical', 'medical',
                          'findings', 'journal', 'publication', 'published', 'effects']

    # Lowercase once instead of once per keyword.
    lowered = text.lower()
    return any(keyword in lowered for keyword in scientific_keywords)

def has_question(text):
    """Return True when ``text`` contains at least one question mark."""
    return text.count('?') > 0

def has_numbers(text):
    """Return True when ``text`` contains at least one digit character."""
    first_digit = re.search(r'\d', text)
    return first_digit is not None

#4- create features

In [11]:
def create_bow_features(texts, max_features=1000):
    """Fit a CountVectorizer on ``texts`` and return the resulting features.

    Args:
        texts: The documents passed to ``fit_transform``.
        max_features: Forwarded to ``CountVectorizer(max_features=...)``.

    Returns:
        A ``(features, vectorizer)`` tuple: the output of ``fit_transform``
        and the fitted vectorizer that produced it.
    """
    bow_vectorizer = CountVectorizer(max_features=max_features)
    return bow_vectorizer.fit_transform(texts), bow_vectorizer

def create_tfidf_features(texts, max_features=1000):
    """Fit a TfidfVectorizer on ``texts`` and return the resulting features.

    Args:
        texts: The documents passed to ``fit_transform``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        A ``(features, vectorizer)`` tuple: the output of ``fit_transform``
        and the fitted vectorizer that produced it.
    """
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    return tfidf_vectorizer.fit_transform(texts), tfidf_vectorizer


#5- extract basic features

In [16]:
def extract_basic_features(df, text_column='text'):
    """Return a copy of ``df`` extended with per-row text features.

    Args:
        df: Input DataFrame; it is copied, not modified.
        text_column: Name of the column holding the text to analyse.

    Returns:
        A new DataFrame with the original columns plus, in order: length,
        word, mention, hashtag, punctuation and capital-letter features,
        the three ``has_*`` flags, and four ``sentiment_*`` columns taken
        from the ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'`` entries
        of ``sentiment_analysis``.
    """
    # Work on a copy so the caller's frame is left untouched.
    features = df.copy()

    # (output column, per-text function), applied in this order.
    # NOTE: a 'url_count' feature is intentionally absent; no count_urls
    # helper is visible in this notebook.
    column_extractors = (
        ('text_length', text_length),
        ('word_count', word_count),
        ('avg_word_length', average_word_length),
        ('mention_count', count_mentions),
        ('hashtag_count', count_hashtags),
        ('punctuation_count', count_punctuation),
        ('capital_letter_count', count_capital_letters),
        ('has_question', has_question),
        ('has_numbers', has_numbers),
        ('has_scientific_keywords', has_scientific_keywords),
    )
    for column, extractor in column_extractors:
        features[column] = features[text_column].apply(extractor)

    # Score every text once, then fan the result out into one column per key.
    sentiment_scores = features[text_column].apply(sentiment_analysis)
    sentiment_columns = (
        ('sentiment_negative', 'neg'),
        ('sentiment_neutral', 'neu'),
        ('sentiment_positive', 'pos'),
        ('sentiment_compound', 'compound'),
    )
    for column, score_key in sentiment_columns:
        features[column] = sentiment_scores.apply(lambda scores, key=score_key: scores[key])

    return features

#6- feature engineering

In [13]:
def feature_engineering_pipeline(df, text_column='text', max_features=1000):
    """Run basic feature extraction and TF-IDF, and join the results.

    Args:
        df: Input DataFrame containing ``text_column``.
        text_column: Name of the column holding the text.
        max_features: Passed through to ``create_tfidf_features``.

    Returns:
        A ``(final_df, tfidf_vectorizer)`` tuple. ``final_df`` holds the
        output of ``extract_basic_features`` followed by one dense column
        per TF-IDF feature, named ``tfidf_0``, ``tfidf_1``, ... by position.
    """
    # Hand-crafted features first; this also yields the frame we index by.
    basic = extract_basic_features(df, text_column)

    # TF-IDF over the same text column.
    tfidf_matrix, vectorizer = create_tfidf_features(basic[text_column], max_features)

    # Densify the sparse matrix into a frame aligned with `basic`.
    n_terms = tfidf_matrix.shape[1]
    tfidf_frame = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=['tfidf_' + str(position) for position in range(n_terms)],
        index=basic.index,
    )

    # Side-by-side join of both feature groups.
    combined = pd.concat([basic, tfidf_frame], axis=1)
    return combined, vectorizer

#7- main

In [17]:
# Test functions with outputs
if __name__ == "__main__":
    # Demo / smoke test: load the tweets, exercise each feature function on
    # one example, then run the full pipeline on a sample.
    print("Loading dataset...")
    data = pd.read_csv('scitweets_export.tsv', sep='\t')

    print("Dataset information:")
    print(f"Shape: {data.shape}")
    print("\nColumns:")
    print(data.columns.tolist())
    print("\nSample data:")
    print(data.head(2))

    # Sample a small subset for testing to avoid memory issues
    sample_size = min(1000, len(data))
    print(f"\nUsing a sample of {sample_size} records for testing...")
    sample_data = data.sample(sample_size, random_state=42)

    # Test individual feature extraction functions on a single example
    if not sample_data.empty:
        text = sample_data['text'].iloc[0]
        print("\nSample text for feature extraction:")
        print(text)

        print(f"\nText length: {text_length(text)}")
        print(f"Word count: {word_count(text)}")
        print(f"Average word length: {average_word_length(text)}")

        print(f"Mention count: {count_mentions(text)}")
        print(f"Hashtag count: {count_hashtags(text)}")
        print(f"Punctuation count: {count_punctuation(text)}")
        print(f"Capital letter count: {count_capital_letters(text)}")
        print(f"Has question: {has_question(text)}")
        print(f"Has numbers: {has_numbers(text)}")
        print(f"Has scientific keywords: {has_scientific_keywords(text)}")

        print("\nSentiment analysis:")
        sentiment = sentiment_analysis(text)
        print(sentiment)

    # Extract basic features
    print("\nExtracting basic features...")
    basic_features_df = extract_basic_features(sample_data)
    print("\nBasic features statistics:")
    print(basic_features_df.describe())

    # Test BoW features with a reasonable number of features
    print("\nCreating Bag of Words features...")
    bow_features, bow_vectorizer = create_bow_features(sample_data['text'], max_features=100)
    print(f"BoW shape: {bow_features.shape}")
    print(f"Top 10 BoW feature names: {bow_vectorizer.get_feature_names_out()[:10]}...")

    # Test TF-IDF features
    print("\nCreating TF-IDF features...")
    tfidf_features, tfidf_vectorizer = create_tfidf_features(sample_data['text'], max_features=100)
    print(f"TF-IDF shape: {tfidf_features.shape}")
    print(f"Top 10 TF-IDF feature names: {tfidf_vectorizer.get_feature_names_out()[:10]}...")

    # Run complete feature engineering pipeline
    print("\nRunning complete feature engineering pipeline...")
    try:
        final_df, _ = feature_engineering_pipeline(sample_data, max_features=100)
        print(f"Final DataFrame shape: {final_df.shape}")
        print("\nFeature categories in final DataFrame:")
        # Group columns by type for better readability. The three groups are
        # kept disjoint: a column that came from the dataset is "original",
        # not an engineered "basic text feature".
        excluded_cols = ['tweet_id', 'text']
        dataset_cols = set(data.columns)
        tfidf_cols = [col for col in final_df.columns if col.startswith('tfidf_')]
        original_cols = [col for col in final_df.columns
                         if col in dataset_cols and col not in excluded_cols]
        basic_cols = [col for col in final_df.columns
                      if not col.startswith('tfidf_')
                      and col not in dataset_cols
                      and col not in excluded_cols]

        print(f"- Original data columns: {len(original_cols)}")
        print(f"- Basic text features: {len(basic_cols)}")
        print(f"- TF-IDF features: {len(tfidf_cols)}")

        # Display summary statistics for some key features
        print("\nSummary statistics for key features:")
        key_features = ['text_length', 'word_count', 'sentiment_compound', 'sentiment_positive', 'sentiment_negative']
        print(final_df[key_features].describe())

        # Check correlation with target variable if it exists in the dataset
        target_columns = [col for col in final_df.columns if col in ['science_related', 'scientific_claim', 'scientific_reference', 'scientific_context']]
        if target_columns:
            print("\nCorrelation with target variables:")
            # Compute the correlation matrix once, and only over numeric/bool
            # columns: final_df still carries the raw string 'text' column,
            # which DataFrame.corr() cannot convert on recent pandas versions.
            correlation_matrix = final_df.select_dtypes(include=['number', 'bool']).corr()
            for target in target_columns:
                if target not in correlation_matrix.columns:
                    print(f"\nSkipping {target}: column is not numeric")
                    continue
                print(f"\nTop 10 features correlated with {target}:")
                correlations = correlation_matrix[target].sort_values(ascending=False)
                print(correlations.head(11))  # 11 to include the target itself

    except Exception as e:
        # Best-effort demo: report the failure instead of aborting the cell.
        print(f"Error in feature engineering pipeline: {e}")

Loading dataset...
Dataset information:
Shape: (1140, 7)

Columns:
['Unnamed: 0', 'tweet_id', 'text', 'science_related', 'scientific_claim', 'scientific_reference', 'scientific_context']

Sample data:
   Unnamed: 0            tweet_id  \
0           0  316669998137483264   
1           1  319090866545385472   

                                                text  science_related  \
0  Knees are a bit sore. i guess that's a sign th...                0   
1          McDonald's breakfast stop then the gym 🏀💪                0   

   scientific_claim  scientific_reference  scientific_context  
0               0.0                   0.0                 0.0  
1               0.0                   0.0                 0.0  

Using a sample of 1000 records for testing...

Sample text for feature extraction:
It’s phony Christian Sunday. People who support a hateful administration & then attend church are phonies...PERIOD

Text length: 114
Word count: 16
Average word length: 6.1875
Mention count: 