# **Text mining: SENTIMENT ANALYSIS**

## 🎓 Master’s Program in Data Science & Advanced Analytics  
**Nova IMS** | March 2025  
**Course:** Business Cases with Data Science

## 👥 Team **Group 34**  
- **[Philippe Dutranoit]** | [20240518]  
- **[Diogo Duarte]** | [20240525]  
- **[Rui luz]** | [20211628]  
- **[Rodrigo Sardinha]** | [20211627]  

## 📊 Goal of the notebook

This notebook focuses on feature selection and engineering for our text-mining project: predicting market sentiment (Bearish, Bullish, Neutral) from Twitter data.  


# Imports

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

In [3]:
X_train = pd.read_csv('../Data/X_train.csv')
y_train = pd.read_csv('../Data/y_train.csv')
X_test = pd.read_csv('../Data/X_val.csv')
y_test = pd.read_csv('../Data/y_val.csv')

# Prepocessing 

In [6]:
def preprocess_text_series(X_train, X_val):
    """
    Preprocesses X_train and X_val by:
    - Lowercasing
    - Removing punctuation
    - Removing digits
    - Removing stopwords
    
    Returns:
    - Cleaned X_train and X_val as pandas Series
    """
    
    def clean_text(text):
        text = str(text).lower()  # Convert to string and lowercase
        text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
        text = re.sub(r'\d+', '', text)  # Remove digits
        tokens = text.split()
        tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]  # Remove stopwords
        return ' '.join(tokens)

    X_train_clean = X_train.apply(clean_text)
    X_val_clean = X_val.apply(clean_text)
    
    return X_train_clean, X_val_clean

In [7]:
X_train, X_val = preprocess_text_series(X_train, X_test)

In [8]:
X_train

Unnamed: 0                             unnamed length dtype int
text          nasdaq prices m senior notes futures httpstcod...
dtype: object

# Model prep 

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

def train_xgboost_with_grid_search(X_train, y_train, X_val, y_val):
    """
    Trains an XGBoost classifier using GridSearchCV with f1_macro scoring.
    
    Parameters:
    - X_train, y_train: training features and labels
    - X_val, y_val: validation features and labels
    - verbose: whether to print evaluation results
    
    Returns:
    - best_model: trained model with best parameters
    - best_params: the best hyperparameter combination
    """
    
    # Define the model
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

    # Define parameter grid
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0]
    }

    # Grid search using f1_macro
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=3,
        n_jobs=-1,
        verbose=0
    )

    # Fit on training data
    grid_search.fit(X_train, y_train)

    # Get best model and params
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate on validation set
    y_pred = best_model.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='macro')

    print("Best Parameters:", best_params)
    print(f"Validation F1 Macro Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_val, y_pred))

    return best_model, best_params

# Model training

# Model Training 