In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
# Read the two preprocessed CSV inputs: the sentiment-labeled headlines used for
# training, and the unlabeled S&P 500 headlines that will be scored later.
labeled_df = pd.read_csv(filepath_or_buffer='../../data/processed/processed_for_logreg_labeled_data.csv')
unlabeled_df = pd.read_csv(filepath_or_buffer='../../data/processed/processed_for_logreg_spx500_unlabeled_data.csv')

In [3]:
# Preview the first five rows of the labeled training data.
labeled_df.head(n=5)

Unnamed: 0,newsHeadline,sentiment,preprocessed_txt
0,"According to Gran , the company has no plans t...",neutral,according gran company plan move production ru...
1,Technopolis plans to develop in stages an area...,neutral,technopolis plan develop stage area less 10000...
2,The international electronic industry company ...,negative,international electronic industry company elco...
3,With the new production plant the company woul...,positive,new production plant company increase capacity...
4,According to the company 's updated strategy f...,positive,according company increasedated strategy year ...


In [5]:
# Preview the first five rows of the unlabeled headlines to be scored.
unlabeled_df.head(n=5)

Unnamed: 0,Headlines,preprocessed_txt
0,"Inflation data, retail sales, Walmart earnings...",inflation data retail sale walmart earnings aw...
1,Whipsaw week for stocks leaves markets 'on edg...,whipsaw stock leaf market edge ahead busy econ...
2,Down Between 12% and 24% From Their 52-Week Hi...,12percent 24percent 52week high 3 magnificent ...
3,Traders Bet on Wild Swings With CPI Print Set ...,trader bet wild swing cpi print set test market
4,"Down 60% This Year, Is Intel Stock a Bargain Buy?",60percent intel stock bargain buy


In [7]:
# Fill missing values in the preprocessed text column with empty strings
# so every document handed to the vectorizer is a string.
labeled_df['preprocessed_txt'] = labeled_df['preprocessed_txt'].fillna('')

# Mapping between textual sentiment labels and integer class codes.
# It is defined here so the notebook works on a fresh kernel: later cells read
# `label_map` to name the classification-report rows and to decode predictions.
# NOTE(review): the code order (negative=0, neutral=1, positive=2) is inferred
# from the row order of the saved classification report — confirm it matches
# the encoding that was used when this notebook was originally run.
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}

# Define the features (X) and the target (y).
# The target is derived from the textual `sentiment` column through `label_map`
# so that the integer codes and the mapping used for decoding always agree.
X = labeled_df['preprocessed_txt']
y = labeled_df['sentiment'].map(label_map)

# Fail loudly on labels that `label_map` does not cover; otherwise they would
# silently become NaN targets.
if y.isna().any():
    unmapped_labels = sorted(labeled_df.loc[y.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f"Unmapped sentiment labels in labeled data: {unmapped_labels}")
y = y.astype(int)

# Split the labeled data into training and test sets (80/20, fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TF-IDF Vectorizer to convert the text data into numerical features.
# The vocabulary is fitted on the training split only; the test split is
# transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [23]:
# Initialize and train the logistic regression model on the TF-IDF features.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_tfidf, y_train)

# Evaluate the model on the held-out test set.
y_pred = model.predict(X_test_tfidf)

# Pass the class codes explicitly together with their names. Without `labels`,
# `target_names` is matched positionally against the sorted class codes, so the
# report rows would be mislabeled whenever the dict order of `label_map` does
# not happen to match that sorted order.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(label_map.values()),
        target_names=list(label_map.keys()),
    )
)


              precision    recall  f1-score   support

    negative       0.79      0.84      0.82       185
     neutral       0.89      0.88      0.88       869
    positive       0.76      0.76      0.76       368

    accuracy                           0.84      1422
   macro avg       0.81      0.82      0.82      1422
weighted avg       0.84      0.84      0.84      1422



In [24]:
# Transform the unlabeled data using the same TF-IDF vectorizer that was fitted
# on the training split. Missing text is replaced with '' first, mirroring the
# handling of the labeled data: the vectorizer rejects NaN documents, and a
# headline whose preprocessed text is empty is read back from CSV as NaN.
# The fill is applied to a copy, so the column stored in `unlabeled_df` is unchanged.
X_unlabeled_tfidf = vectorizer.transform(unlabeled_df['preprocessed_txt'].fillna(''))

# Predict sentiment (integer class codes) using the trained model
unlabeled_df['predicted_sentiment'] = model.predict(X_unlabeled_tfidf)

# Map the numerical predictions back to the textual sentiment labels
reverse_label_map = {v: k for k, v in label_map.items()}
unlabeled_df['predicted_sentiment_label'] = unlabeled_df['predicted_sentiment'].map(reverse_label_map)


In [26]:
# Write the headline, its preprocessed text and the predicted label to a new
# CSV file in the working directory, omitting the DataFrame index.
unlabeled_df.to_csv(
    'predicted_spx500_sentiments.csv',
    columns=['Headlines', 'preprocessed_txt', 'predicted_sentiment_label'],
    index=False,
)


In [27]:
# Show how many headlines were assigned to each predicted sentiment label.
unlabeled_df['predicted_sentiment_label'].value_counts()

predicted_sentiment_label
neutral     137
positive     41
negative     20
Name: count, dtype: int64

In [29]:
# Count predictions per sentiment label.
sent_counts = unlabeled_df.predicted_sentiment_label.value_counts()

# Look each count up by its label, not by position. `value_counts()` orders
# rows by frequency, so positional access would swap the classes whenever the
# ranking changes (e.g. more negative than positive headlines). Integer keys on
# a string-indexed Series are also deprecated positional fallbacks in pandas.
# A label that was never predicted counts as 0.
positive = sent_counts.get('positive', 0)
neutral = sent_counts.get('neutral', 0)
negative = sent_counts.get('negative', 0)

In [31]:
# Net sentiment: the share of positive predictions minus the share of negative
# ones. Neutral predictions only enlarge the denominator, so the score lies in
# [-1, 1] (-1 = all negative, +1 = all positive).
sent_val = (
    (positive - negative)
    / (positive + negative + neutral)
)

In [32]:
# Display the net sentiment score: (positive - negative) / total predictions.
sent_val

0.10606060606060606

In [33]:
# Rescale the score from [-1, 1] to [0, 100]: shift to [0, 2], halve to get
# [0, 1], then express as a percentage (50 corresponds to a net-neutral score).
sent_val_100scale = 100 * ((sent_val + 1) / 2)

In [36]:
# Persist the 0-100 sentiment score as plain text, overwriting any previous value.
with open('../../data/logreg_current_sentiment_value.txt', mode='w') as file:
    file.write(f'{sent_val_100scale!s}')