In [2]:
import numpy as np
import pandas as pd
import joblib
import sys
import os 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

sys.path.append('../')
from src.utilities.config_ import train_data_path, scrape_data_path, model_path

In [3]:
# read csv
train_filename = "finance-dataset.csv"
df = pd.read_csv(os.path.join(train_data_path, train_filename))
df.head()

Unnamed: 0,label,title
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [23]:
# split the data into features (X) and labels (y)
X = df['title']
y = df['label']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data, and transform the testing data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [21]:
# Define the parameter grid for SVM
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Initialize the SVM model
svm_model = SVC()

# Initialize Grid Search with cross-validation
svm_grid_search = GridSearchCV(estimator=svm_model, param_grid=svm_param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit Grid Search to the data
svm_grid_search.fit(X_train_tfidf, y_train)

# Best parameters and score
print("Best SVM parameters:", svm_grid_search.best_params_)
print("Best SVM score:", svm_grid_search.best_score_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.5s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.6s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.7s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.6s
[CV] END .....................C=0.1, gamma=scal

In [11]:
# Use the best estimator to make predictions
best_svm_model = svm_grid_search.best_estimator_
y_pred_svm = best_svm_model.predict(X_test_tfidf)

# Evaluate the model
print("SVM Model after fine-tuning")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Model after fine-tuning
Accuracy: 0.7742268041237114
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.58      0.70       110
     neutral       0.76      0.93      0.84       571
    positive       0.78      0.55      0.64       289

    accuracy                           0.77       970
   macro avg       0.80      0.68      0.72       970
weighted avg       0.78      0.77      0.76       970



In [12]:
# Define the parameter grid for Naive Bayes
nb_param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1, 5, 10]
}

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Initialize Grid Search with cross-validation
nb_grid_search = GridSearchCV(estimator=nb_model, param_grid=nb_param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit Grid Search to the data
nb_grid_search.fit(X_train_tfidf, y_train)

# Best parameters and score
print("Best Naive Bayes parameters:", nb_grid_search.best_params_)
print("Best Naive Bayes score:", nb_grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ........................................

In [13]:
# Use the best estimator to make predictions
best_nb_model = nb_grid_search.best_estimator_
y_pred_nb = best_nb_model.predict(X_test_tfidf)

# Evaluate the model
print("Naive Bayes Model after fine-tuning")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))

Naive Bayes Model after fine-tuning
Accuracy: 0.7350515463917526
Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.53      0.62       110
     neutral       0.75      0.90      0.82       571
    positive       0.68      0.48      0.57       289

    accuracy                           0.74       970
   macro avg       0.73      0.64      0.67       970
weighted avg       0.73      0.74      0.72       970



In [14]:
# save svm model
svm_file = "svm_model.pkl"
joblib.dump(best_svm_model, os.path.join(model_path, svm_file))

# save naive bayes model
nb_file = "naive_bayes_model.pkl"
joblib.dump(best_nb_model, os.path.join(model_path, nb_file))

['/Users/shabrinashafura/Documents/Code/market-dashboard-ml/model/naive_bayes_model.pkl']

In [15]:
# Save the TF-IDF vectorizer
vectorizer_file = "tfidf_vectorizer.pkl"
joblib.dump(vectorizer, os.path.join(model_path, vectorizer_file))

['/Users/shabrinashafura/Documents/Code/market-dashboard-ml/model/tfidf_vectorizer.pkl']

In [16]:
# Load TFIDF
loaded_svm_model = joblib.load(os.path.join(model_path, svm_file))

# Load the saved TF-IDF vectorizer
vectorizer_file = "tfidf_vectorizer.pkl"
loaded_vectorizer = joblib.load(os.path.join(model_path, vectorizer_file))

In [17]:
# New text data to predict
new_texts = ["Market goes down"]

# Preprocess the new text data
new_texts_tfidf = loaded_vectorizer.transform(new_texts)

In [18]:
# Predict the labels for the new text data
predictions = loaded_svm_model.predict(new_texts_tfidf)

# Output the predictions
print(predictions)

['negative']
