In [56]:
import nltk
import pandas as pd
import numpy as np
nltk.download("popular") 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
stopWords = list(set(stopwords.words('english')))# remove repeated words and then convert to list as the input demands it in list format

[nltk_data] Error loading popular: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [57]:
# stopWords

In [58]:

df = pd.read_csv('./final1440_aapl.csv')

In [59]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,nape summit week will also feature the annual ...,0.05066,1.0,2017-12-08 05:00:00
1,1,from apple’s hugely anticipated iphone x to sa...,0.107128,-1.0,2017-12-11 05:00:00
2,2,"“they are not just the largest browser, but th...",0.034394,1.0,2017-12-12 05:00:00
3,3,the humanitarian crisis in the drc has placed ...,0.0,-1.0,2017-12-13 05:00:00
4,4,cramer prefers finisar (nasdaq: fnsr ) after a...,0.032563,1.0,2017-12-15 05:00:00


In [60]:
# Extracting the text, sentiment, and label columns from the dataframe to use as features and targets.
# 'paragraph' will serve as the main feature set for text analysis.
# 'sentiments' and 'labels' are used as additional features and classification targets, respectively.
paragraph = df['text']
sentiments = df['sentiment']
labels = df['label']

# Initializing CountVectorizer to convert the text documents into a matrix of token counts.
# It's configured to consider only those terms that appear in less than 80% of the documents and in at least 2% of the documents,
# applying English stop words filtering and focusing on bi-grams to capture phrases.
vectCount = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stopWords, ngram_range=(2,2))
X = vectCount.fit_transform(paragraph)  # Transforming the 'paragraph' into a matrix of token counts.

# Initializing TfidfTransformer to convert the count matrix from CountVectorizer into a normalized tf or tf-idf representation.
# It smooths idf weights by adding one to document frequencies (as though an extra document was seen containing every term in the collection),
# enabling the use of inverse document frequency reweighting.
TransformTfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = TransformTfidf.fit_transform(X).toarray()  # Applying TF-IDF transformation to the matrix of token counts.

# Initializing PCA (Principal Component Analysis) to reduce the dimensionality of the data to 3 components.
# This step simplifies the dataset while retaining most of the variance, making it easier to visualize and process.
pca = PCA(n_components=3)
pcaComponents = pca.fit_transform(X)  # Transforming the TF-IDF matrix to its top 3 principal components.

# Combining PCA components with the 'sentiments' column to enrich the feature set.
# This is achieved by horizontally stacking the PCA components and the sentiment values, adding sentiment as an additional feature.
XTrainFinal = np.hstack((pcaComponents, np.atleast_2d(sentiments).T))

# Splitting the dataset into training and testing sets, with 70% of the data used for training and 30% for testing.
# The split is performed randomly, with a fixed seed (random_state=23) to ensure reproducibility.
XTrain, XTest, yTrain, yTest = train_test_split(XTrainFinal, labels, test_size=0.3, random_state=23)

# Initializing MinMaxScaler to scale the feature set within a range, typically [0, 1].
# This normalization is important for models that are sensitive to the magnitude of features.
minmaxScaler = preprocessing.MinMaxScaler()
scaledXTrain = minmaxScaler.fit_transform(XTrain)  # Scaling the training data
scaledXTest = minmaxScaler.fit_transform(XTest)    # Scaling the testing data


In [61]:
# Train the model:
# Initializes an SVM model using the Nu Support Vector Classification method. 
# The 'gamma' parameter set to 'scale' automatically uses 1 / (n_features * X.var()) as the value of gamma,
# 'probability=True' enables probability estimates. This model is then fitted to the scaled training data
# along with the corresponding training labels, effectively 'learning' from this data.
SVMModel = svm.NuSVC(gamma='scale', probability=True)
SVMModel.fit(scaledXTrain, yTrain)

# Make predictions:
# With the model trained, it's now used to predict the labels for both the training data itself and the unseen test data.
# This step evaluates how well the model has learned and how it generalizes to new, unseen data.
yTrainPred = SVMModel.predict(scaledXTrain)
yTestPred = SVMModel.predict(scaledXTest)

# Calculate accuracy:
# Calculates the accuracy of the predictions against the actual labels for both training and testing datasets.
# Accuracy is the fraction of predictions our model got right. Additionally, the F1 score is calculated for the test predictions,
# which considers both precision and recall to measure the test's accuracy more comprehensively.
trainingAccuracy = accuracy_score(yTrain, yTrainPred)
testAccuracy = accuracy_score(yTest, yTestPred)
f1score = f1_score(yTest, yTestPred)

# Generate classification report:
# Generates a detailed classification report for the test predictions, providing key metrics such as precision, recall,
# f1-score, and support for each class. This offers a more in-depth look at how well the model performs, especially in
# a multi-class classification scenario.
finalResults = classification_report(yTest, yTestPred)

# Printing results:
# Prints the calculated training and testing accuracy, and the F1 score, formatted to two decimal places for readability.
# Then, it prints the full classification report to provide a comprehensive overview of the model's performance across
# all considered classes.
print(f'Training Accuracy: {trainingAccuracy:.2f}')
print(f'Testing Accuracy: {testAccuracy:.2f}')
print(f'F1 Score: {f1score:.2f}\n')
print('Classification Report:\n', finalResults)


Training Accuracy: 0.79
Testing Accuracy: 0.53
F1 Score: 0.66

Classification Report:
               precision    recall  f1-score   support

        -1.0       0.29      0.29      0.29        28
         1.0       0.66      0.66      0.66        58

    accuracy                           0.53        86
   macro avg       0.47      0.47      0.47        86
weighted avg       0.53      0.53      0.53        86



In [62]:
# Extracting the 'text' and 'label' columns from the dataframe to use as data and labels respectively
paragraph = df['text']
labels = df['label']

# Initializing CountVectorizer with specific parameters to convert text documents to a matrix of token counts.
# Only considering terms that appear in less than 80% of the documents and in at least 2% of the documents.
# Also using bi-grams and removing predefined stop words.
vectCount = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stopWords, ngram_range=(2,2))
X = vectCount.fit_transform(paragraph)  # Fitting and transforming the paragraphs to a term frequency matrix

# Initializing TfidfTransformer to transform the count matrix to a normalized tf-idf representation.
# Setting smooth_idf=True to add one to document frequencies, as if an extra document was seen containing every term in the collection.
# Setting use_idf=True to enable inverse-document-frequency reweighting.
TransformTfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = TransformTfidf.fit_transform(X).toarray()  # Applying TF-IDF transformation

# Initializing PCA to reduce the dimensionality of the tf-idf matrix to 3 components for visualization or further processing.
pca = PCA(n_components=3)
XTrainFinal = pca.fit_transform(X)  # Applying PCA transformation

# Splitting the dataset into training and testing sets with 70% of the data used for training and 30% for testing.
# The split is done randomly with a fixed random state for reproducibility.
XTrain, XTest, yTrain, yTest = train_test_split(XTrainFinal, labels, test_size=0.3, random_state=23)

# Initializing MinMaxScaler to scale the features of the dataset to a given range (default 0 to 1).
# This is important for algorithms that are sensitive to the scale of the data.
minmaxScaler = preprocessing.MinMaxScaler()
scaledXTrain = minmaxScaler.fit_transform(XTrain)  # Scaling training data
scaledXTest = minmaxScaler.fit_transform(XTest)    # Scaling testing data


In [63]:
# Initialize an SVM model with NuSVC, setting gamma to 'scale' for automatic gamma calculation based on the data,
# and enabling probability estimates.
SVMModel = svm.NuSVC(gamma='scale', probability=True)

# Fit the SVM model to the scaled training data and the corresponding labels.
SVMModel.fit(scaledXTrain, yTrain)

# Predict the labels for the training data using the trained SVM model to evaluate overfitting.
yTrainPred = SVMModel.predict(scaledXTrain)
# Predict the labels for the testing data using the trained SVM model for evaluation.
yTestPred = SVMModel.predict(scaledXTest)

# Calculate and print the training accuracy to evaluate how well the model has learned from the training dataset.
trainingAccuracy = accuracy_score(yTrain, yTrainPred)


# Calculate and print the testing accuracy to evaluate how well the model performs on unseen data.
testAccuracy = accuracy_score(yTest, yTestPred)


# Calculate and print the F1 score for the test data predictions to evaluate the balance between precision and recall.
f1score = f1_score(yTest, yTestPred)


# Generate and print a classification report for the test data predictions, providing a detailed evaluation
# including precision, recall, f1-score, and support for each class.
finalResults = classification_report(yTest, yTestPred)


print(f'Training accuracy: {trainingAccuracy:.2f}')
print(f'Testing accuracy: {testAccuracy:.2f}')
print(f'F1 Score: {f1score:.2f}\n')
print(f'Classifcation report:\n', finalResults)

Training accuracy: 0.70
Testing accuracy: 0.52
F1 Score: 0.59

Classifcation report:
               precision    recall  f1-score   support

        -1.0       0.35      0.54      0.42        28
         1.0       0.70      0.52      0.59        58

    accuracy                           0.52        86
   macro avg       0.52      0.53      0.51        86
weighted avg       0.58      0.52      0.54        86

