In [1]:
import nltk
import pandas as pd
import numpy as np
nltk.download("popular") 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
stopWords = list(set(stopwords.words('english')))# remove repeated words and then convert to list as the input demands it in list format

[nltk_data] Error loading popular: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [2]:
df = pd.read_csv('./final60_amzn.csv')
# df = pd.read_csv('./final1440_aapl.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,nape summit week will also feature the annual ...,0.05066,-1.0,2017-12-07 20:00:00
1,1,from apples hugely anticipated iphone x to sam...,0.107128,1.0,2017-12-08 22:00:00
2,2,"they are not just the largest browser, but the...",0.034394,1.0,2017-12-12 02:00:00
3,3,the humanitarian crisis in the drc has placed ...,0.0,-1.0,2017-12-12 22:30:00
4,4,cramer prefers finisar (nasdaq: fnsr ) after a...,0.035844,-1.0,2017-12-14 18:30:00


In [4]:
# paragraph = df['text']
# print(type(paragraph))

In [5]:
# Data Preparation:
# Extract the main text content, sentiment scores, and labels from the dataframe.
# 'paragraph' will be used for text analysis, 'sentiments' as an additional feature, and 'labels' as the target for classification.
df['text'] = df['text'].fillna('')  # Replacing NaN values with empty strings.

paragraph = df['text']
sentiments = df['sentiment']
labels = df['label']

# Text Vectorization:
# Initialize a CountVectorizer to transform the text paragraphs into a sparse matrix of token counts.
# Configure it to ignore terms that have a document frequency strictly higher than 80% and lower than 2%,
# remove stop words, and capture bi-grams to consider phrases as well.
vectCount = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stopWords, ngram_range=(2,2))
X = vectCount.fit_transform(paragraph)  # Apply transformation to the paragraph data.

# TF-IDF Transformation:
# Initialize a TfidfTransformer to convert the sparse matrix of counts to a normalized tf or tf-idf representation,
# making the value of each word proportional to its importance in the documents.
TransformTfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = TransformTfidf.fit_transform(X).toarray()  # Apply TF-IDF transformation.

# Dimensionality Reduction:
# Apply PCA to reduce the transformed data to 3 principal components,
# aiming to retain the most important variance features while reducing the dataset's dimensionality.
pca = PCA(n_components=3)
pcaComponents = pca.fit_transform(X)  # Fit and transform the TF-IDF data.

# Feature Augmentation:
# Combine the PCA components with sentiment scores to form the final feature set.
# This step enhances the feature set by adding sentiment analysis results as an additional dimension.
XTrainFinal = np.hstack((pcaComponents, np.atleast_2d(sentiments).T))

# Data Splitting:
# Split the enhanced feature set and labels into training and testing sets.
# Use 70% of the data for training and 30% for testing, with a random state for reproducibility.
XTrain, XTest, yTrain, yTest = train_test_split(XTrainFinal, labels, test_size=0.3, random_state=15)

# Feature Scaling:
# Initialize and apply MinMaxScaler to scale the features of the training and testing data to a [0, 1] range.
# Scaling is crucial for models sensitive to the magnitude of input features, ensuring equal contribution to the model's learning process.
minmaxScaler = preprocessing.MinMaxScaler()
scaledXTrain = minmaxScaler.fit_transform(XTrain)  # Scale the training data features.
scaledXTest = minmaxScaler.fit_transform(XTest)    # Scale the testing data features.


In [6]:
# Model Initialization and Training:
# Initialize the SVM model using the Nu-Support Vector Classification algorithm with probability estimates enabled.
# This configuration is suitable for binary classification tasks or multi-class classification on a dataset.
# The model is then trained on the scaled training dataset along with the corresponding training labels.
SVMModel = svm.NuSVC(probability=True)
SVMModel.fit(scaledXTrain, yTrain)

# Prediction:
# After training, the model is used to predict the labels for both the training and testing datasets.
# This allows us to evaluate the model's performance on both seen (training) and unseen (testing) data.
yTrainPred = SVMModel.predict(scaledXTrain)
yTestPred = SVMModel.predict(scaledXTest)

# Accuracy Calculation:
# Calculate the accuracy of the model on both the training and testing sets.
# Accuracy is the proportion of true results (both true positives and true negatives) in the total number of cases examined.
# It provides a measure of how well the model performs overall across all classes.
trainingAccuracy = accuracy_score(yTrain, yTrainPred)
testAccuracy = accuracy_score(yTest, yTestPred)

# Precision and Recall Calculation:
# Precision is the ratio of true positives to the sum of true and false positives.
# It indicates the accuracy of positive predictions made by the model.
precisionScore = precision_score(yTest, yTestPred)

# Recall (or sensitivity) is the ratio of true positives to the sum of true positives and false negatives.
# It measures the model's ability to detect positive instances among the actual positives.
recallScore = recall_score(yTest, yTestPred)

# Output:
# Print the calculated metrics to evaluate and compare the model's performance on the training and testing sets.
# High precision and recall scores indicate a model that is both accurate and reliable in its positive classifications.
print(f'Training accuracy: {trainingAccuracy:.2f}')
print(f'Testing accuracy: {testAccuracy:.2f}')
print(f'Precision: {precisionScore:.2f}')
print(f'Recall: {recallScore:.2f}')


Training accuracy: 0.53
Testing accuracy: 0.49
Precision: 0.52
Recall: 0.61


In [7]:
# Data Extraction:
# Extract text content and corresponding labels from the dataframe to use as input data and target labels respectively.
paragraph = df['text']
labels = df['label']

# Text Vectorization:
# Initialize CountVectorizer to convert the collection of text documents into a matrix of token counts,
# filtering out terms that appear too frequently or too infrequently across the documents.
# This setup also focuses on bi-grams to capture two-word combinations, enriching the feature set.
vectCount = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stopWords, ngram_range=(2,2))
X = vectCount.fit_transform(paragraph)  # Transform 'paragraph' into a matrix of bi-gram counts.

# TF-IDF Transformation:
# Initialize TfidfTransformer to convert the raw frequency counts into TF-IDF values,
# providing a reflection of the importance of words in the context of the entire dataset.
# The transformation to a dense array format prepares data for dimensionality reduction.
TransformTfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = TransformTfidf.fit_transform(X).toarray()  # Apply TF-IDF transformation.

# Dimensionality Reduction with PCA:
# Apply Principal Component Analysis (PCA) to reduce the dimensionality of the feature set to 3 principal components.
# This step simplifies the dataset, aiming to retain the most significant variance while reducing the complexity.
pca = PCA(n_components=3)
XTrainFinal = pca.fit_transform(X)  # Fit and transform the TF-IDF matrix to 3 dimensions.

# Dataset Splitting:
# Split the processed data into training and testing sets to evaluate the model's performance on unseen data.
# A test size of 30% provides a substantial amount of data for both training and evaluation, with a random state for reproducibility.
XTrain, XTest, yTrain, yTest = train_test_split(XTrainFinal, labels, test_size=0.3, random_state=15)

# Feature Scaling:
# Initialize MinMaxScaler to scale the features to a specified range (default is [0, 1]).
# Scaling is crucial for many algorithms that are sensitive to the magnitude of the feature values.
minmaxScaler = preprocessing.MinMaxScaler()
scaledXTrain = minmaxScaler.fit_transform(XTrain)  # Scale training features.
scaledXTest = minmaxScaler.fit_transform(XTest)    # Scale testing features.


In [8]:
# Model Training:
# Initialize a Support Vector Machine model specifically using the NuSVC algorithm, 
# which is similar to SVC but uses a parameter to control the number of support vectors.
# The 'probability=True' argument enables probability estimates for class predictions.
# The model is then trained on the preprocessed and scaled training data along with their corresponding labels.
SVMModel = svm.NuSVC(probability=True)
SVMModel.fit(scaledXTrain, yTrain)

# Prediction:
# With the model trained, use it to make predictions on both the training set (to evaluate overfitting)
# and the test set (to evaluate the model's generalization performance).
yTrainPred = SVMModel.predict(scaledXTrain)  # Predictions on training data.
yTestPred = SVMModel.predict(scaledXTest)    # Predictions on test data.

# Performance Evaluation:
# Calculate the accuracy of the model on both the training data and the test data.
# Accuracy is the proportion of correctly predicted observations to the total observations and gives
# a general idea of how often the model is correct.
trainingAccuracy = accuracy_score(yTrain, yTrainPred)  # Training accuracy.
testAccuracy = accuracy_score(yTest, yTestPred)        # Test accuracy.

# Precision and Recall:
# Calculate precision and recall scores to further evaluate the model.
# Precision (the proportion of positive identifications that were actually correct) assesses the model's result relevancy,
# while recall (the proportion of actual positives that were identified correctly) assesses the model's ability to find all relevant instances.
precisionScore = precision_score(yTest, yTestPred)  # Precision score.
recallScore = recall_score(yTest, yTestPred)        # Recall score.

# Results Output:
# Print the calculated metrics to provide insights into the model's performance.
# These metrics together give a more nuanced view of the model's effectiveness beyond simple accuracy.
print(f'Training accuracy: {trainingAccuracy:.2f}')  # Displays training accuracy as a percentage rounded to two decimal places.
print(f'Testing accuracy: {testAccuracy:.2f}')       # Displays testing accuracy as a percentage.
print(f'Precision: {precisionScore:.2f}')            # Displays precision score.
print(f'Recall: {recallScore:.2f}')                  # Displays recall score.


Training accuracy: 0.54
Testing accuracy: 0.51
Precision: 0.54
Recall: 0.46
