In [1]:
# Train Logistic Regression Model
# Import libraries
import nltk
import pandas as pd
import numpy as np
nltk.download("popular")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    

In [2]:
# read csv
dataFrame = pd.read_csv('./final1440_aapl.csv')

In [3]:
# display dataframe
dataFrame

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,nape summit week will also feature the annual ...,0.050660,1.0,2017-12-08 05:00:00
1,1,from apple’s hugely anticipated iphone x to sa...,0.107128,-1.0,2017-12-11 05:00:00
2,2,"“they are not just the largest browser, but th...",0.034394,1.0,2017-12-12 05:00:00
3,3,the humanitarian crisis in the drc has placed ...,0.000000,-1.0,2017-12-13 05:00:00
4,4,cramer prefers finisar (nasdaq: fnsr ) after a...,0.032563,1.0,2017-12-15 05:00:00
...,...,...,...,...,...
281,281,"please try advanced search"", ""didyoumeantext"":...",0.017092,1.0,2019-01-28 05:00:00
282,282,"posted by darlene league on jan 28th, 2019 // ...",0.016114,1.0,2019-01-29 05:00:00
283,283,https://www.youtube.com/watch?v=6enwkf1sqog ta...,0.015341,1.0,2019-01-30 05:00:00
284,284,"r. edwardson on jan 30th, 2019 // no comments ...",0.016514,1.0,2019-01-31 05:00:00


In [4]:
# create dataframe and extract text and sentiment
df_text = dataFrame['text']
df_sentiment = dataFrame['sentiment']
df_label_score = dataFrame['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2
# cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
# X_count = cv_ngram.fit_transform(df_text)
stop_words_list = list(stop_words)  # Convert the set to a list

# Now, use stop_words_list instead of stop_words
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words_list, ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine freqency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams and sentiment

pca_data = PCA(n_components=3)
pcaComponent = pca_data.fit_transform(X_count)

# combining pca and sentiment
lastXTraining = np.hstack((pcaComponent, np.atleast_2d(df_sentiment).T))

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=23)

# minMax scale the data
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.fit_transform(X_test)

In [5]:
# evaluation

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))

# print f-1 score
f1_score_pred = f1_score(y_test, Y_test_prediction)
print('Score for f-1 score: {0}'.format(f1_score_pred))

# print classification report
classify_report = classification_report(y_test, Y_test_prediction)
print('Report for classification: {0}'.format(classify_report))

Accuracy on training data is: 0.545
Accuracy on test data is: 0.5930232558139535
Score for f-1 score: 0.6788990825688074
Report for classification:               precision    recall  f1-score   support

        -1.0       0.40      0.50      0.44        28
         1.0       0.73      0.64      0.68        58

    accuracy                           0.59        86
   macro avg       0.56      0.57      0.56        86
weighted avg       0.62      0.59      0.60        86



In [6]:
# train with ngrams
df_text = dataFrame['text']
df_label_score = dataFrame['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2
# cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
# X_count = cv_ngram.fit_transform(df_text)
stop_words_list = list(stop_words)  # Convert the set to a list

# Now, use stop_words_list instead of stop_words
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words_list, ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine freqency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams and sentiment
pca_data = PCA(n_components=3)
lastXTraining = pca_data.fit_transform(X_count)

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=23)

# minMax scale the data
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.fit_transform(X_test)

In [7]:
# evaluation

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))

#print f-1 score
f1_score_pred = f1_score(y_test, Y_test_prediction)
print('Score for f-1 score: {0}'.format(f1_score_pred))

#print classification report
classify_report = classification_report(y_test, Y_test_prediction)
print('Report for classification: {0}'.format(classify_report))

Accuracy on training data is: 0.51
Accuracy on test data is: 0.46511627906976744
Score for f-1 score: 0.4888888888888889
Report for classification:               precision    recall  f1-score   support

        -1.0       0.33      0.64      0.44        28
         1.0       0.69      0.38      0.49        58

    accuracy                           0.47        86
   macro avg       0.51      0.51      0.46        86
weighted avg       0.57      0.47      0.47        86

