In [1]:
# Train Logistic Regression Model
# Import libraries
import nltk
import pandas as pd
import numpy as np
nltk.download("popular")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    

In [2]:
dataFrame = pd.read_csv('./final60_amzn.csv')

In [3]:
dataFrame

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,nape summit week will also feature the annual ...,0.050660,-1.0,2017-12-07 20:00:00
1,1,from apples hugely anticipated iphone x to sam...,0.107128,1.0,2017-12-08 22:00:00
2,2,"they are not just the largest browser, but the...",0.034394,1.0,2017-12-12 02:00:00
3,3,the humanitarian crisis in the drc has placed ...,0.000000,-1.0,2017-12-12 22:30:00
4,4,cramer prefers finisar (nasdaq: fnsr ) after a...,0.035844,-1.0,2017-12-14 18:30:00
...,...,...,...,...,...
4247,4247,this report just confirmed apple's china probl...,0.013567,1.0,2019-02-01 22:30:00
4248,4248,apple's (nasdaq: aapl ) spectacular rise and f...,0.017423,-1.0,2019-02-01 23:00:00
4249,4249,"so, lets see what to expect from the entertain...",0.029321,-1.0,2019-02-01 23:30:00
4250,4250,"investorplace.com published on january 30, 20...",0.025605,-1.0,2019-02-02 00:00:00


In [8]:
# create dataframe and extract text and sentiment
df_text = dataFrame['text']
df_sentiment = dataFrame['sentiment']
df_label_score = dataFrame['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2
# cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
# X_count = cv_ngram.fit_transform(df_text)
stop_words_list = list(stop_words)  # Convert the set to a list
# Directly fill NaN values for the Series
df_text = df_text.fillna('')

# Now, use stop_words_list instead of stop_words
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words_list, ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine freqency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams and sentiment

pca_data = PCA(n_components=3)
pcaComponent = pca_data.fit_transform(X_count)

# combining pca and sentiment
lastXTraining = np.hstack((pcaComponent, np.atleast_2d(df_sentiment).T))

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=23)

# minMax scale the data
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.fit_transform(X_test)

In [10]:

# evaluation

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))



# display precision
prec_score = precision_score(y_test, Y_test_prediction)
print('Precision: {0}'.format(prec_score))

# display recall
rec_score = recall_score(y_test, Y_test_prediction)
print('Recall: {0}'.format(rec_score))

Accuracy on training data is: 0.5164650537634409
Accuracy on test data is: 0.5329153605015674
Precision: 0.5383928571428571
Recall: 0.8841642228739003


In [12]:

# train with ngrams
df_text = dataFrame['text']
df_label_score = dataFrame['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2
# cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
# X_count = cv_ngram.fit_transform(df_text)
stop_words_list = list(stop_words)  # Convert the set to a list
# Directly fill NaN values for the Series
df_text = df_text.fillna('')

# Now, use stop_words_list instead of stop_words
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words_list, ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine freqency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams and sentiment
pca_data = PCA(n_components=3)
lastXTraining = pca_data.fit_transform(X_count)

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=23)

# minMax scale the data
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.fit_transform(X_test)

In [13]:

# evaluation

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))

# display precision
prec_score = precision_score(y_test, Y_test_prediction)
print('Precision: {0}'.format(prec_score))

# display recall
rec_score = recall_score(y_test, Y_test_prediction)
print('Recall: {0}'.format(rec_score))

Accuracy on training data is: 0.5127688172043011
Accuracy on test data is: 0.5470219435736677
Precision: 0.5866666666666667
Recall: 0.5161290322580645
