In [1]:
# Train Logistic Regression Model
# Import libraries
import nltk
import pandas as pd
import numpy as np
# fetch the NLTK 'popular' collection; the stopwords corpus read at the end of
# this cell is expected to be part of it
nltk.download("popular")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
# English stop words as a set; later cells hand this to CountVectorizer
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    

In [0]:
# load the dataset; the path is resolved relative to the current working directory
dataFrame = pd.read_csv(filepath_or_buffer='./final60_amzn.csv')

In [7]:
# display dataframe
# (a bare name as the last expression makes the notebook render it as the cell output)
dataFrame

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,like face antitrust review doj run trump appoi...,0.015654,1.0,2017-12-15 18:30:00
1,1,help explain run switch app feel much slower w...,0.022823,1.0,2017-12-21 18:30:00
2,2,china new cybersecur law pass last june requir...,0.002677,-1.0,2017-12-22 00:30:00
3,3,downsid offici support googl play movi tv app ...,0.032610,-1.0,2017-12-22 21:30:00
4,4,new year find tesla almost immedi pressur 2018...,0.064034,1.0,2017-12-27 00:30:00
...,...,...,...,...,...
2038,2038,make sure check today friday finish line hear ...,0.027635,-1.0,2019-02-01 19:30:00
2039,2039,2017 share amazon ran 560 appl return 485 sp 5...,0.021235,1.0,2019-02-01 20:30:00
2040,2040,5 top stock trade first thing monday acb amzn ...,0.014997,-1.0,2019-02-01 21:30:00
2041,2041,top alphabet compet like appl aapl hardwar mar...,0.022944,-1.0,2019-02-01 23:30:00


In [0]:
# build the model inputs: bigram TF-IDF features reduced with PCA, plus the sentiment score
# extract text, sentiment and label columns
df_text = dataFrame['text']
df_sentiment = dataFrame['sentiment']
df_label_score = dataFrame['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2 (bigrams only)
# stop_words is handed over as a sorted list: CountVectorizer documents
# 'english', a list or None, and newer scikit-learn releases reject a set
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=sorted(stop_words), ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine frequency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams
# NOTE(review): the vectorizer, TF-IDF and PCA are all fitted on the full dataset
# before the split below; consider fitting them on the training rows only
pca_data = PCA(n_components=3)
pcaComponent = pca_data.fit_transform(X_count)

# combining pca and sentiment (sentiment becomes the last feature column)
lastXTraining = np.hstack((pcaComponent, np.atleast_2d(df_sentiment).T))

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=15)

# minMax scale the data
# the scaler is fitted on the training split only and then re-used for the test
# split, so both are mapped with the same min/max (test values may fall slightly
# outside [0, 1]); re-fitting on the test split would scale it differently
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.transform(X_test)

In [9]:
# evaluation
# fit logistic regression on the scaled training features and report
# accuracy (train/test), precision and recall (test)

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))

# print precision
# the result gets its own name so the imported precision_score function is not
# overwritten; otherwise re-running this cell raises a TypeError
precision_Score = precision_score(y_test, Y_test_prediction)
print('Score of Precision: {0}'.format(precision_Score))

#print recall
recall_Score = recall_score(y_test, Y_test_prediction)
print('Score of Recall: {0}'.format(recall_Score))

Training accuracy is 0.5559440559440559
Testing accuracy is 0.5089722675367048
Precision: 0.5089722675367048
Recall: 1.0


In [0]:
# train with ngrams
# build the model inputs from bigram TF-IDF features reduced with PCA (no sentiment column)
df_text = dataFrame['text']
df_label_score = dataFrame['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2 (bigrams only)
# stop_words is handed over as a sorted list: CountVectorizer documents
# 'english', a list or None, and newer scikit-learn releases reject a set
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=sorted(stop_words), ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine frequency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams only; the three components are the full feature matrix
# NOTE(review): the vectorizer, TF-IDF and PCA are all fitted on the full dataset
# before the split below; consider fitting them on the training rows only
pca_data = PCA(n_components=3)
lastXTraining = pca_data.fit_transform(X_count)

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=15)

# minMax scale the data
# the scaler is fitted on the training split only and then re-used for the test
# split, so both are mapped with the same min/max (test values may fall slightly
# outside [0, 1]); re-fitting on the test split would scale it differently
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.transform(X_test)

In [11]:
# evaluation
# fit logistic regression on the scaled training features and report
# accuracy (train/test), precision and recall (test)

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))

# print precision
# the result gets its own name so the imported precision_score function is not
# overwritten; otherwise re-running this cell raises a TypeError
precision_Score = precision_score(y_test, Y_test_prediction)
print('Score of Precision: {0}'.format(precision_Score))

# print recall
recall_Score = recall_score(y_test, Y_test_prediction)
print('Score of Recall: {0}'.format(recall_Score))

Training accuracy is 0.5559440559440559
Testing accuracy is 0.5089722675367048
Precision: 0.5089722675367048
Recall: 1.0
