In [1]:
import nltk
import pandas as pd
import numpy as np
nltk.download("popular")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [0]:
# load appropriate csv file for time interval
df = pd.read_csv('./final60_amzn.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,like face antitrust review doj run trump appoi...,0.015654,1.0,2017-12-15 18:30:00
1,1,help explain run switch app feel much slower w...,0.022823,1.0,2017-12-21 18:30:00
2,2,china new cybersecur law pass last june requir...,0.002677,-1.0,2017-12-22 00:30:00
3,3,downsid offici support googl play movi tv app ...,0.032610,-1.0,2017-12-22 21:30:00
4,4,new year find tesla almost immedi pressur 2018...,0.064034,1.0,2017-12-27 00:30:00
...,...,...,...,...,...
2038,2038,make sure check today friday finish line hear ...,0.027635,-1.0,2019-02-01 19:30:00
2039,2039,2017 share amazon ran 560 appl return 485 sp 5...,0.021235,1.0,2019-02-01 20:30:00
2040,2040,5 top stock trade first thing monday acb amzn ...,0.014997,-1.0,2019-02-01 21:30:00
2041,2041,top alphabet compet like appl aapl hardwar mar...,0.022944,-1.0,2019-02-01 23:30:00


In [0]:
# train with ngrams and sentiment
text = df['text']
sent = df['sentiment']
labels = df['label']

# generate ngrams = 2,2 or 2,3 with Count Vectorizer
count_vectorizer = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
X = count_vectorizer.fit_transform(text)

# use TfIdf to determine freq
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = tfidf.fit_transform(X).toarray()

# use PCA to generate final text dataset
pca = PCA(n_components=3)
pca_comp = pca.fit_transform(X)

# join text and sentiment
finalXtrain = np.hstack((pca_comp, np.atleast_2d(sent).T))

# create train and test split
X_train, X_test, y_train, y_test = train_test_split(finalXtrain, labels, test_size=0.3, random_state=15)

# scale the data
minmaxScaler = preprocessing.MinMaxScaler()
scaled_x_train = minmaxScaler.fit_transform(X_train)
scaled_x_test = minmaxScaler.fit_transform(X_test)

In [5]:
# evaluation for ngrams and sentiment

# create and train SVM model
svm_model = svm.NuSVC(probability=True)
svm_model.fit(scaled_x_train, y_train)

# generate prediction
y_train_pred = svm_model.predict(scaled_x_train)
y_test_pred = svm_model.predict(scaled_x_test)

# display train and test accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Training accuracy is {0}'.format(train_accuracy))
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Testing accuracy is {0}'.format(test_accuracy))

# display precision
prec_score = precision_score(y_test, y_test_pred)
print('Precision: {0}'.format(prec_score))

# display recall
rec_score = recall_score(y_test, y_test_pred)
print('Recall: {0}'.format(rec_score))

Training accuracy is 0.5328671328671328
Testing accuracy is 0.5073409461663948
Precision: 0.5609756097560976
Recall: 0.14743589743589744


In [0]:
# train with ngrams only
text = df['text']
labels = df['label']

# generate ngrams = 2,2 or 2,3 with Count Vectorizer
count_vectorizer = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
X = count_vectorizer.fit_transform(text)

# use TfIdf to determine freq
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = tfidf.fit_transform(X).toarray()

# use PCA to generate final text dataset
pca = PCA(n_components=3)
finalXtrain = pca.fit_transform(X)

# create train and test split
X_train, X_test, y_train, y_test = train_test_split(finalXtrain, labels, test_size=0.3, random_state=15)

# scale the data
minmaxScaler = preprocessing.MinMaxScaler()
scaled_x_train = minmaxScaler.fit_transform(X_train)
scaled_x_test = minmaxScaler.fit_transform(X_test)

In [7]:
# evaluation for ngrams

# create and train SVM model
svm_model = svm.NuSVC(probability=True)
svm_model.fit(scaled_x_train, y_train)

# generate prediction
y_train_pred = svm_model.predict(scaled_x_train)
y_test_pred = svm_model.predict(scaled_x_test)

# display train and test accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Training accuracy is {0}'.format(train_accuracy))
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Testing accuracy is {0}'.format(test_accuracy))

# display precision
prec_score = precision_score(y_test, y_test_pred)
print('Precision: {0}'.format(prec_score))

# display recall
rec_score = recall_score(y_test, y_test_pred)
print('Recall: {0}'.format(rec_score))

Training accuracy is 0.5496503496503496
Testing accuracy is 0.5122349102773246
Precision: 0.5140388768898488
Recall: 0.7628205128205128
