In [0]:
import nltk
import pandas as pd
import numpy as np
nltk.download("popular")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [0]:
df = pd.read_csv('./final1440_aapl.csv')

In [0]:
df

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,2018 nape global busi confer held februari 7 2...,0.024667,1.0,2017-12-08 05:00:00
1,1,725 s8 25 iphon 8 perhap ngalaxi s8 best smart...,0.091103,-1.0,2017-12-11 05:00:00
2,2,web publish would wise rememb arent kind bad a...,0.016494,1.0,2017-12-12 05:00:00
3,3,fact cobalt mine mainli byproduct metal copper...,0.013467,-1.0,2017-12-13 05:00:00
4,4,snap nyse snap cramer fan cramer trust doesnt ...,0.031292,1.0,2017-12-15 05:00:00
...,...,...,...,...,...
281,281,2019 tast year got scorcher blaze hot melbourn...,0.012952,1.0,2019-01-28 05:00:00
282,282,aapl topic sever report compani offer iphon li...,0.011818,1.0,2019-01-29 05:00:00
283,283,httpswwwyoutubecomwatchv6enwkf1sqog tag aapl a...,0.009776,1.0,2019-01-30 05:00:00
284,284,compani offer iphon line smartphon ipad line m...,0.009638,1.0,2019-01-31 05:00:00


In [0]:
# train with text and sentiment
text = df['text']
sent = df['sentiment']
labels = df['label']

# generate ngrams = 2,2 or 2,3 with Count Vectorizer
count_vectorizer = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
X = count_vectorizer.fit_transform(text)

# use TfIdf to determine freq
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = tfidf.fit_transform(X).toarray()

# use PCA to generate final text dataset
pca = PCA(n_components=3)
pca_comp = pca.fit_transform(X)

# join text and sentiment
finalXtrain = np.hstack((pca_comp, np.atleast_2d(sent).T))

# create train and test split
X_train, X_test, y_train, y_test = train_test_split(finalXtrain, labels, test_size=0.3, random_state=23)

# scale the data
minmaxScaler = preprocessing.MinMaxScaler()
scaled_x_train = minmaxScaler.fit_transform(X_train)
scaled_x_test = minmaxScaler.fit_transform(X_test)

In [0]:
# evaluation for ngrams and sentiment

# create and train LR model
lr_model = LogisticRegression(random_state=10)
lr_model.fit(scaled_x_train,y_train)

# generate prediction
y_train_pred = lr_model.predict(scaled_x_train)
y_test_pred = lr_model.predict(scaled_x_test)

# display train and test accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Training accuracy is {0}'.format(train_accuracy))
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Testing accuracy is {0}'.format(test_accuracy))

# display f-1 score
f1score = f1_score(y_test, y_test_pred)
print('F1 score is {0}'.format(f1score))

# display classification report
report = classification_report(y_test, y_test_pred)
print('Classifcation report: {0}'.format(report))

Training accuracy is 0.535
Testing accuracy is 0.6046511627906976
F1 score is 0.7068965517241379
Classifcation report:               precision    recall  f1-score   support

        -1.0       0.39      0.39      0.39        28
         1.0       0.71      0.71      0.71        58

    accuracy                           0.60        86
   macro avg       0.55      0.55      0.55        86
weighted avg       0.60      0.60      0.60        86



In [0]:
# train with ngrams
text = df['text']
labels = df['label']

# generate ngrams = 2,2 or 2,3 with Count Vectorizer
count_vectorizer = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
X = count_vectorizer.fit_transform(text)

# use TfIdf to determine freq
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
X = tfidf.fit_transform(X).toarray()

# use PCA to generate final text dataset
pca = PCA(n_components=3)
finalXtrain = pca.fit_transform(X)

# create train and test split
X_train, X_test, y_train, y_test = train_test_split(finalXtrain, labels, test_size=0.3, random_state=23)

# scale the data
minmaxScaler = preprocessing.MinMaxScaler()
scaled_x_train = minmaxScaler.fit_transform(X_train)
scaled_x_test = minmaxScaler.fit_transform(X_test)

In [0]:
# evaluation for ngrams

# create and train LR model
lr_model = LogisticRegression(random_state=10)
lr_model.fit(scaled_x_train,y_train)

# generate prediction
y_train_pred = lr_model.predict(scaled_x_train)
y_test_pred = lr_model.predict(scaled_x_test)

# display train and test accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Training accuracy is {0}'.format(train_accuracy))
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Testing accuracy is {0}'.format(test_accuracy))

# display f-1 score
f1score = f1_score(y_test, y_test_pred)
print('F1 score is {0}'.format(f1score))

# display classification report
report = classification_report(y_test, y_test_pred)
print('Classifcation report: {0}'.format(report))

Training accuracy is 0.5
Testing accuracy is 0.46511627906976744
F1 score is 0.5
Classifcation report:               precision    recall  f1-score   support

        -1.0       0.33      0.61      0.42        28
         1.0       0.68      0.40      0.50        58

    accuracy                           0.47        86
   macro avg       0.50      0.50      0.46        86
weighted avg       0.56      0.47      0.48        86

