In [1]:
# Train Logistic Regression Model
# Import libraries
import nltk
import pandas as pd
import numpy as np
nltk.download("popular")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/pranavmittal/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.

In [0]:
# read csv
dataFrame = pd.read_csv('./final1440_aapl.csv')

In [0]:
# display dataframe
dataFrame

Unnamed: 0.1,Unnamed: 0,text,sentiment,label,stock_time
0,0,2018 nape global busi confer held februari 7 2...,0.024667,1.0,2017-12-08 05:00:00
1,1,725 s8 25 iphon 8 perhap ngalaxi s8 best smart...,0.091103,-1.0,2017-12-11 05:00:00
2,2,web publish would wise rememb arent kind bad a...,0.016494,1.0,2017-12-12 05:00:00
3,3,fact cobalt mine mainli byproduct metal copper...,0.013467,-1.0,2017-12-13 05:00:00
4,4,snap nyse snap cramer fan cramer trust doesnt ...,0.031292,1.0,2017-12-15 05:00:00
...,...,...,...,...,...
281,281,2019 tast year got scorcher blaze hot melbourn...,0.012952,1.0,2019-01-28 05:00:00
282,282,aapl topic sever report compani offer iphon li...,0.011818,1.0,2019-01-29 05:00:00
283,283,httpswwwyoutubecomwatchv6enwkf1sqog tag aapl a...,0.009776,1.0,2019-01-30 05:00:00
284,284,compani offer iphon line smartphon ipad line m...,0.009638,1.0,2019-01-31 05:00:00


In [0]:
# create dataframe and extract text and sentiment
df_text = df['text']
df_sentiment = df['sentiment']
df_label_score = df['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine freqency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams and sentiment

pca_data = PCA(n_components=3)
pcaComponent = pca_data.fit_transform(X_count)

# combining pca and sentiment
lastXTraining = np.hstack((pcaComponent, np.atleast_2d(df_sentiment).T))

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=23)

# minMax scale the data
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.fit_transform(X_test)

In [0]:
# evaluation

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))

# print f-1 score
f1_score_pred = f1_score(y_test, Y_test_prediction)
print('Score for f-1 score: {0}'.format(f1_score_pred))

# print classification report
classify_report = classification_report(y_test, Y_test_prediction)
print('Report for classification: {0}'.format(classify_report))

Training accuracy is 0.535
Testing accuracy is 0.6046511627906976
F1 score is 0.7068965517241379
Classifcation report:               precision    recall  f1-score   support

        -1.0       0.39      0.39      0.39        28
         1.0       0.71      0.71      0.71        58

    accuracy                           0.60        86
   macro avg       0.55      0.55      0.55        86
weighted avg       0.60      0.60      0.60        86



In [0]:
# train with ngrams
df_text = df['text']
df_label_score = df['label']

# Using CountVectorizer to generate ngrams
# generate ngrams for 2,2
cv_ngram = CountVectorizer(max_df=0.8, min_df=0.02, stop_words=stop_words, ngram_range=(2,2))
X_count = cv_ngram.fit_transform(df_text)

# TfIdf transform to determine freqency
# add smoothing to avoid zero
tfTransform = TfidfTransformer(smooth_idf=True, use_idf=True)
X_count = tfTransform.fit_transform(X_count).toarray()

# pca dataset for ngrams and sentiment
pca_data = PCA(n_components=3)
lastXTraining = pca_data.fit_transform(X_count)

# train and test split
X_train, X_test, y_train, y_test = train_test_split(lastXTraining, df_label_score, test_size=0.3, random_state=23)

# minMax scale the data
minMaxPreprocess = preprocessing.MinMaxScaler()
min_max_x_train = minMaxPreprocess.fit_transform(X_train)
min_max_x_test = minMaxPreprocess.fit_transform(X_test)

In [0]:
# evaluation

# logistic regression model
logistic_regression_model = LogisticRegression(random_state=10)
logistic_regression_model.fit(min_max_x_train,y_train)

# train and test prediction
Y_train_prediction = logistic_regression_model.predict(min_max_x_train)
Y_test_prediction = logistic_regression_model.predict(min_max_x_test)

# print train and test accuracy
trainingAccuracy_Score = accuracy_score(y_train, Y_train_prediction)
print('Accuracy on training data is: {0}'.format(trainingAccuracy_Score))
testAccuracy_Score = accuracy_score(y_test, Y_test_prediction)
print('Accuracy on test data is: {0}'.format(testAccuracy_Score))

#print f-1 score
f1_score_pred = f1_score(y_test, Y_test_prediction)
print('Score for f-1 score: {0}'.format(f1_score_pred))

#print classification report
classify_report = classification_report(y_test, Y_test_prediction)
print('Report for classification: {0}'.format(classify_report))

Training accuracy is 0.5
Testing accuracy is 0.46511627906976744
F1 score is 0.5
Classifcation report:               precision    recall  f1-score   support

        -1.0       0.33      0.61      0.42        28
         1.0       0.68      0.40      0.50        58

    accuracy                           0.47        86
   macro avg       0.50      0.50      0.46        86
weighted avg       0.56      0.47      0.48        86

