## Imports

In [None]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from sklearn.model_selection import train_test_split
import re


In [None]:
# Authenticate with Google Drive. Colab-only: this cell depends on `google.colab`.
# Installs/upgrades PyDrive quietly. NOTE(review): the version is not pinned.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Step 1: run the Colab user-authentication flow.
auth.authenticate_user()
# Step 2: create a PyDrive auth object and attach the application-default credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client handle the data-loading cell uses to fetch files by ID.
drive = GoogleDrive(gauth)

In [None]:
# Import files
#
# Alternative inputs kept for reference (disabled):
#   https://drive.google.com/file/d/1-1NBP1RoEpbKERjssgexoMm7v3F1sMml/view?usp=sharing
#   https://drive.google.com/file/d/1-1-NxTeGh9HktAvcYMDLuCe0jJK6Cngb/view?usp=sharing
#   id = '1-1-NxTeGh9HktAvcYMDLuCe0jJK6Cngb'
#   downloaded = drive.CreateFile({'id':id})
#   downloaded.GetContentFile('Combined_FAANG_percentage_2.2.csv')
#   df = pd.read_csv('Combined_FAANG_percentage_2.2.csv', sep=',')
#
# Active input:
#   https://drive.google.com/file/d/1-4wGVlhCObAoAM_DOLL3D4YsJhOb1ZSj/view?usp=sharing
id = '1-4wGVlhCObAoAM_DOLL3D4YsJhOb1ZSj'
csv_name = 'Combined_FAANG_binary_previous.csv'

# Fetch the Drive file into the local working directory under `csv_name`.
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile(csv_name)

# Read the CSV and discard rows whose 'message' is missing.
df = pd.read_csv(csv_name).dropna(subset=['message'])
df.head()

Unnamed: 0,symbol,message,datetime,user,message_id,Date,Time,label
0,AAPL,qq next 60min confirm start rally aapl coming ...,2015-12-21 18:37:24,191996.0,47148173.0,2015-12-21,18:37:24,1
1,AAPL,aapl watching gap fill 169 20,2018-11-24 07:02:32,1665234.0,146068732.0,2018-11-24,07:02:32,1
2,AAPL,aapl weekly options gamblers lose,2014-07-22 21:48:13,71738.0,24904954.0,2014-07-22,21:48:13,1
3,AAPL,aapl,2020-01-27 07:07:03,1229493.0,191978042.0,2020-01-27,07:07:03,0
4,AAPL,key levels watch aapl,2014-06-27 15:19:47,106412.0,24190263.0,2014-06-27,15:19:47,1


## Load Data

In [None]:
# Take 1 year of data from the complete data set: remove every row dated on
# or before the cutoff. 'Date' is compared as a string; the 'YYYY-MM-DD'
# format makes lexicographic order match chronological order.
cutoff_date = '2019-07-20'
on_or_before_cutoff = df['Date'] <= cutoff_date
# Negating the mask (rather than testing `>`) keeps rows with a missing Date,
# exactly as dropping only the matching rows would.
df = df[~on_or_before_cutoff]
df

Unnamed: 0,symbol,message,datetime,user,message_id,Date,Time,label
3,AAPL,aapl,2020-01-27 07:07:03,1229493.0,191978042.0,2020-01-27,07:07:03,0
8,AAPL,qq became euphoric calls exp week aiming ath f...,2020-05-13 02:13:00,2250451.0,212222428.0,2020-05-13,02:13:00,0
9,AAPL,spy novices like davey day trader lose money s...,2020-06-24 11:12:09,543250.0,222404886.0,2020-06-24,11:12:09,0
10,AAPL,today aapl shows buy signal ta short term tech...,2019-09-11 09:33:43,700679.0,176835918.0,2019-09-11,09:33:43,1
11,AAPL,aapl let melt begin,2020-01-03 16:46:16,741099.0,188910094.0,2020-01-03,16:46:16,0
...,...,...,...,...,...,...,...,...
2566840,NFLX,hope aside nyt putting also cnbc puts nflx take,2019-09-21 21:49:00,2807876.0,178253955.0,2019-09-21,21:49:00,0
2566841,NFLX,amd msft nflx everything beautiful 39 way,2020-03-06 06:06:03,911299.0,198589415.0,2020-03-06,06:06:03,0
2566848,NFLX,nflx full move key support,2020-03-12 16:52:43,677915.0,199933357.0,2020-03-12,16:52:43,0
2566849,NFLX,spy spx nflx nvda virtually volume today absen...,2019-10-14 18:16:28,55818.0,180328889.0,2019-10-14,18:16:28,1


In [None]:
# Use this only when running on the percentage file with 2 labels.
#
# In that file label 0 rows are discarded and label -1 is re-encoded as 0,
# leaving a binary 0/1 target. NOTE(review): presumably -1/0/1 mean
# negative/neutral/positive -- confirm against the data source.
#
# The guard makes the cell safe under "Restart & Run All": on a file that
# already has 0/1 labels (no -1 present) nothing is changed, whereas an
# unconditional drop of label 0 would delete one of the two classes.
if (df['label'] == -1).any():
    # Filter by mask and take an explicit copy so the assignment below writes
    # to an independent frame.
    df = df.loc[df['label'] != 0].copy()
    # Assign the result back: `df['label'].replace(..., inplace=True)` acts on
    # a temporary column selection and does not update `df` under pandas
    # Copy-on-Write.
    df['label'] = df['label'].replace({-1: 0})

In [None]:
# Class balance: number of remaining rows for each label value.
df["label"].value_counts()

1    283388
0    230965
Name: label, dtype: int64

In [None]:
# Shuffle all rows. The fixed seed keeps the row order identical across
# kernel restarts; without it the seeded train/test split that follows would
# still select different rows on every run, because its input order changes.
df = df.sample(frac=1, random_state=42)

## Train and predict

In [None]:
# Hold out 10% of the rows for testing. Features are the raw message text,
# the target is the label column; the seed fixes the split for a given row order.
x_train, x_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.10,
    random_state=42,
)

In [None]:
# Label distribution in the training split.
y_train.value_counts()

1    254835
0    208082
Name: label, dtype: int64

In [None]:
# Label distribution in the held-out test split.
y_test.value_counts()

1    28553
0    22883
Name: label, dtype: int64

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

# Hyperparameter grid for the grid search. Each key is `<step>__<param>`,
# where the step names ('vect', 'clf') are those of the text pipeline:
#   - vect__ngram_range: unigrams+bigrams vs. bigrams only
#   - clf__C:            regularisation setting of the classifier
#   - clf__penalty:      L1 vs. L2 penalty
tuned_parameters = dict(
    vect__ngram_range=[(1, 2), (2, 2)],
    clf__C=[0.01, 0.1, 1],
    clf__penalty=["l1", "l2"],
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
# Text classifier: token/n-gram counts fed into logistic regression.
# The step names 'vect' and 'clf' are the prefixes used by the keys of
# `tuned_parameters`. The 'liblinear' solver is used with both penalties
# ('l1' and 'l2') that appear in the grid.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', LogisticRegression(max_iter=5000, solver='liblinear'))])

# Model-selection metric: macro-averaged F1 (not accuracy).
score = 'f1_macro'

# 5-fold cross-validated search over every combination in `tuned_parameters`.
logmodel_cv = GridSearchCV(text_clf, tuned_parameters, cv=5, scoring=score)
logmodel_cv.fit(x_train, y_train)

print("tuned hyperparameters (best parameters):", logmodel_cv.best_params_)
# best_score_ is the mean cross-validated value of `scoring` for the best
# parameter setting, so it is labelled with the metric name, not "accuracy".
print("best cross-validation {}:".format(score), logmodel_cv.best_score_)
# GridSearchCV.score() also evaluates with `scoring`, so this is the macro F1
# of the refitted best model on the held-out test split.
print("test {}: {:.4f}".format(score, logmodel_cv.score(x_test, y_test)))

tuned hpyerparameters :(best parameters)  {'clf__C': 0.1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 2)}
accuracy : 0.5011453876713212
0.5065


In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out test split, using the
# predictions of the fitted grid-search model.
y_pred = logmodel_cv.predict(x_test)
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)

              precision    recall  f1-score   support

           0     0.5562    0.2265    0.3219     22883
           1     0.5798    0.8552    0.6910     28553

    accuracy                         0.5755     51436
   macro avg     0.5680    0.5408    0.5065     51436
weighted avg     0.5693    0.5755    0.5268     51436



In [None]:
# Parameter combination selected by the grid search.
logmodel_cv.best_params_

{'clf__C': 0.1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 2)}

In [None]:
# Full pipeline object chosen by the grid search, shown with all of its settings.
logmodel_cv.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=5000,
                                    multi_class='auto', n_jobs=None,
                         