In [None]:
# Mounting google drive
# Colab-specific: relies on the google.colab package and mounts the drive at
# /content/drive, the root of the data paths configured in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Remember to "mount drive" first :
# Click on the folder icon on the left, and the third icon from the left is the 
# mount drive function.

import datetime as dt
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# Defining directories
PATH_DATA      = "/content/drive/MyDrive/Text Mining Project IV Prediction/Data (Raw and processed data from python scripts)/"
RAW_DATA       = PATH_DATA + "Raw Data/"
PROCESSED_DATA = PATH_DATA + "Processed Data/"
COMPLETED_DATA = PATH_DATA + "Completed Data/"

# Importing data sets into dictionary
# Both dictionaries map a company key (derived from the file name) to the
# DataFrame read from that company's CSV.
tweets_dict = {}
HIV_dict = {}

# sorted() gives a deterministic load order regardless of how the directory
# listing happens to be returned.
for companycsv in sorted(os.listdir(PROCESSED_DATA)):
  # Skip anything that is not a CSV (sub-directories, checkpoints, ...) so
  # pd.read_csv is never pointed at a non-data entry.
  if not companycsv.endswith(".csv"):
    continue

  frame = pd.read_csv(PROCESSED_DATA + companycsv, index_col = 0)

  if companycsv.endswith("HIV.csv"):
    # Key = file name without its last 8 characters (separator + "HIV.csv").
    HIV_dict[companycsv[:-8]] = frame
  else:
    # Every other CSV is treated as a tweets file; the key is the file name
    # without its last 11 characters.
    # NOTE(review): presumably the suffix is "_tweets.csv" - confirm.
    tweets_dict[companycsv[:-11]] = frame

FileNotFoundError: ignored

In [1]:
# Display the output directory path. COMPLETED_DATA is defined in the
# configuration cell, which must have been run before this one.
COMPLETED_DATA

NameError: ignored

In [None]:
# ============================= CREATING OBJECT ============================== #
# ============================================================================ #
class DATA:
    """Per-company tweet and implied-volatility ("HIV") frames with helpers.

    Both constructor arguments are dictionaries mapping a company key to a
    pandas DataFrame. The methods read the columns "post_date" and "body"
    from the tweet frames, and "Date" plus the value column selected through
    ``cat`` (default "Implied Vol") from the HIV frames.
    """

    def __init__(self, tweets, HIV):
        """Store cleaned copies of the input frames.

        The caller's dictionaries and DataFrames are left untouched, so
        building the object again from the same inputs gives the same result
        (cleaning in place would strip another nine characters from
        "post_date" on every construction).

        Args:
            tweets: dict company -> DataFrame with a string "post_date"
                column; its last nine characters are removed.
                NOTE(review): presumably this is a " HH:MM:SS" suffix -
                confirm against the processed CSVs.
            HIV: dict company -> DataFrame with a string "Date" column; rows
                whose "Date" starts with "2020" are dropped.
        """
        # Removing time element in tweets only keeping dates
        self.tweets = {
            name: frame.assign(post_date = frame["post_date"].str[:-9])
            for name, frame in tweets.items()
        }

        # Removing data of year 2020 from HIV
        self.HIV = {
            name: frame[frame["Date"].str[:4] != "2020"].copy()
            for name, frame in HIV.items()
        }

    # =========================== Getting self df ============================ #
    # ======================================================================== #

    def get_tweets(self, company):
        """Return a copy of the tweet frame stored for ``company``.

        Prints "Company not found" and returns None for an unknown key.
        """
        try:
            return self.tweets[company].copy()
        except KeyError:
            print("Company not found")
            return None

    def get_HIV(self, company):
        """Return a copy of the HIV frame stored for ``company``.

        Prints "Company not found" and returns None for an unknown key.
        """
        try:
            return self.HIV[company].copy()
        except KeyError:
            print("Company not found")
            return None

    def get_companies(self):
        """Return the keys of whichever dictionary holds fewer companies.

        When both dictionaries have the same size the HIV keys are returned.
        """
        smaller = self.tweets if len(self.tweets) < len(self.HIV) else self.HIV
        return list(smaller)

    # ========================= Getting processed df ========================= #
    # ======================================================================== #
    def get_tweets_per_day(self, company):
        """Group the tweets of ``company`` by "post_date".

        Returns:
            DataFrame indexed by post date with a single "body" column whose
            cells are the sets of distinct tweet bodies posted on that date.

        Raises:
            KeyError: if ``company`` has no tweet frame.
        """
        frame = self.tweets[company]
        bodies_by_day = frame.groupby("post_date")["body"].apply(set)
        return pd.DataFrame(bodies_by_day)

    def get_rolling_tweets(self, company, n_days, n_weeks, start_day = dt.date(
            year = 2015, month = 1, day = 9)):
        """Collect the distinct tweets of ``company`` in consecutive windows.

        Args:
            company: key into the tweets dictionary.
            n_days: length of every window, in days.
            n_weeks: number of windows to produce.
            start_day: first day of the first window.

        Returns:
            DataFrame with one row per window: "Date" is the window start
            and "Tweets" the set of tweet bodies with
            start <= post_date < next window start.

        Raises:
            KeyError: if ``company`` has no tweet frame.
        """
        df = self.tweets[company].copy()
        df["post_date"] = pd.to_datetime(df["post_date"])

        # n_weeks windows need n_weeks + 1 boundaries.
        first = pd.Timestamp(start_day)
        step = pd.Timedelta(days = n_days)
        edges = [first + i * step for i in range(n_weeks + 1)]

        rows = []
        for start, end in zip(edges[:-1], edges[1:]):
            in_window = (df["post_date"] >= start) & (df["post_date"] < end)
            # A set removes duplicated tweet bodies inside the window.
            rows.append([start, set(df.loc[in_window, "body"])])

        return pd.DataFrame(rows, columns = ["Date", "Tweets"])

    def get_HIV_change(self, company, n, cat = "Implied Vol"):
        """Add the change of ``cat`` across a rolling window of ``n`` rows.

        Returns:
            Copy of the HIV frame with an extra "<cat> change" column holding
            last-minus-first of every window of ``n`` rows. The first
            ``n - 1`` rows have no complete window and are NaN.

        Raises:
            KeyError: if ``company`` has no HIV frame.
        """
        df = self.HIV[company].copy()
        # raw = True hands each window over as a NumPy array, which avoids
        # building a Series per window.
        df[f"{cat} change"] = df[cat].rolling(n).apply(
            lambda window: window[-1] - window[0], raw = True)
        return df

    def get_HIV_change_threshold(self, company, n, percentile, cat = "Implied Vol"):
        """Return the ``percentile``-th percentile of the absolute changes.

        Every NaN row is ignored (not just the first one), so the result
        stays a number for any window length ``n``.
        """
        change = self.get_HIV_change(company, n, cat)[f"{cat} change"]
        return np.percentile(change.dropna().abs(), percentile)

    def apply_change_threshold(self, company, n, threshold, cat = "Implied Vol"):
        """Flag rows whose absolute change exceeds ``threshold``.

        Returns:
            The frame from ``get_HIV_change`` plus a boolean "Changed"
            column; rows with a NaN change are False.
        """
        df = self.get_HIV_change(company, n, cat)
        df["Changed"] = df[f"{cat} change"].abs() > threshold
        return df

    def get_supervised_data(self, comapny, HIV_threshold, tweets_n_days,
                            n_weeks, ):
        """Pair every tweet window with its "Changed" label.

        Args:
            comapny: company key. The misspelt parameter name is kept so
                existing callers keep working; the body now uses it rather
                than a global variable.
            HIV_threshold: threshold passed to ``apply_change_threshold``.
            tweets_n_days: length of every tweet window, in days.
            n_weeks: rolling window length (in rows) used for the HIV change.

        Returns:
            DataFrame with columns "Date", "Tweets" and "Changed", holding
            one row per row of the company's HIV frame.
        """
        company = comapny

        HIV = self.apply_change_threshold(company, n_weeks, HIV_threshold)
        # One tweet window per HIV row keeps the label column the same
        # length as the tweet windows.
        # NOTE(review): assumes the HIV rows line up with consecutive windows
        # starting at the default start_day - confirm against the data.
        rolling_tweets = self.get_rolling_tweets(
            company, n_days = tweets_n_days, n_weeks = len(HIV))

        rolling_tweets["Changed"] = HIV["Changed"].to_list()
        return rolling_tweets

In [None]:
# ============================ Defining Functions ============================ #
# ============================================================================ #
def ngrams(string, n):
  """Return every run of ``n`` consecutive whitespace-separated tokens.

  Each n-gram is a list of tokens in order of appearance. The result is
  empty when the text has fewer than ``n`` tokens.
  """
  tokens = string.split()

  grams = []
  for start in range(len(tokens) - n + 1):
    grams.append(tokens[start:start + n])

  return grams

def vecotriser(lizt):
  """Fit a TF-IDF vectoriser on each entry of ``lizt`` separately.

  Args:
    lizt: iterable whose entries are each a collection of raw text documents,
      as accepted by ``TfidfVectorizer.fit_transform``.

  Returns:
    List with the TF-IDF matrix of every entry that could be vectorised.
    Entries for which scikit-learn raises ValueError (e.g. an empty
    vocabulary) are skipped.
  """
  vectoriser = TfidfVectorizer()
  res = []
  for ngram in lizt:
    try:
      res.append(vectoriser.fit_transform(ngram))
    except ValueError:
      # Best effort: an entry with nothing to vectorise is left out.
      continue
  return res

# transforming 2d list into 1d
def all_in_one(lizt):
  """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c].

  Always returns a new list and leaves ``lizt`` and its sub-lists untouched.
  Built in a single pass instead of repeated list concatenation, which
  copied the growing result on every step.
  """
  return [item for sub in lizt for item in sub]



In [None]:
# ============================= SCRAP CODE CHUNK ============================= #
# ============================================================================ #


In [None]:
# ============================ Setting parameters ============================ #
# ============================================================================ #

company              = "MSFT"
threshold_percentile = 50 # percentile of the absolute IV changes that is used
                          # as the "changed" threshold
delta_weeks          = 1
window_rows          = delta_weeks + 1 # rows spanned by one IV change
tweets_window_days   = 7               # length of one tweet window

# ============================= Creating Object ============================== #
# ============================================================================ #

# Create object
main = DATA(tweets_dict, HIV_dict)

# Calculating threshold (for considering there is a change)
threshold = main.get_HIV_change_threshold(company, window_rows,
                                          threshold_percentile)

# Receiving df for boolean change in IV for given threshold.
# Uses the same window as the threshold above, so changing delta_weeks can
# no longer make the two calls disagree.
HIV = main.apply_change_threshold(company, window_rows, threshold)

# Receiving df for rolling tweets: one window per HIV row, so the label
# column assigned below always has the matching length.
rolling_tweets = main.get_rolling_tweets(company, n_days = tweets_window_days,
                                         n_weeks = len(HIV))
assert len(rolling_tweets) == len(HIV), "tweet windows and HIV rows differ"

# Creating a supervised dataset
ML_df = rolling_tweets.copy()
ML_df["Changed"] = list(HIV["Changed"])


In [None]:
# ============================== Data Cleansing ============================== #
# ============================================================================ #

# Drop every tweet that contains a link ("http" anywhere in its text).
# sorted() fixes the order of the resulting list; taking it straight from set
# iteration would not be reproducible from one run to the next.
ML_df["Tweets no link"] = ML_df["Tweets"].apply(
    lambda x: sorted(i for i in x if "http" not in i))

# The file name follows the parameters cell, so exporting another company or
# delta cannot overwrite the MSFT / 1-week file.
ML_df.to_csv(f"{COMPLETED_DATA}{company}_delta{delta_weeks}week_v1.csv")

In [None]:
# ============================ Data Restructuring ============================ #
# ============================================================================ #

# One output row per link-free tweet, carrying the start date of its window
# and a 1/0 label taken from "Changed". Built in a single pass; the columns
# keep their own dtypes instead of the all-object frame that transposing a
# list of lists produces.
records = [
    (window_date, tweet, 1 if changed else 0)
    for window_date, tweets, changed in zip(
        ML_df["Date"], ML_df["Tweets no link"], ML_df["Changed"])
    for tweet in tweets
]

res = pd.DataFrame(records, columns = ["Date", "Tweets", "Changed"])

# The file name follows the parameters cell (company, delta_weeks).
res.to_csv(f"{COMPLETED_DATA}{company}_delta{delta_weeks}_formated.csv")

In [None]:
# Display the full path of the reformatted MSFT file in COMPLETED_DATA.
f"{COMPLETED_DATA}MSFT_delta1_formated.csv"

# res
# ML_df["Changed"].iloc[1]

'/content/drive/MyDrive/Text Mining Project IV Prediction/Data (Raw and processed data from python scripts)/Completed Data/MSFT_delta1_formated.csv'

**IMPORTANT**
- In the current data processing, numbers are excluded and any tweet that contains a link is dropped entirely.

In [None]:
# ============================== Data Cleansing ============================== #
# ============================================================================ #

# Removing tweets with links:
# sorted() fixes the order of every week's tweets. Without it the lists
# inherit set iteration order, so the x[:min_n] truncation below could keep
# different tweets from one run to the next.
ML_df["Tweets no link"] = ML_df["Tweets"].apply(
    lambda x: sorted(i for i in x if "http" not in i))

# Applying Ngrams:
n = 3

# min number of tweets in any window
# NOTE(review): measured on "Tweets" (links included) but used to truncate
# "Tweets no link", so windows can still end up with fewer than min_n
# tweets - confirm this is intended.
min_n = min(ML_df["Tweets"].apply(len))

# n-grams of the first min_n tweets of each window, flattened to one 1d list
# of n-grams per window.
ML_df[f"Tweets {n}grams"] = ML_df["Tweets no link"].apply(
    lambda x: all_in_one([ngrams(i, n) for i in x[:min_n]]))

# ============================ Vectorising Tweets ============================ #
# ============================================================================ #

def vec(lizt):
  """Fit a CountVectorizer on each entry of ``lizt`` separately.

  Args:
    lizt: iterable whose entries are each a collection of strings, as
      accepted by ``CountVectorizer.fit_transform``.

  Returns:
    List with the count matrix of every entry that could be vectorised.
    Entries for which scikit-learn raises ValueError (e.g. an empty
    vocabulary) are skipped.
  """
  # Named differently from the function so the function is not shadowed.
  vectoriser = CountVectorizer()
  res = []
  for i in lizt:
    try:
      res.append(vectoriser.fit_transform(i))
    except ValueError:
      # Best effort: an entry with nothing to vectorise is left out.
      continue
  return res


# Replace each window's n-gram list by the vectorised form of its first
# min_n n-grams.
ML_df[f"Tweets {n}grams"] = ML_df[f"Tweets {n}grams"].map(
    lambda grams: vec(grams[:min_n]))


In [None]:
# ===================== Splitting Test and Training Data ===================== #
# ============================================================================ #
# Setting parameters
train_test_ratio    = 0.75
train_quantity      = int(train_test_ratio*len(ML_df))
random_seed         = 42  # fixes the shuffle so the split is reproducible
max_ngrams_per_week = 200 # number of vectorised n-grams kept per window


# Randomise order (Randomising order will ensure that iterative ML algos won't 
# pick up wrong trends)
randomised_df = ML_df.sample(frac = 1, random_state = random_seed)

# Splitting train test data.
train = randomised_df.iloc[:train_quantity]
test  = randomised_df.iloc[train_quantity:]
assert len(train) + len(test) == len(ML_df)

# ============================ Creating ML Model ============================= #
# ============================================================================ #

# setting parameters
alpha = 0.1 # regularisation strength handed to Lasso
            # Can experiment with different alpha values

# creating lasso regression
lasso = Lasso(alpha = alpha)

# creating elastic net regression
# elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)

# Training inputs come from the training split only; building them from the
# whole ML_df would let the held-out rows leak into training.
train_x = list(train[f"Tweets {n}grams"].apply(
    lambda x: x[:max_ngrams_per_week]))
train_y = list(train["Changed"])



In [None]:
# np.array(train_y)
# Encode the labels as 1 (truthy) / 0 (falsy).
res = [1 if flag else 0 for flag in train_y]

# Turn every feature entry into an array, then wrap them in one outer array.
res_x = np.array([np.array(entry) for entry in train_x])

# lasso.fit(res_x, np.array(res))

# Fit on a tiny hand-written example and query one point.
lasso.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])

lasso.predict([[1,0]])
# res
# res

array([1.])

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:

def correct_HIV_date_format(string):
  """Swap the last two components of a date string and return the result.

  Keeps the first five characters, then appends the final two characters
  followed by the three characters that precede the last separator, e.g.
  "2015-01-09" -> "2015-09-01".
  """
  date = string[-2:]
  month = string[-6:-3]
  return string[:5] + date + month

  
# Same rearrangement done by hand on the "Date" of the second HIV_AAPL row.
raw_date = HIV_AAPL["Date"].iloc[1]
date = raw_date[-2:]
month = raw_date[-6:-3]

new_date = raw_date[:5] + date + month
new_date

In [None]:
# Read MSFT.csv from the COMPLETED_DATA directory into a DataFrame.
MSFT = pd.read_csv(f"{COMPLETED_DATA}MSFT.csv")

In [None]:
# Display the assembled ML_df frame.
ML_df

Unnamed: 0,Date,Tweets,Changed,Tweets no link,Tweets 3grams
0,2015-01-09,{How Sony's 'PlayStation Now' Can Shake Up The...,False,[Microsoft has just surpassed Exxon in market ...,"[ (0, 2)\t1\n (1, 0)\t1\n (2, 1)\t1, (0, ..."
1,2015-01-16,"{As an $MSFT Research alum, I can tell you thi...",False,"[$MSFT and $AAPL charts side by side, almost ...","[ (0, 2)\t1\n (1, 1)\t1\n (2, 0)\t1, (0, ..."
2,2015-01-23,{$MSFT - 88% Barchart technical sell signals -...,True,[$MSFT - 88% Barchart technical sell signals -...,"[ (0, 1)\t1\n (2, 0)\t1, (1, 0)\t1\n (2, ..."
3,2015-01-30,{$MSFT A new multi-billion dollar market for M...,True,"[$MSFT just bought Sunrise, a very well design...","[ (0, 2)\t1\n (1, 1)\t1\n (2, 0)\t1, (0, ..."
4,2015-02-06,{Reports say @BillGates gave away another bill...,True,[Reports say @BillGates gave away another bill...,"[ (0, 1)\t1\n (1, 2)\t1\n (2, 0)\t1, (0, ..."
...,...,...,...,...,...
255,2019-11-29,{$NTNX Green and higher than body of ydays c...,False,[$NTNX Green and higher than body of ydays c...,"[ (0, 2)\t1\n (1, 1)\t1\n (2, 0)\t1, (0, ..."
256,2019-12-06,"{$MSFT Thanks for the quick fast , $CLWD Cloud...",False,"[$MSFT Thanks for the quick fast , $MSFT wake ...","[ (0, 1)\t1\n (1, 2)\t1\n (2, 0)\t1, (0, ..."
257,2019-12-13,{$AXSM up 80% Hope U don't want to be CUTE ...,False,[$AXSM up 80% Hope U don't want to be CUTE ...,"[ (0, 1)\t1\n (1, 2)\t1\n (2, 0)\t1, (0, ..."
258,2019-12-20,{These last two up bars in $MSFT look promisin...,False,[These last two up bars in $MSFT look promisin...,"[ (0, 1)\t1\n (1, 0)\t1\n (2, 2)\t1, (0, ..."


In [None]:
# ML_df is a DataFrame and has no tweets() method; the tweet sets live in the
# "Tweets" column. Show only the first rows to keep the output short.
ML_df["Tweets"].head()