# Required Packages

In [None]:
#pip install getFamaFrenchFactors

In [None]:
#pip install yfinance

In [None]:
#pip install eventstudy

In [None]:
#pip install tensorflow

In [None]:
#pip install --force-reinstall transformers

In [None]:
#!pip install gdeltdoc

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pandas_datareader.data as pdr
import datetime as dt
import yfinance as yf
import statsmodels.api as sm
import getFamaFrenchFactors as gff
import seaborn as sns
from nltk.stem import *
from nltk.corpus import stopwords
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import re
from datetime import date, timedelta 
from gdeltdoc import GdeltDoc, Filters
import eventstudy as es
from eventstudy import excelExporter

# Collecting the news we are going to use for our Model Training

For training, we are going to use offline collected data, collected from several students and computers

In [2]:
news_df = pd.read_excel('news_short.xlsx')

In [3]:
news_df.head(5)

Unnamed: 0.1,Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,0,https://www.digitaljournal.com/pr/longhash-ven...,,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z,https://www.newsfilecorp.com/newsinfo/119481/356,digitaljournal.com,English,United States
1,1,https://www.prnewswire.com/news-releases/terra...,,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z,,prnewswire.com,English,United States
2,2,https://techcrunch.com/2022/04/06/terras-found...,,Terra founder plans to back its stablecoin wit...,20220406T213000Z,https://techcrunch.com/wp-content/uploads/2022...,techcrunch.com,English,United States
3,3,https://www.business-standard.com/article/comp...,https://wap.business-standard.com/article-amp/...,Crypto platform Leap raises $3 . 2 mn in fundi...,20220406T081500Z,https://bsmedia.business-standard.com/_media/b...,business-standard.com,English,India
4,4,https://www.fool.com/investing/2022/04/06/can-...,,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z,https://g.foolcdn.com/editorial/images/673167/...,fool.com,English,United States


In [4]:
input_df = news_df[['title','seendate']]
input_df.head(5)

Unnamed: 0,title,seendate
0,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z
1,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z
2,Terra founder plans to back its stablecoin wit...,20220406T213000Z
3,Crypto platform Leap raises $3 . 2 mn in fundi...,20220406T081500Z
4,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z


In [5]:
news_df2 = pd.DataFrame()
news_df2["date_parsed"]=pd.to_datetime(news_df["seendate"])
news_df2["date_format"] =news_df2["date_parsed"].dt.date
len(news_df2["date_format"].unique())

22

In [6]:
news_df2["date_format"].value_counts()

2022-03-22    1267
2022-03-30    1221
2022-03-29    1201
2022-03-23    1193
2022-03-15    1182
2022-03-24    1171
2022-03-21    1159
2022-03-17    1153
2022-03-16    1146
2022-03-25    1135
2022-03-28    1110
2022-03-18    1077
2022-03-14    1075
2022-03-26    1040
2022-03-27     971
2022-03-19     903
2022-03-20     857
2022-03-13     793
2022-03-12     486
2022-04-06     103
2022-03-31      19
2022-04-07       1
Name: date_format, dtype: int64

---
# Fama French Three Factor

The below function is a supporting function useful for calculating the daily returns of a specific ticker, based on a specific range of dates given by a news dataframe.

In [7]:
def calculated_returns(news_df,date_field,ticker):
    """
    news_df: it will include the news dataframe, with the dates of each piece of news.
    date_field: the field including the date within the dataframe
    ticker: the crypto ticket
    """
    #Copying the dataframe:
    df = news_df.copy()
    
    #Importing Daily Values of FF3 from CSV:
    ff3_daily=pd.read_csv("FF3_daily.csv",parse_dates=['date'])
    ff3_daily.rename(columns={"date": 'Date'}, inplace=True)
    ff3_daily.set_index('Date', inplace=True)

    #Converting the date for proper usage:
    df["date_parsed"]=pd.to_datetime(df[date_field])
    
    #Extracting the dates:
    min_date = df["date_parsed"].min()
    max_date = df["date_parsed"].max()
    start = dt.datetime.strftime(min_date, "%Y-%m-%d")
    end = dt.datetime.strftime(max_date, "%Y-%m-%d")
    
    #print("The earliest news is from "+start)
    #print("The latest news is from "+end)
    
    #Length of frame:
    days = dt.datetime.strptime(end, "%Y-%m-%d")-dt.datetime.strptime(start, "%Y-%m-%d")
    interval = days.days + 1
    
    #print("The number of days to analyze is {}.".format(interval))
   
    #The first run will be calculated using data from the day before of the desired calculation date:
    min_date = min_date - dt.timedelta(days=1)
    
    expected_return_array = []
    day_array = []
    real_return_array = []
    
    for i in range(0,interval):
        
        #Setting the time to get datapoints:
        real_start = dt.datetime.strftime(min_date-dt.timedelta(700), "%Y-%m-%d")
        start = dt.datetime.strftime(min_date, "%Y-%m-%d")
        
        #Parsing Ticket Data: 700 hunderd days before the min date:
        stock_data = yf.download(ticker, real_start, start, adjusted=True)
        #Getting the % ratio of the selected ticker
        stock_returns = stock_data['Adj Close'].resample('d').last().pct_change().dropna()
        stock_returns.name = "Day_Rtn"
        #Creating a single table: merging the 700 days window
        ff_data = ff3_daily.merge(stock_returns,on='Date')
        
        #Creating a linear Model to caclulate the Beta Coefficients:
        X = ff_data[['Mkt-RF', 'SMB', 'HML']]
        y = ff_data['Day_Rtn'] - ff_data['RF']
        X = sm.add_constant(X)
        ff_model = sm.OLS(y, X).fit()
        intercept, b1, b2, b3 = ff_model.params
        
        rf = ff_data['RF'].mean()
        market_premium = ff3_daily['Mkt-RF'].mean()
        size_premium = ff3_daily['SMB'].mean()
        value_premium = ff3_daily['HML'].mean()
        
        #Expected daily return:
        expected_daily_return = rf + b1 * market_premium + b2 * size_premium + b3 * value_premium 
        #Getting the real return of the last date:
        real_return = ff_data.iloc[ff_data.shape[0]-1]['Day_Rtn']
        #Updating the date:
        min_date = min_date + dt.timedelta(days=1)
    
        expected_return_array.append(expected_daily_return)
        day_array.append(min_date)
        real_return_array.append(real_return)
        
    return (expected_return_array, real_return_array, day_array)

Function to Model the Target Variable of a dataframe news using Famma French.

In [8]:
def abnoraml_return_calculation(news_df,date_field,ticker):
    """
    news_df: it will include the news dataframe, with the dates of each piece of news.
    date_field: the field including the date within the dataframe
    ticker: the crypto ticket
    """
    #Copying the dataframe:
    df = news_df.copy()
    
    #Converting the date for proper usage:
    df["date_parsed"]=pd.to_datetime(df[date_field])
    df["date_format"] = df["date_parsed"].dt.date
    
    #Extracting the dates:
    min_date = df["date_parsed"].min()
    max_date = df["date_parsed"].max()
    start = dt.datetime.strftime(min_date, "%Y-%m-%d")
    end = dt.datetime.strftime(max_date, "%Y-%m-%d")
    
    print("The earliest news is from "+start)
    print("The latest news is from "+end)
    
    #Length of frame:
    days = dt.datetime.strptime(end, "%Y-%m-%d")-dt.datetime.strptime(start, "%Y-%m-%d")
    interval = days.days + 1
    
    print("The number of days to analyze is {}.".format(interval))


    #######################################
    #Generating the files for event study:
    #######################################
    
    #Reading Famma French Data:
    ff3_daily=pd.read_csv("FF3_daily.csv")
    #Getting Stock data to append calculate the real return:
    real_start = dt.datetime.strftime(min_date-dt.timedelta(100), "%Y-%m-%d")
    real_end = dt.datetime.strftime(min_date+dt.timedelta(100), "%Y-%m-%d")
    stock_data = yf.download(ticker, real_start, real_end, adjusted=True)

    #Storing the data for later use in the event:
    ff3_daily.to_csv("{}_famafrench.csv".format(ticker), index=False, date_format='%Y%m%d')
    
    stock_data[ticker] = stock_data["Adj Close"].pct_change()
    stock_data = stock_data.dropna()
    stock_data.reset_index(level=0, inplace=True)
    stock_data = stock_data[['Date',ticker]].copy()
    stock_data.columns = ['date',ticker]
    stock_data.to_csv("{}_returns.csv".format(ticker), index=False, date_format='%Y-%m-%d')
    
    ##############################################################################################
    #Now we are going to calculate the whole process with the daily and expected returns window:
    ##############################################################################################

    #Event Study files definition:
    es.Single.import_FamaFrench("{}_famafrench.csv".format(ticker))
    es.Single.import_returns("{}_returns.csv".format(ticker))
    
    #Loop definition of dates
    dates_for_eventstudy = df["date_format"].drop_duplicates().sort_values()

    i = 1
    listAR=[]
    dates=[]
    for date in dates_for_eventstudy:
        date = dt.datetime.strftime(date, "%Y-%m-%d")
        #print(date)
        try:
            event = es.Single.FamaFrench_3factor(
                security_ticker = ticker,
                event_date = np.datetime64(date),
                event_window = (-2,+4), 
                estimation_size = 50,
                buffer_size = 30)

            listAR.append(event.AR)
            dates.append(date)
        
        except:
            list_nans = []
            for nans in range(event.event_window_size):
                list_nans.append(np.nan)
            listAR.append(list_nans)
            dates.append(date)
    
    columns_ar=[]
    
    for i in range(event.event_window[0],event.event_window[1]+1):
        columns_ar.append("AR"+str(i))

    df_AR = pd.DataFrame(listAR, columns=columns_ar)
    df_test2 = pd.DataFrame({'date':dates}).join(df_AR)       
    df_test3 = df_test2.dropna()
    df_test3["date"] = pd.to_datetime(df_test3["date"])
    
    #Calculating the expected return for each day of the dates in the training news as well as the real one:
    expected_day, real_day, day_array = calculated_returns(df,date_field,ticker)
    returns_df = pd.DataFrame()
    returns_df['date']=day_array
    returns_df['format_date']=returns_df['date'].dt.date
    returns_df['expected_return']=expected_day
    returns_df['real_return']=real_day
    returns_df['format_date'] = pd.to_datetime(returns_df['format_date'])
    
    df_test4 = pd.merge(df_test3,returns_df, left_on ="date", right_on="format_date").drop(columns=["date_y","format_date"],axis=1)
    
    
    #We are changing this part, as we are using the expcted daily return for each specific date!
    #Instead of using the expected_daily_return as a constant value, we will use the expected return date we have
    #estimated for each specific day.
    #The threshold as agreed with Prof. Manoel G. is @ 6 days, but the ratio of abnormal returns is very high in
    #relation to the 180 days of 6 months. Adjusted value to 30 days.
    
    relevant_matrix_pos = pd.DataFrame()
    relevant_matrix_neg = pd.DataFrame()
    i = 0
    for column_ in df_test4.columns[1:len(df_test4.columns)-2]:
        relevant_matrix_pos[str(i)] = df_test4[column_]>df_test4["expected_return"]*180
        relevant_matrix_neg[str(i)] = df_test4[column_]<-df_test4["expected_return"]*180
        i += 1
        
   
    df_test4["Relevant_pos"] = relevant_matrix_pos[list(relevant_matrix_pos.columns)].any(axis=1).astype(int)
    df_test4["Relevant_neg"] = 1*(relevant_matrix_neg[list(relevant_matrix_neg.columns)].any(axis=1).astype(int))
    
    df_test4['target'] = df_test4["Relevant_pos"] | df_test4["Relevant_neg"]
    
    df_test4['date'] = df_test4['date_x'].dt.date
    df['date'] = df['date_parsed'].dt.date
    
    df_result = pd.merge(df,df_test4, left_on ="date", right_on="date").drop(columns=["date_x"],axis=1)
    
    return df_result
    
      
    

Creating the Bitcoin DataFrame

In [9]:
result_btc = abnoraml_return_calculation(input_df,'seendate','BTC-USD')

The earliest news is from 2022-03-12
The latest news is from 2022-04-07
The number of days to analyze is 27.
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 comp

In [10]:
result_btc.head(5)

Unnamed: 0,title,seendate,date_parsed,date_format,date,AR-2,AR-1,AR0,AR1,AR2,AR3,AR4,expected_return,real_return,Relevant_pos,Relevant_neg,target
0,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z,2022-04-06 16:30:00+00:00,2022-04-06,2022-04-06,0.032101,-0.003755,-0.04461,0.018444,-0.045935,0.008446,-0.022411,0.000304,-0.022879,0,0,0
1,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z,2022-04-06 00:15:00+00:00,2022-04-06,2022-04-06,0.032101,-0.003755,-0.04461,0.018444,-0.045935,0.008446,-0.022411,0.000304,-0.022879,0,0,0
2,Terra founder plans to back its stablecoin wit...,20220406T213000Z,2022-04-06 21:30:00+00:00,2022-04-06,2022-04-06,0.032101,-0.003755,-0.04461,0.018444,-0.045935,0.008446,-0.022411,0.000304,-0.022879,0,0,0
3,Crypto platform Leap raises $3 . 2 mn in fundi...,20220406T081500Z,2022-04-06 08:15:00+00:00,2022-04-06,2022-04-06,0.032101,-0.003755,-0.04461,0.018444,-0.045935,0.008446,-0.022411,0.000304,-0.022879,0,0,0
4,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z,2022-04-06 12:00:00+00:00,2022-04-06,2022-04-06,0.032101,-0.003755,-0.04461,0.018444,-0.045935,0.008446,-0.022411,0.000304,-0.022879,0,0,0


In [11]:
result_btc['target'].value_counts()

1    10659
0     9604
Name: target, dtype: int64

# Creating the Model - NLP Basic Analysis

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re

### Starting with Bitcoin Model 

In [13]:
training_df = result_btc[['title','target']]

In [14]:
training_df['target'].value_counts()

1    10659
0     9604
Name: target, dtype: int64

In [15]:
training_df['title'] = training_df['title'].map(lambda x: x.lower() if isinstance(x,str) else '')
training_df['title'] = training_df['title'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['title'] = training_df['title'].map(lambda x: x.lower() if isinstance(x,str) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['title'] = training_df['title'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))


In [16]:
training_df.shape

(20263, 2)

In [17]:
training_df.dropna(subset = ["title"], inplace=True) 
training_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


(20263, 2)

In [18]:
nltk.download('stopwords')
english_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

In [20]:
training_df['title'] = remove_stop_words(training_df['title'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['title'] = remove_stop_words(training_df['title'])


In [21]:
#Applying Lemmatization:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
#We need to use the final_df['titles'] to  lemmatize each word:
lemmatizer = WordNetLemmatizer()

In [23]:
#Function to apply for each word the proper lemmatization.
def lemmetize_titles(words):
    a = []
    tokens = word_tokenize(words)
    for token in tokens:
        lemmetized_word = lemmatizer.lemmatize(token)
        a.append(lemmetized_word)
    lemmatized_title = ' '.join(a)
    return lemmatized_title 

In [24]:
training_df['lemmetized_titles'] = training_df['title'].apply(lemmetize_titles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['lemmetized_titles'] = training_df['title'].apply(lemmetize_titles)


In [25]:
#Chceking whether lemmatization has been applied:
training_df.head(5)

Unnamed: 0,title,target,lemmetized_titles
0,longhash ventures terraform labs join forces a...,0,longhash venture terraform lab join force adva...
1,terra compete final 20 group edtech competitio...,0,terra compete final 20 group edtech competitio...
2,terra founder plans back stablecoin basket cry...,0,terra founder plan back stablecoin basket cryp...
3,crypto platform leap raises 3 2 mn funding coi...,0,crypto platform leap raise 3 2 mn funding coin...
4,thorchain keep surging motley fool,0,thorchain keep surging motley fool


In [26]:
X_train, X_test, y_train, y_test = train_test_split(training_df['lemmetized_titles'],training_df['target'], stratify=training_df['target'],test_size=0.3)
X_train.head(4)

17928                              next biden crypto order
15767    rajya sabha live update member rise matter pub...
12870    fintech firm ripple commits 1b xrp grant progr...
2254                        nielsen deal back buyout basic
Name: lemmetized_titles, dtype: object

In [27]:
# Building a Naive Bayes Classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [28]:
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
score_pred = model.predict_proba(X_test)[:,1]

In [29]:
accuracy_score(y_test, y_pred)

0.6369468662608981

In [30]:
confusion_matrix(y_test, y_pred)

array([[1522, 1359],
       [ 848, 2350]], dtype=int64)

In [None]:
#Let's store the model for later use:
import pickle
pickle.dump(model, open('BTCStrengthScoreModel.sav', 'wb'))

## Applying the Model to the Long Dataset

In [31]:
manoel_df = pd.read_csv('Final_Score.csv')

In [None]:
manoel_df.drop(['Unnamed: 0'],inplace=True,axis=1)

In [32]:
manoel_df.head(5)

Unnamed: 0.1,Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,lemmetized_titles,relevance_probability,relevance_class,sentiment_negative_probability,sentiment_neutral_probability,sentiment_positive_probability,sentiment_class
0,0,https://www.digitaljournal.com/pr/longhash-ven...,,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z,,digitaljournal.com,English,United States,longhash venture terraform lab join force adva...,0.608537,1.0,0.002786,0.154825,0.842389,1
1,1,https://www.prnewswire.com/news-releases/terra...,,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z,,prnewswire.com,English,United States,terra compete final 20 group edtech competitio...,0.289958,0.0,0.013127,0.830867,0.156006,0
2,2,https://www.fool.com/investing/2022/04/06/can-...,,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z,,fool.com,English,United States,thorchain keep surging motley fool,0.288098,0.0,0.083463,0.725645,0.190892,0
3,3,https://www.finanznachrichten.de/nachrichten-2...,,Gold Terra Resource Corp : Gold Terra Intersec...,20220406T123000Z,,finanznachrichten.de,English,Germany,gold terra resource corp gold terra intersects...,0.201722,0.0,0.011705,0.641778,0.346517,0
4,4,https://economictimes.indiatimes.com/tech/tech...,,Crypto wallet Leap raises $3 . 2 million throu...,20220406T114500Z,,economictimes.indiatimes.com,English,India,crypto wallet leap raise 3 2 million token sale,0.719981,1.0,0.003629,0.023551,0.97282,1


In [33]:
training_df = manoel_df
training_df['title'] = training_df['title'].map(lambda x: x.lower() if isinstance(x,str) else '')
training_df['title'] = training_df['title'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
training_df['title'] = remove_stop_words(training_df['title'])
training_df['lemmetized_titles'] = training_df['title'].apply(lemmetize_titles)

In [35]:
y_pred = model.predict(training_df['lemmetized_titles'])
score_pred = model.predict_proba(training_df['lemmetized_titles'])[:,1]

In [40]:
manoel_df['strength_score']=y_pred

In [46]:
manoel_df.shape

(243422, 17)

In [60]:
#Getting the Real Percent per Day
#Parsing Ticket Data: 700 hunderd days before the min date:
stock_data = yf.download('BTC-USD', '2021-03-30', '2022-04-30', adjusted=True)
#Getting the % ratio of the selected ticker
stock_returns = stock_data['Adj Close'].resample('d').last().pct_change().dropna()
stock_returns.name = "Day_Rtn"

[*********************100%***********************]  1 of 1 completed


In [72]:
len(stock_returns.to_list())

396

In [86]:
returns_df = pd.DataFrame()

In [87]:
returns_df['date']=stock_returns.index
returns_df['real_percent_change']=stock_returns.to_list()

In [92]:
returns_df['date'] = pd.to_datetime(returns_df['date'])

In [93]:
returns_df.head(5)

Unnamed: 0,date,real_percent_change
0,2021-03-31,1.9e-05
1,2021-04-01,0.003004
2,2021-04-02,0.004882
3,2021-04-03,-0.029981
4,2021-04-04,0.020045


In [94]:
returns_df.dtypes

date                   datetime64[ns]
real_percent_change           float64
dtype: object

In [77]:
manoel_df['date_format'] = pd.to_datetime(manoel_df['seendate']).dt.date

In [95]:
manoel_df['date_format'] = pd.to_datetime(manoel_df['date_format'])

In [96]:
manoel_df.dtypes

url                                       object
url_mobile                                object
title                                     object
seendate                                  object
socialimage                               object
domain                                    object
language                                  object
sourcecountry                             object
lemmetized_titles                         object
relevance_probability                    float64
relevance_class                          float64
sentiment_negative_probability           float64
sentiment_neutral_probability            float64
sentiment_positive_probability           float64
sentiment_class                            int64
strength_score                             int32
date_format                       datetime64[ns]
dtype: object

In [97]:
manoel_df.head(5)

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,lemmetized_titles,relevance_probability,relevance_class,sentiment_negative_probability,sentiment_neutral_probability,sentiment_positive_probability,sentiment_class,strength_score,date_format
0,https://www.digitaljournal.com/pr/longhash-ven...,,longhash ventures terraform labs join forces a...,20220406T163000Z,,digitaljournal.com,English,United States,longhash venture terraform lab join force adva...,0.608537,1.0,0.002786,0.154825,0.842389,1,0,2022-04-06
1,https://www.prnewswire.com/news-releases/terra...,,terra compete final 20 group edtech competitio...,20220406T001500Z,,prnewswire.com,English,United States,terra compete final 20 group edtech competitio...,0.289958,0.0,0.013127,0.830867,0.156006,0,0,2022-04-06
2,https://www.fool.com/investing/2022/04/06/can-...,,thorchain keep surging motley fool,20220406T120000Z,,fool.com,English,United States,thorchain keep surging motley fool,0.288098,0.0,0.083463,0.725645,0.190892,0,1,2022-04-06
3,https://www.finanznachrichten.de/nachrichten-2...,,gold terra resource corp gold terra intersects...,20220406T123000Z,,finanznachrichten.de,English,Germany,gold terra resource corp gold terra intersects...,0.201722,0.0,0.011705,0.641778,0.346517,0,0,2022-04-06
4,https://economictimes.indiatimes.com/tech/tech...,,crypto wallet leap raises 3 2 million token sale,20220406T114500Z,,economictimes.indiatimes.com,English,India,crypto wallet leap raise 3 2 million token sale,0.719981,1.0,0.003629,0.023551,0.97282,1,1,2022-04-06


In [99]:
df_result = pd.merge(manoel_df,returns_df, left_on ="date_format", right_on="date")

In [100]:
df_result

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,lemmetized_titles,relevance_probability,relevance_class,sentiment_negative_probability,sentiment_neutral_probability,sentiment_positive_probability,sentiment_class,strength_score,date_format,date,real_percent_change
0,https://www.digitaljournal.com/pr/longhash-ven...,,longhash ventures terraform labs join forces a...,20220406T163000Z,,digitaljournal.com,English,United States,longhash venture terraform lab join force adva...,0.608537,1.0,0.002786,0.154825,0.842389,1,0,2022-04-06,2022-04-06,-0.051568
1,https://www.prnewswire.com/news-releases/terra...,,terra compete final 20 group edtech competitio...,20220406T001500Z,,prnewswire.com,English,United States,terra compete final 20 group edtech competitio...,0.289958,0.0,0.013127,0.830867,0.156006,0,0,2022-04-06,2022-04-06,-0.051568
2,https://www.fool.com/investing/2022/04/06/can-...,,thorchain keep surging motley fool,20220406T120000Z,,fool.com,English,United States,thorchain keep surging motley fool,0.288098,0.0,0.083463,0.725645,0.190892,0,1,2022-04-06,2022-04-06,-0.051568
3,https://www.finanznachrichten.de/nachrichten-2...,,gold terra resource corp gold terra intersects...,20220406T123000Z,,finanznachrichten.de,English,Germany,gold terra resource corp gold terra intersects...,0.201722,0.0,0.011705,0.641778,0.346517,0,0,2022-04-06,2022-04-06,-0.051568
4,https://economictimes.indiatimes.com/tech/tech...,,crypto wallet leap raises 3 2 million token sale,20220406T114500Z,,economictimes.indiatimes.com,English,India,crypto wallet leap raise 3 2 million token sale,0.719981,1.0,0.003629,0.023551,0.972820,1,1,2022-04-06,2022-04-06,-0.051568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243417,https://www.netsdaily.com/2021/4/30/22412950/k...,https://www.netsdaily.com/platform/amp/2021/4/...,kyrie irving 28 points enough nets fall blazer...,20210501T051500Z,https://cdn.vox-cdn.com/thumbor/541ATciGnOVAL8...,netsdaily.com,English,United States,kyrie irving 28 point enough net fall blazer 1...,0.394137,0.0,0.062607,0.777211,0.160183,0,0,2021-05-01,2021-05-01,0.001348
243418,https://www.bloodhorse.com/horse-racing/articl...,,malathaat stays perfect determined ky oaks vic...,20210501T041500Z,https://cdn-images.bloodhorse.com/i/bloodhorse...,bloodhorse.com,English,United States,malathaat stay perfect determined ky oak victory,0.343789,0.0,0.031633,0.546889,0.421478,0,1,2021-05-01,2021-05-01,0.001348
243419,https://www.cbs19.tv/article/news/nation-world...,https://www.cbs19.tv/amp/article/news/nation-w...,olympia dukakis dies oscar winning moonstruck ...,20210501T233000Z,https://media.cbs19.tv/assets/CCT/images/89d23...,cbs19.tv,English,United States,olympia dukakis dy oscar winning moonstruck ac...,0.186455,0.0,0.016121,0.898917,0.084961,0,1,2021-05-01,2021-05-01,0.001348
243420,https://www.testudotimes.com/maryland-terps/20...,https://www.testudotimes.com/platform/amp/mary...,behind strong defense maryland women lacrosse ...,20210501T043000Z,https://cdn.vox-cdn.com/thumbor/IHJXkC2qizVUrJ...,testudotimes.com,English,United States,behind strong defense maryland woman lacrosse ...,0.156146,0.0,0.027981,0.696103,0.275916,0,0,2021-05-01,2021-05-01,0.001348


In [101]:
df_result.drop(['date'],inplace=True,axis=1)

In [102]:
df_result.head(5)

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,lemmetized_titles,relevance_probability,relevance_class,sentiment_negative_probability,sentiment_neutral_probability,sentiment_positive_probability,sentiment_class,strength_score,date_format,real_percent_change
0,https://www.digitaljournal.com/pr/longhash-ven...,,longhash ventures terraform labs join forces a...,20220406T163000Z,,digitaljournal.com,English,United States,longhash venture terraform lab join force adva...,0.608537,1.0,0.002786,0.154825,0.842389,1,0,2022-04-06,-0.051568
1,https://www.prnewswire.com/news-releases/terra...,,terra compete final 20 group edtech competitio...,20220406T001500Z,,prnewswire.com,English,United States,terra compete final 20 group edtech competitio...,0.289958,0.0,0.013127,0.830867,0.156006,0,0,2022-04-06,-0.051568
2,https://www.fool.com/investing/2022/04/06/can-...,,thorchain keep surging motley fool,20220406T120000Z,,fool.com,English,United States,thorchain keep surging motley fool,0.288098,0.0,0.083463,0.725645,0.190892,0,1,2022-04-06,-0.051568
3,https://www.finanznachrichten.de/nachrichten-2...,,gold terra resource corp gold terra intersects...,20220406T123000Z,,finanznachrichten.de,English,Germany,gold terra resource corp gold terra intersects...,0.201722,0.0,0.011705,0.641778,0.346517,0,0,2022-04-06,-0.051568
4,https://economictimes.indiatimes.com/tech/tech...,,crypto wallet leap raises 3 2 million token sale,20220406T114500Z,,economictimes.indiatimes.com,English,India,crypto wallet leap raise 3 2 million token sale,0.719981,1.0,0.003629,0.023551,0.97282,1,1,2022-04-06,-0.051568


In [103]:
df_result.to_csv('FinalLongDataset.csv')