In [1]:
# Sentiment tooling: NLTK plus its VADER analyzer.
import nltk
import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Make sure the VADER lexicon is available locally (see the download log below).
nltk.download('vader_lexicon')

# Shared analyzer instance; its `lexicon` attribute is replaced in the next cell.
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
import csv
import pandas as pd

# --- Stock-market lexicon ----------------------------------------------------
# A term's sentiment is the mean of its Aff_Score and Neg_Score columns.
stock_lex = pd.read_csv('stock_lex.csv')
stock_lex['sentiment'] = (stock_lex['Aff_Score'] + stock_lex['Neg_Score'])/2
stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
# Keep only entries whose key contains no space (single tokens).
stock_lex = {k:v for k,v in stock_lex.items() if len(k.split(' '))==1}

# Rescale so the largest score maps to +4 and the smallest to -4.
# The extremes are computed once up front: evaluating max()/min() inside the
# loop made the rescale quadratic in the lexicon size. `default=0` keeps an
# empty lexicon from raising (the loop body then never runs).
stock_lex_max = max(stock_lex.values(), default=0)
stock_lex_min = min(stock_lex.values(), default=0)
stock_lex_scaled = {}
for k, v in stock_lex.items():
    if v > 0:
        stock_lex_scaled[k] = v / stock_lex_max * 4
    elif v == 0:
        # A neutral term stays neutral. Handled explicitly so a lexicon with
        # no negative scores (minimum == 0) cannot trigger a 0/0 division.
        stock_lex_scaled[k] = 0.0
    else:
        # v and the minimum are both negative, so the ratio is positive;
        # multiplying by -4 restores the negative sign.
        stock_lex_scaled[k] = v / stock_lex_min * -4

# --- Loughran and McDonald word lists ------------------------------------------
# Only the first column of each row is used. Files are opened with newline=''
# as the csv module documentation recommends; blank rows are skipped instead
# of raising IndexError on row[0].
positive = []
with open('lm_positive.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        if not row:
            continue
        positive.append(row[0].strip())

negative = []
with open('lm_negative.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        if not row:
            continue
        # A space-separated entry contributes each of its words; extend()
        # handles the one-word case too, so no length check is needed.
        negative.extend(row[0].strip().split(" "))

# --- Combined lexicon --------------------------------------------------------
# Later update() calls overwrite earlier ones on key collisions, so precedence
# (lowest to highest) is: L&M positive, L&M negative, scaled stock lexicon,
# VADER's own lexicon.
final_lex = {}
final_lex.update({word:2.0 for word in positive})
final_lex.update({word:-2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex

In [3]:
# Load 'Netflix-DL.csv' into df1 (aliased to `year` in a later cell).
df1=pd.read_csv('Netflix-DL.csv')

In [4]:
# Remove the 'Unnamed: 0' column in place (presumably a row index saved by an
# earlier to_csv call — confirm against the file).
del df1['Unnamed: 0']

In [7]:
# Preview df1.
# NOTE(review): the saved output shows only date/news columns, but this frame
# (via the `year` alias) later supplies Open/High/Low/Close/Adj Close/Volume to
# the merge. The output may be stale from an earlier run — confirm the file.
df1

Unnamed: 0,date,news
0,2019-10-01,
1,2019-10-02,
2,2019-10-03,"Netflix Has 12 Original Podcasts, in Case You ..."
3,2019-10-04,New on Netflix this week: Peaky Blinders: Seas...
4,2019-10-05,"Disney Blocks Netflix Ads on ABC, Freeform, FX..."
...,...,...
208,2020-04-26,12 Great Netflix Shows Based On True Stories‘G...
209,2020-04-27,Watch After Life Season 2: Now streaming on Ne...
210,2020-04-28,Extraction: is a sequel possible on Netflix?Ne...
211,2020-04-29,


In [7]:
# Load 'Amazon_v2.csv' into df2 (date + news columns per the preview below;
# aliased to `company` in a later cell).
df2=pd.read_csv('Amazon_v2.csv')

In [8]:
# Remove the 'Unnamed: 0' column in place.
del df2['Unnamed: 0']

In [9]:
# Preview df2; the first rows shown have an empty `news` cell.
df2

Unnamed: 0,date,news
0,2011-01-01,
1,2011-01-02,
2,2011-01-03,
3,2011-01-04,
4,2011-01-05,
...,...,...
3403,2020-04-26,Amazon to end 'Unlimited Unpaid Time off' poli...
3404,2020-04-27,One of the Best Amazon Consulting and Marketin...
3405,2020-04-28,3 Reasons Why You Should Not Buy Security Came...
3406,2020-04-29,“A purely political act”: Amazon hits out as p...


In [10]:
# Load 'combined.csv' into df3 (date + news columns per the preview below;
# aliased to `world` in a later cell).
df3=pd.read_csv('combined.csv')

In [11]:
# Remove the 'Unnamed: 0' column in place.
del df3['Unnamed: 0']

In [12]:
# Preview df3.
df3

Unnamed: 0,date,news
0,2011-01-01,"German Interior Minister: ""WikiLeaks is irrita..."
1,2011-01-02,7.1 magnitude earthquake hits Chile.Her father...
2,2011-01-03,Wikileaks releases cable of the July 1990 meet...
3,2011-01-04,"Over 3,000 birds fall dead in AR, over 500 in ..."
4,2011-01-05,Cable from the U.S. Embassy in Tel Aviv says I...
...,...,...
3404,2020-04-26,Public companies took far more small business ...
3405,2020-04-27,Leader of North Carolina lockdown Protest grou...
3406,2020-04-28,An Indiana postal worker was shot to death. Th...
3407,2020-04-29,US economy shrinks at fastest rate since 2008C...


In [13]:
# Role-based aliases for the loaded frames. These are the same objects, not
# copies: `world` is df3 ('combined.csv'), `company` is df2 ('Amazon_v2.csv')
# and `year` is df1 ('Netflix-DL.csv').
world, company, year = df3, df2, df1

In [14]:
# VADER compound score for each row of company['news'].
# str() lets non-string cells (e.g. missing values) pass through the analyzer.
score1 = [
    sia.polarity_scores(str(headline))['compound']
    for headline in company['news']
]

In [15]:
# VADER compound score for each row of world['news'].
# str() lets non-string cells (e.g. missing values) pass through the analyzer.
score2 = [
    sia.polarity_scores(str(headline))['compound']
    for headline in world['news']
]

In [16]:
# Attach the sentiment lists as new columns. `company` and `world` are aliases
# of df2 and df3, so those frames gain the columns as well.
company['score1']=score1
world['score2']=score2

In [17]:
# Preview the world-news frame with its new score2 column.
world

Unnamed: 0,date,news,score2
0,2011-01-01,"German Interior Minister: ""WikiLeaks is irrita...",-0.9793
1,2011-01-02,7.1 magnitude earthquake hits Chile.Her father...,-0.8862
2,2011-01-03,Wikileaks releases cable of the July 1990 meet...,0.7987
3,2011-01-04,"Over 3,000 birds fall dead in AR, over 500 in ...",-0.8556
4,2011-01-05,Cable from the U.S. Embassy in Tel Aviv says I...,-0.9796
...,...,...,...
3404,2020-04-26,Public companies took far more small business ...,0.9310
3405,2020-04-27,Leader of North Carolina lockdown Protest grou...,0.9882
3406,2020-04-28,An Indiana postal worker was shot to death. Th...,-0.9821
3407,2020-04-29,US economy shrinks at fastest rate since 2008C...,0.9617


In [18]:
# Normalise the join key: a 'Date' column, if present, becomes 'date' so it
# matches the other frames. rename() returns a new frame, so df1 is untouched.
year = year.rename({'Date': 'date'}, axis='columns')

In [19]:
from functools import reduce

# Fold the three frames into one by merging on 'date', left to right.
# The default merge is an inner join, so only dates present in every frame
# survive. The second merge sees two `news` columns and suffixes them as
# news_x / news_y.
dfs = [year, company, world]
df_final = reduce(lambda acc, nxt: acc.merge(nxt, on='date'), dfs)

In [20]:
# The raw headline text is no longer needed once scored; drop both suffixed
# news columns in place.
for stale_column in ('news_x', 'news_y'):
    del df_final[stale_column]

In [21]:
# Preview the merged frame: price columns plus score1 and score2.
df_final

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,score1,score2
0,2011-01-03,181.369995,186.000000,181.210007,184.220001,184.220001,5331400.0,0.0000,0.7987
1,2011-01-04,186.149994,187.699997,183.779999,185.009995,185.009995,5031800.0,0.0000,-0.8556
2,2011-01-05,184.100006,187.449997,184.070007,187.419998,187.419998,3418800.0,0.0000,-0.9796
3,2011-01-06,186.500000,187.410004,185.250000,185.860001,185.860001,3179700.0,0.0000,0.0990
4,2011-01-07,187.880005,188.449997,183.740005,185.490005,185.490005,5221700.0,0.0000,-0.5754
...,...,...,...,...,...,...,...,...,...
3400,2020-04-25,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,0.6160,-0.9272
3401,2020-04-26,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,-0.0157,0.9310
3402,2020-04-27,2443.199951,2444.879883,2363.000000,2376.000000,2376.000000,5645600.0,0.9961,0.9882
3403,2020-04-28,2372.100098,2373.500000,2306.000000,2314.080078,2314.080078,5269400.0,-0.3028,-0.9821


In [22]:
# One CSV per keyword series (presumably search-interest data — confirm).
# There is no data2: the 'Covid_19.csv' load was disabled in the original
# cell, and the numbering gap is kept so later cells still find their names.
data0, data1, data3, data4, data5, data6 = [
    pd.read_csv(csv_name)
    for csv_name in (
        'Amazon.csv',
        'Coronavirus.csv',
        'presidential_election.csv',
        'Lockdown.csv',
        'Pandemic.csv',
        'Quarantine.csv',
    )
]



In [23]:
from functools import reduce

# Extend the merged frame with every keyword series, joining on 'date' one
# frame at a time (default inner join).
dfs = [df_final, data0, data1, data3, data4, data5, data6]
df_final1 = reduce(lambda acc, nxt: acc.merge(nxt, on='date'), dfs)

In [25]:
# Preview the frame after the keyword series were merged in.
df_final1

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,score1,score2,Amazon,Coronavirus,presidential election,Lockdown,Pandemic,Quarantine
0,2011-01-03,181.369995,186.000000,181.210007,184.220001,184.220001,5331400.0,0.0000,0.7987,30.26,0.00,0.47,0.70,1.04,0.31
1,2011-01-04,186.149994,187.699997,183.779999,185.009995,185.009995,5031800.0,0.0000,-0.8556,29.58,0.00,0.58,0.62,1.08,0.80
2,2011-01-05,184.100006,187.449997,184.070007,187.419998,187.419998,3418800.0,0.0000,-0.9796,28.56,0.00,0.60,0.88,1.54,0.78
3,2011-01-06,186.500000,187.410004,185.250000,185.860001,185.860001,3179700.0,0.0000,0.0990,28.90,0.00,0.73,0.60,1.30,1.00
4,2011-01-07,187.880005,188.449997,183.740005,185.490005,185.490005,5221700.0,0.0000,-0.5754,28.56,0.00,0.67,0.48,1.00,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3400,2020-04-25,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,0.6160,-0.9272,88.27,26.00,0.70,12.48,37.52,64.68
3401,2020-04-26,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,-0.0157,0.9310,92.15,21.45,0.64,13.65,37.52,68.04
3402,2020-04-27,2443.199951,2444.879883,2363.000000,2376.000000,2376.000000,5645600.0,0.9961,0.9882,89.24,22.10,0.88,16.77,39.53,65.52
3403,2020-04-28,2372.100098,2373.500000,2306.000000,2314.080078,2314.080078,5269400.0,-0.3028,-0.9821,86.33,21.45,0.94,15.21,44.22,61.32


In [26]:
# From here on df_final refers to the fully merged frame. This is an alias,
# not a copy, so later writes to df_final also show up in df_final1.
df_final = df_final1

# Plain-list snapshots used by the gap-filling cells below:
#   stock - the 'Amazon' column
#   a     - the company-news sentiment ('score1') column
# `b` is initialised but not used in the visible cells.
stock = list(df_final['Amazon'])
b = []
a = list(df_final['score1'])

In [27]:

# Collect the indices of the leading run of zero scores (rows before the first
# non-zero company-news score); the next cell back-fills them.
#
# `temp` and `i` are now always defined. Previously they were only assigned
# when a[0] == 0, so a series starting with a real score left the next cell
# reading an undefined (or stale) `temp`. The scan is also bounded by len(a),
# so an empty or all-zero series no longer raises IndexError.
temp = []
i = 0
while i < len(a) and a[i] == 0:
    temp.append(i)
    i += 1

if i == len(a):
    # Every score is zero (or there are none): there is no non-zero value to
    # back-fill from, so leave nothing for the back-fill step to process.
    temp = []
        

In [28]:
# Back-fill the leading zero scores, walking from the last zero toward index 0
# so that a[u+1] is always already populated.
#
# Each score is derived from its successor, adjusted by the relative difference
# of the `stock` series between the two rows (measured against stock[u+1]); in
# effect a[u] ~= a[u+1] * stock[u] / stock[u+1].
#
# The two sign-specific branches of the earlier version reduce to this single
# signed expression. The percent round-trip (*100 then /100) is kept on purpose
# so the floating-point results stay bit-for-bit the same.
for u in reversed(temp):
    change = stock[u] - stock[u+1]
    per = (change / stock[u+1]) * 100
    a[u] = a[u+1] + ((per / 100) * a[u+1])

In [29]:
# Forward-fill every remaining zero score from its predecessor, scaled by the
# relative change of the `stock` series between the two rows (measured against
# stock[i-1]); in effect a[i] ~= a[i-1] * stock[i] / stock[i-1].
#
# The loop starts at 1: index 0 has no predecessor, and with i == 0 the
# expressions stock[i-1] / a[i-1] would silently wrap around to the LAST row
# and seed the first score from the end of the series.
#
# Both sign-specific branches of the earlier version compute the same signed
# expression, so they are merged. The percent round-trip (*100 then /100) is
# kept so results for i >= 1 are bit-for-bit unchanged.
for i in range(1, len(a)):
    if a[i] == 0:
        change = stock[i] - stock[i-1]
        per = (change / stock[i-1]) * 100
        a[i] = a[i-1] + ((per / 100) * a[i-1])

In [30]:
# Write the gap-filled scores back into the frame. df_final was assigned from
# df_final1 without a copy, so df_final1 is updated too.
df_final['score1']=a


In [31]:
# Preview: score1 no longer contains the leading zeros seen earlier.
df_final

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,score1,score2,Amazon,Coronavirus,presidential election,Lockdown,Pandemic,Quarantine
0,2011-01-03,181.369995,186.000000,181.210007,184.220001,184.220001,5331400.0,0.787500,0.7987,30.26,0.00,0.47,0.70,1.04,0.31
1,2011-01-04,186.149994,187.699997,183.779999,185.009995,185.009995,5031800.0,0.769803,-0.8556,29.58,0.00,0.58,0.62,1.08,0.80
2,2011-01-05,184.100006,187.449997,184.070007,187.419998,187.419998,3418800.0,0.743258,-0.9796,28.56,0.00,0.60,0.88,1.54,0.78
3,2011-01-06,186.500000,187.410004,185.250000,185.860001,185.860001,3179700.0,0.752107,0.0990,28.90,0.00,0.73,0.60,1.30,1.00
4,2011-01-07,187.880005,188.449997,183.740005,185.490005,185.490005,5221700.0,0.743258,-0.5754,28.56,0.00,0.67,0.48,1.00,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3400,2020-04-25,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,0.616000,-0.9272,88.27,26.00,0.70,12.48,37.52,64.68
3401,2020-04-26,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,-0.015700,0.9310,92.15,21.45,0.64,13.65,37.52,68.04
3402,2020-04-27,2443.199951,2444.879883,2363.000000,2376.000000,2376.000000,5645600.0,0.996100,0.9882,89.24,22.10,0.88,16.77,39.53,65.52
3403,2020-04-28,2372.100098,2373.500000,2306.000000,2314.080078,2314.080078,5269400.0,-0.302800,-0.9821,86.33,21.45,0.94,15.21,44.22,61.32


In [32]:
# Give the generic columns descriptive names. rename() returns a new frame,
# so from here on df_final is no longer the same object as df_final1.
df_final = df_final.rename(
    columns={
        'score1': 'company_news',
        'score2': 'world_news',
        'Amazon': 'company_trends',
    }
)


In [33]:
# Preview the frame with the renamed columns.
df_final

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,company_news,world_news,company_trends,Coronavirus,presidential election,Lockdown,Pandemic,Quarantine
0,2011-01-03,181.369995,186.000000,181.210007,184.220001,184.220001,5331400.0,0.787500,0.7987,30.26,0.00,0.47,0.70,1.04,0.31
1,2011-01-04,186.149994,187.699997,183.779999,185.009995,185.009995,5031800.0,0.769803,-0.8556,29.58,0.00,0.58,0.62,1.08,0.80
2,2011-01-05,184.100006,187.449997,184.070007,187.419998,187.419998,3418800.0,0.743258,-0.9796,28.56,0.00,0.60,0.88,1.54,0.78
3,2011-01-06,186.500000,187.410004,185.250000,185.860001,185.860001,3179700.0,0.752107,0.0990,28.90,0.00,0.73,0.60,1.30,1.00
4,2011-01-07,187.880005,188.449997,183.740005,185.490005,185.490005,5221700.0,0.743258,-0.5754,28.56,0.00,0.67,0.48,1.00,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3400,2020-04-25,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,0.616000,-0.9272,88.27,26.00,0.70,12.48,37.52,64.68
3401,2020-04-26,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,-0.015700,0.9310,92.15,21.45,0.64,13.65,37.52,68.04
3402,2020-04-27,2443.199951,2444.879883,2363.000000,2376.000000,2376.000000,5645600.0,0.996100,0.9882,89.24,22.10,0.88,16.77,39.53,65.52
3403,2020-04-28,2372.100098,2373.500000,2306.000000,2314.080078,2314.080078,5269400.0,-0.302800,-0.9821,86.33,21.45,0.94,15.21,44.22,61.32


In [34]:
# Prepend a constant ticker column (in place). The scalar is broadcast to
# every row. insert() raises if the column already exists, so this cell
# cannot be re-run on the same frame.
df_final.insert(loc=0, column='Company ticker', value='AMZN')

In [35]:
# Preview the frame with the ticker column in first position.
df_final

Unnamed: 0,Company ticker,date,Open,High,Low,Close,Adj Close,Volume,company_news,world_news,company_trends,Coronavirus,presidential election,Lockdown,Pandemic,Quarantine
0,AMZN,2011-01-03,181.369995,186.000000,181.210007,184.220001,184.220001,5331400.0,0.787500,0.7987,30.26,0.00,0.47,0.70,1.04,0.31
1,AMZN,2011-01-04,186.149994,187.699997,183.779999,185.009995,185.009995,5031800.0,0.769803,-0.8556,29.58,0.00,0.58,0.62,1.08,0.80
2,AMZN,2011-01-05,184.100006,187.449997,184.070007,187.419998,187.419998,3418800.0,0.743258,-0.9796,28.56,0.00,0.60,0.88,1.54,0.78
3,AMZN,2011-01-06,186.500000,187.410004,185.250000,185.860001,185.860001,3179700.0,0.752107,0.0990,28.90,0.00,0.73,0.60,1.30,1.00
4,AMZN,2011-01-07,187.880005,188.449997,183.740005,185.490005,185.490005,5221700.0,0.743258,-0.5754,28.56,0.00,0.67,0.48,1.00,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3400,AMZN,2020-04-25,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,0.616000,-0.9272,88.27,26.00,0.70,12.48,37.52,64.68
3401,AMZN,2020-04-26,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,-0.015700,0.9310,92.15,21.45,0.64,13.65,37.52,68.04
3402,AMZN,2020-04-27,2443.199951,2444.879883,2363.000000,2376.000000,2376.000000,5645600.0,0.996100,0.9882,89.24,22.10,0.88,16.77,39.53,65.52
3403,AMZN,2020-04-28,2372.100098,2373.500000,2306.000000,2314.080078,2314.080078,5269400.0,-0.302800,-0.9821,86.33,21.45,0.94,15.21,44.22,61.32


In [36]:
# Derive calendar features from the date strings (formatted like
# '2011-01-03' in the previews): characters 0-3 are the year, 5-6 the month.
# Note: this rebinds `year`, which earlier in the notebook named a DataFrame.
year, month, quarter = [], [], []
for stamp in df_final['date']:
    year.append(int(stamp[:4]))
    month_no = int(stamp[5:7])
    month.append(month_no)
    # Quarter = 1 plus the number of quarter-opening months (April, July,
    # October) that month_no has reached; each comparison adds 0 or 1.
    quarter.append(1 + (month_no >= 4) + (month_no >= 7) + (month_no >= 10))

In [37]:
# Place the calendar features at positions 2, 3 and 4 (in place). insert()
# raises on an existing column, so this cell cannot be re-run on the same frame.
calendar_columns = [('Year', year), ('Month', month), ('Quarter', quarter)]
for position, (label, values) in enumerate(calendar_columns, start=2):
    df_final.insert(position, label, values)

In [38]:
# Persist the finished feature table as UTF-8 CSV with a header row.
# `index` is left at its default, so the row index is written as an unnamed
# first column. The 'final/' directory is assumed to exist already.
df_final.to_csv('final/Amazon.csv',header=True,encoding='utf-8')

In [235]:
# world.to_csv('final/xyz.csv',header=True,encoding='utf-8')

In [39]:
# Preview the table as saved, including the Year/Month/Quarter columns.
df_final

Unnamed: 0,Company ticker,date,Year,Month,Quarter,Open,High,Low,Close,Adj Close,Volume,company_news,world_news,company_trends,Coronavirus,presidential election,Lockdown,Pandemic,Quarantine
0,AMZN,2011-01-03,2011,1,1,181.369995,186.000000,181.210007,184.220001,184.220001,5331400.0,0.787500,0.7987,30.26,0.00,0.47,0.70,1.04,0.31
1,AMZN,2011-01-04,2011,1,1,186.149994,187.699997,183.779999,185.009995,185.009995,5031800.0,0.769803,-0.8556,29.58,0.00,0.58,0.62,1.08,0.80
2,AMZN,2011-01-05,2011,1,1,184.100006,187.449997,184.070007,187.419998,187.419998,3418800.0,0.743258,-0.9796,28.56,0.00,0.60,0.88,1.54,0.78
3,AMZN,2011-01-06,2011,1,1,186.500000,187.410004,185.250000,185.860001,185.860001,3179700.0,0.752107,0.0990,28.90,0.00,0.73,0.60,1.30,1.00
4,AMZN,2011-01-07,2011,1,1,187.880005,188.449997,183.740005,185.490005,185.490005,5221700.0,0.743258,-0.5754,28.56,0.00,0.67,0.48,1.00,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3400,AMZN,2020-04-25,2020,4,2,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,0.616000,-0.9272,88.27,26.00,0.70,12.48,37.52,64.68
3401,AMZN,2020-04-26,2020,4,2,2417.000000,2432.654907,2372.500000,2393.109985,2393.109985,4738700.0,-0.015700,0.9310,92.15,21.45,0.64,13.65,37.52,68.04
3402,AMZN,2020-04-27,2020,4,2,2443.199951,2444.879883,2363.000000,2376.000000,2376.000000,5645600.0,0.996100,0.9882,89.24,22.10,0.88,16.77,39.53,65.52
3403,AMZN,2020-04-28,2020,4,2,2372.100098,2373.500000,2306.000000,2314.080078,2314.080078,5269400.0,-0.302800,-0.9821,86.33,21.45,0.94,15.21,44.22,61.32


In [237]:
# Reload a previously saved table for another company.
# NOTE(review): no index_col is given, so if the file was written with its row
# index (as 'final/Amazon.csv' is above), that index comes back as an
# 'Unnamed: 0' data column — confirm.
df=pd.read_csv('final/Facebook.csv')

In [238]:
# Rename the company-specific column in the frame that was just loaded from
# 'final/Facebook.csv'. The earlier version applied this rename to df_final,
# which has no 'Facebook' column, so it was a no-op and the file written in
# the next cell kept its old column name.
df = df.rename({'Facebook':'company_trends'}, axis='columns')


In [239]:
# Write `df` back over the file it was loaded from (UTF-8, with header).
# NOTE(review): the row index is written again here (default index=True); as
# the file was read without index_col, each load/save cycle may add another
# unnamed index column — confirm.
df.to_csv('final/Facebook.csv',header=True,encoding='utf-8')