In [1]:
import pandas as pd
import plotly
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [2]:
import textblob
from textblob import TextBlob

In [3]:
df = pd.read_csv('all_tweets.csv')

In [4]:
df.drop('Unnamed: 0', axis = 1, inplace = True)
df = df[~df['tweet_text'].isnull()]

In [5]:
# Remove breaks, punctuation, special characters 
def preprocess(tweet_text):
    ReviewText = tweet_text.str.replace("(<br/>)", "")
    ReviewText = tweet_text.str.replace('(<a).*(>).*(</a>)', '')
    ReviewText = tweet_text.str.replace('(&amp)', '')
    ReviewText = tweet_text.str.replace('(&gt)', '')
    ReviewText = tweet_text.str.replace('(&lt)', '')
    ReviewText = tweet_text.str.replace('(\xa0)', ' ')  
    return tweet_text
df['tweet_text'] = preprocess(df['tweet_text'])

In [6]:
#df['polarity'] = df['tweet_text'].map(lambda text: TextBlob(text).sentiment.polarity)

In [7]:
# Tweet Length
df['tweet_len'] = df['tweet_text'].astype(str).apply(len)

In [8]:
# Number of words in tweet
df['word_count'] = df['tweet_text'].apply(lambda x: len(str(x).split()))

In [9]:
print('5 random tweets with the highest positive sentiment polarity: \n')
cl = df.loc[df.polarity == 1, ['tweet_text']].sample(5).values
for c in cl:
    print(c[0])

5 random tweets with the highest positive sentiment polarity: 

Today in market history, 1706:

Benjamin Franklin, polymath, Founding Father and the face on the $100 bill, is born in Boston, Massachusetts. He is quoting as saying: ""An investment in knowledge always pays the best interest."

image: https://t.co/wFBozLfgBH https://t.co/UnECwxzU8E
RT @JyrkiUurasmaa: Q: At what age did Buffett make his best investment dollarwise?

A: At the age 87 years. With Apple, 40 billion bucks in…
Stocktwits Trending Tickers BMO 12/17/19 
 
$BA Halted 737 MAX production.  
$BBBY Leadership team is being restructured.  
$AMZN Named ‘a best idea for 2020’ at @CowenGroupInc.  
$JBL +10% pre-market, earnings beat BMO. 
$FDX Earnings AMC. 
 
et al.  
 
https://t.co/EjhDPj2zZC
Some of the best chart-posters on here: @hmeisler @teasri @LJKawa @pearkes https://t.co/hfbmnchwKK
$AFMD - Affimed: An Impressive Platform And An Enticing Valuation. https://t.co/SFBQLkLMox #stockmarket #investing #finance


In [10]:
# Distribution of sentiment polarity
df['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

In [11]:
# Distribution of tweet lengths
df['tweet_len'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Tweet Text Length Distribution')

In [12]:
# Distribution of word counts in tweets
df['word_count'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Tweet Text Word Count Distribution')

In [13]:
# Distribution of Twitter Handles
df['screen_name'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Twitter Handle Count Distribution')

In [14]:
# Count the word frequency in all tweets
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
common_words = get_top_n_words(df['tweet_text'], 20)
for word, freq in common_words:
    print(word, freq)
df1 = pd.DataFrame(common_words, columns = ['tweet_text' , 'count'])
df1.groupby('tweet_text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in tweets')

the 1788
https 1424
co 1420
to 1016
and 881
of 797
in 705
is 623
on 497
for 460
apple 406
it 399
netflix 387
rt 308
this 292
that 279
are 257
with 257
you 245
nflx 245


In [17]:
# Count the word frequency in all tweets, but removed stop words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(df['tweet_text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in tweets after removing stop words')

https 1424
apple 406
netflix 387
rt 308
nflx 245
amazon 211
amzn 176
market 172
google 170
aapl 156
facebook 124
amp 120
earnings 118
today 118
year 113
fb 112
time 111
just 109
10 106
new 97


In [18]:
# Count of two words (bigrams) in sequence in tweets
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(df['tweet_text'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns = ['tweet_text' , 'count'])
df3.groupby('tweet_text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in tweets')

https co 1413
of the 155
in the 151
for the 70
to the 67
over the 59
and the 53
on the 51
to be 47
is the 46
will be 46
market cap 44
this is 43
at the 42
nflx https 40
the past 39
going to 38
if you 37
amp 500 33
more than 32


In [19]:
# Count of two words (bigrams) in sequence in tweets with stop words removed
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(df['tweet_text'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns = ['tweet_text' , 'count'])
df4.groupby('tweet_text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in tweets after removing stop words')


market cap 44
nflx https 40
amp 500 33
rt thestreet 30
aapl https 26
ritholtz https 22
apple microsoft 21
cash flow 19
revenue billions 19
fb https 19
netflix nflx 19
rt trevornoren 19
amzn https 17
abnormalreturns https 16
reads barry 16
billions 2019 15
netflix https 15
today market 15
past year 14
wall street 14


In [20]:
# Count of three words (trigrams) in sequence in tweets
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(df['tweet_text'], 20)
for word, freq in common_words:
    print(word, freq)
df5 = pd.DataFrame(common_words, columns = ['tweet_text' , 'count'])
df5.groupby('tweet_text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in tweets')

nflx https co 39
over the past 26
aapl https co 26
ritholtz https co 22
by ritholtz https 21
fb https co 18
by abnormalreturns https 16
abnormalreturns https co 16
amzn https co 15
one of the 14
the amp 500 14
for the first 14
the first time 14
over the next 14
2004 2003 2002 13
all time high 13
the past year 12
revenue billions 2019 12
new all time 12
netflix https co 12


In [21]:
# Count of three words (trigrams) in sequence in tweets with stop words removed
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(df['tweet_text'], 20)
for word, freq in common_words:
    print(word, freq)
df6 = pd.DataFrame(common_words, columns = ['tweet_text' , 'count'])
df6.groupby('tweet_text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in tweets after removing stop words')

2004 2003 2002 13
revenue billions 2019 12
2003 2002 2001 11
billions 2019 est 11
new time high 10
free cash flow 10
billion market cap 10
disneyplus netflix hbomax 8
netflix hbomax warnermediagrp 8
today market history 8
https viwc0tamhi https 8
revenue billions yoy 7
jimcramer tomkeene squawkcnbc 7
tomkeene squawkcnbc cnbcfastmoney 7
today market moment 7
net income billions 6
2008 2007 2006 6
2007 2006 2005 6
2006 2005 2004 6
2005 2004 2003 6


In [22]:
import nltk

In [23]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gongrh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gongrh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [25]:
blob = TextBlob(str(df['tweet_text']))
pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count', 
    title='Top 20 Part-of-speech tagging for tweet corpus')

In [26]:
import plotly.graph_objects

In [27]:
y0 = df.loc[df['screen_name'] == '@charliebilello']['polarity']
y1 = df.loc[df['screen_name'] == '@David_Kretzmann']['polarity']
y2 = df.loc[df['screen_name'] == '@DougKass']['polarity']
y3 = df.loc[df['screen_name'] == '@FinancialTimes']['polarity']
y4 = df.loc[df['screen_name'] == '@GerberKawasaki']['polarity']
y5 = df.loc[df['screen_name'] == '@Hipster_Trader']['polarity']
y6 = df.loc[df['screen_name'] == '@IncomeDisparity']['polarity']
y7 = df.loc[df['screen_name'] == '@Reuters']['polarity']
y8 = df.loc[df['screen_name'] == '@RichLightShed']['polarity']
y9 = df.loc[df['screen_name'] == '@RitholtzWealth']['polarity']
y10 = df.loc[df['screen_name'] == '@Stocktwits']['polarity']
y11 = df.loc[df['screen_name'] == '@TDANetwork']['polarity']

trace0 = plotly.graph_objects.Box(
    y=y0,
    name = '@charliebilello',
    marker = dict(
        color = 'rgb(214, 12, 140)',
    )
)
trace1 = plotly.graph_objects.Box(
    y=y1,
    name = '@David_Kretzmann',
    marker = dict(
        color = 'rgb(0, 128, 128)',
    )
)
trace2 = plotly.graph_objects.Box(
    y=y2,
    name = '@DougKass',
    marker = dict(
        color = 'rgb(10, 140, 208)',
    )
)
trace3 = plotly.graph_objects.Box(
    y=y3,
    name = '@FinancialTimes',
    marker = dict(
        color = 'rgb(12, 102, 14)',
    )
)
trace4 = plotly.graph_objects.Box(
    y=y4,
    name = '@GerberKawasaki',
    marker = dict(
        color = 'rgb(10, 0, 100)',
    )
)
trace5 = plotly.graph_objects.Box(
    y=y5,
    name = '@Hipster_Trader',
    marker = dict(
        color = 'rgb(100, 0, 10)',
    )
)
data = [trace0, trace1, trace2, trace3, trace4, trace5]
layout = plotly.graph_objects.Layout(
    title = "Sentiment Polarity Boxplot of Twitter Handles"
)

fig = plotly.graph_objects.Figure(data=data,layout=layout)
fig

In [28]:
trace6 = plotly.graph_objects.Box(
    y=y6,
    name = '@IncomeDisparity',
    marker = dict(
        color = 'rgb(214, 12, 140)',
    )
)
trace7 = plotly.graph_objects.Box(
    y=y7,
    name = '@Reuters',
    marker = dict(
        color = 'rgb(0, 128, 128)',
    )
)
trace8 = plotly.graph_objects.Box(
    y=y8,
    name = '@RichLightShed',
    marker = dict(
        color = 'rgb(10, 140, 208)',
    )
)
trace9 = plotly.graph_objects.Box(
    y=y9,
    name = '@RitholtzWealth',
    marker = dict(
        color = 'rgb(12, 102, 14)',
    )
)
trace10 = plotly.graph_objects.Box(
    y=y10,
    name = '@Stocktwits',
    marker = dict(
        color = 'rgb(10, 0, 100)',
    )
)
trace11 = plotly.graph_objects.Box(
    y=y11,
    name = '@TDANetwork',
    marker = dict(
        color = 'rgb(100, 0, 10)',
    )
)
data = [trace6, trace7, trace8, trace9, trace10, trace11]
layout = plotly.graph_objects.Layout(
    title = "Sentiment Polarity Boxplot of Twitter Handles"
)

fig2 = plotly.graph_objects.Figure(data=data,layout=layout)
fig2

In [29]:
y0 = df.loc[df['screen_name'] == '@charliebilello']['tweet_len']
y1 = df.loc[df['screen_name'] == '@David_Kretzmann']['tweet_len']
y2 = df.loc[df['screen_name'] == '@DougKass']['tweet_len']
y3 = df.loc[df['screen_name'] == '@FinancialTimes']['tweet_len']
y4 = df.loc[df['screen_name'] == '@GerberKawasaki']['tweet_len']
y5 = df.loc[df['screen_name'] == '@Hipster_Trader']['tweet_len']
y6 = df.loc[df['screen_name'] == '@IncomeDisparity']['tweet_len']
y7 = df.loc[df['screen_name'] == '@Reuters']['tweet_len']
y8 = df.loc[df['screen_name'] == '@RichLightShed']['tweet_len']
y9 = df.loc[df['screen_name'] == '@RitholtzWealth']['tweet_len']
y10 = df.loc[df['screen_name'] == '@Stocktwits']['tweet_len']
y11 = df.loc[df['screen_name'] == '@TDANetwork']['tweet_len']

In [30]:
trace0 = plotly.graph_objects.Box(
    y=y0,
    name = '@charliebilello',
    marker = dict(
        color = 'rgb(214, 12, 140)',
    )
)
trace1 = plotly.graph_objects.Box(
    y=y1,
    name = '@David_Kretzmann',
    marker = dict(
        color = 'rgb(0, 128, 128)',
    )
)
trace2 = plotly.graph_objects.Box(
    y=y2,
    name = '@DougKass',
    marker = dict(
        color = 'rgb(10, 140, 208)',
    )
)
trace3 = plotly.graph_objects.Box(
    y=y3,
    name = '@FinancialTimes',
    marker = dict(
        color = 'rgb(12, 102, 14)',
    )
)
trace4 = plotly.graph_objects.Box(
    y=y4,
    name = '@GerberKawasaki',
    marker = dict(
        color = 'rgb(10, 0, 100)',
    )
)
trace5 = plotly.graph_objects.Box(
    y=y5,
    name = '@Hipster_Trader',
    marker = dict(
        color = 'rgb(100, 0, 10)',
    )
)
data = [trace0, trace1, trace2, trace3, trace4, trace5]
layout = plotly.graph_objects.Layout(
    title = "Tweet Length Boxplot of Twitter Handles"
)

fig3 = plotly.graph_objects.Figure(data=data,layout=layout)
fig3

In [31]:
trace6 = plotly.graph_objects.Box(
    y=y6,
    name = '@IncomeDisparity',
    marker = dict(
        color = 'rgb(214, 12, 140)',
    )
)
trace7 = plotly.graph_objects.Box(
    y=y7,
    name = '@Reuters',
    marker = dict(
        color = 'rgb(0, 128, 128)',
    )
)
trace8 = plotly.graph_objects.Box(
    y=y8,
    name = '@RichLightShed',
    marker = dict(
        color = 'rgb(10, 140, 208)',
    )
)
trace9 = plotly.graph_objects.Box(
    y=y9,
    name = '@RitholtzWealth',
    marker = dict(
        color = 'rgb(12, 102, 14)',
    )
)
trace10 = plotly.graph_objects.Box(
    y=y10,
    name = '@Stocktwits',
    marker = dict(
        color = 'rgb(10, 0, 100)',
    )
)
trace11 = plotly.graph_objects.Box(
    y=y11,
    name = '@TDANetwork',
    marker = dict(
        color = 'rgb(100, 0, 10)',
    )
)
data = [trace6, trace7, trace8, trace9, trace10, trace11]
layout = plotly.graph_objects.Layout(
    title = "Tweet Length Boxplot of Twitter Handles"
)

fig4 = plotly.graph_objects.Figure(data=data,layout=layout)
fig4