# Hashtag Analysis

#### Importing useful libraries

In [1]:
# Import libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Replace file path with the personal one in which the dataset is contained
chunk = pd.read_csv("/Users/eliacannas/Desktop/SentimentAnalysis/venv/tweets_sentimentAnalysis.csv", chunksize=100000,lineterminator='\n', low_memory=False)
dfSent = pd.concat(chunk)
dfSent.info()
dfSent.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2689765 entries, 0 to 2689764
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_name         object 
 1   user_location     object 
 2   user_description  object 
 3   user_created      object 
 4   user_followers    int64  
 5   user_friends      int64  
 6   user_favourites   int64  
 7   user_verified     bool   
 8   date              object 
 9   text              object 
 10  hashtags          object 
 11  source            object 
       object t
 13  polarity          float64
 14  subjectivity      float64
dtypes: bool(1), float64(2), int64(3), object(9)
memory usage: 289.9+ MB


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet\r,polarity,subjectivity
0,NFT Nomad 💎,,NFT Creator:\nhttps://t.co/gz5OlavMCa,2021-11-24 07:08:31,65,64,111,False,2022-01-11 17:11:49,$ LNR going huge ! ! ! ! Kucoin bitmart listin...,"['Ethereum', 'crypto', 'cryptocurrency', 'nft'...",Twitter for Android,False\r,0.55,0.65
1,Digital Shogun ပωပ 🛡🌐⚔,Rings of Saturn,Non-Binary🏳️‍🌈Metaverse Builder. Web3 Content ...,2021-01-19 10:54:08,4082,221,59613,False,2022-01-11 17:11:49,"Another decent bounce # Bitcoin $ BTC 1w , 1d ...","['Bitcoin', 'HODL']",Twitter for Android,False\r,0.222222,0.518519
2,Conor Okus,Global,Making #Bitcoin more than an investment @spira...,2012-01-16 20:55:11,3771,439,11363,False,2022-01-11 17:11:49,Heads ! I 'll going live Thursday 18:30 UTC @ ...,['Bitcoin'],Twitter Web App,False\r,0.568182,0.4
3,Kyptos.com,,https://t.co/d03RricqLo - Latest News About Cr...,2021-09-20 02:40:53,10330,217,36,False,2022-01-11 17:11:50,Will LUNA Price Bounce Again .382 Fibonacci Re...,"['bitcoin', 'ElSalvador', 'metaverse', 'NFTs',...",IFTTT,False\r,0.0,0.0
4,Dr. Crypto Jones,"Atlanta, GA",I am NOT a financial advisor. I AM a cryptocur...,2021-03-21 01:27:11,19,109,132,False,2022-01-11 17:11:51,"Not much , 2022 good baby ! Bing bong ! # NFT ...","['NFT', 'Ethereum', 'Bitcoin']",Twitter for Android,False\r,0.45,0.4


#### Correcting column types

In [3]:
# Conversion of column types
dfSent = dfSent.convert_dtypes()
dfSent.dtypes

user_name           string[python]
user_location       string[python]
user_description    string[python]
user_created        string[python]
user_followers               Int64
user_friends                 Int64
user_favourites              Int64
user_verified              boolean
date                string[python]
text                string[python]
hashtags            string[python]
source              string[python]
is_retweet\r        string[python]
polarity                   Float64
subjectivity               Float64
dtype: object

#### Sentiment column creation based on polarity

In [4]:
# Assigning positive, negative or neutral value based on polarity, creating new column in dataset 'Sentiment' and removing unnecessary columns
df_sentiment = dfSent.copy()
df_sentiment['Sentiment'] = df_sentiment["polarity"].apply(lambda x: "Positive" if x > 0 else( "Negative" if x < 0  else "Neutral"))
df_sentiment.drop(['user_description','user_created','source','subjectivity'],axis=1,inplace=True)
df_sentiment.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet\r,polarity,Sentiment
0,NFT Nomad 💎,,65,64,111,False,2022-01-11 17:11:49,$ LNR going huge ! ! ! ! Kucoin bitmart listin...,"['Ethereum', 'crypto', 'cryptocurrency', 'nft'...",False,0.55,Positive
1,Digital Shogun ပωပ 🛡🌐⚔,Rings of Saturn,4082,221,59613,False,2022-01-11 17:11:49,"Another decent bounce # Bitcoin $ BTC 1w , 1d ...","['Bitcoin', 'HODL']",False,0.222222,Positive
2,Conor Okus,Global,3771,439,11363,False,2022-01-11 17:11:49,Heads ! I 'll going live Thursday 18:30 UTC @ ...,['Bitcoin'],False,0.568182,Positive
3,Kyptos.com,,10330,217,36,False,2022-01-11 17:11:50,Will LUNA Price Bounce Again .382 Fibonacci Re...,"['bitcoin', 'ElSalvador', 'metaverse', 'NFTs',...",False,0.0,Neutral
4,Dr. Crypto Jones,"Atlanta, GA",19,109,132,False,2022-01-11 17:11:51,"Not much , 2022 good baby ! Bing bong ! # NFT ...","['NFT', 'Ethereum', 'Bitcoin']",False,0.45,Positive


In [5]:
# String formatting
df_sentiment['date'] = pd.to_datetime(df_sentiment['date'], format='%Y-%m-%d %H:%M:%S').dt.date

#### Filtering study period bitcoin trend

In [6]:
# Filter the DataFrame for dates between March and June 2022
start_date = pd.to_datetime('2022-03-01').date()
end_date = pd.to_datetime('2022-06-30').date()
df_sentiment = df_sentiment.loc[(df_sentiment['date'] >= start_date) & (df_sentiment['date'] <= end_date)].reset_index(drop=True)

#### Search for most influential hashtags

In [74]:
def hashtagInfluent_general(df):
    # Create a list of all hashtags
    all_hashtags = [] # List
    for hashtags_str in df['hashtags']:
        # Remove unwanted characters and split to get list of hashtags
        hashtags = [tag.strip("[]'") for tag in hashtags_str.split(', ')]
        all_hashtags.extend(hashtags)
    
    # Remove any special characters from the hashtags
    all_hashtags = [tag.strip("#") for tag in all_hashtags]
    
    # Count the frequency of hashtags
    hashtags_frequency = pd.Series(all_hashtags).value_counts()
    
    # View the N most influential hashtags (change N to your choice)
    number_hashtags_influent = 51
    hashtags_influent = hashtags_frequency.head(number_hashtags_influent)
    
    # Get an array of the 50 most influential hashtags
    array_hashtags_influent = hashtags_influent.index.tolist()
    
    # Print the most influential hashtags
    print("The most influential hashtags in the dataset:")
    print(array_hashtags_influent)
    return array_hashtags_influent


In [75]:
df_sentiment['hashtags'] = df_sentiment['hashtags'].astype(str)
array = hashtagInfluent_general(df_sentiment) # Save result in array for use next

The most influential hashtags in the dataset:
['BTC', 'Bitcoin', 'bitcoin', 'btc', 'ETH', 'cryptocurrency', 'crypto', 'Crypto', 'NFT', 'Ethereum', 'BNB', 'eth', 'Binance', 'blockchain', 'BSC', 'NFTs', 'DeFi', 'Metaverse', 'nft', 'NFTCommunity', 'ethereum', 'binance', 'Airdrop', 'altcoin', 'ADA', 'SHIB', 'trading', 'nftart', 'Solana', 'LightningNetwork', 'BITCOIN', 'LN', 'boltcoiner', 'CryptoMining', 'CryptoNews', 'XRP', 'USDT', 'Web3', 'Cryptocurrency', 'SeasonalTokens', 'bnb', 'cryptotrading', 'cryptonews', 'cryptocurrencies', 'DOGE', 'coinhuntworld', 'SOL', 'dogecoin', 'AVAX', 'whale', 'money']


#### Hashtags and tweet for the period of interest

In [76]:
# Function to find the most influential hashtags in a given period
def hashtagInfluent_period(df):
    # Create a list of all unique hashtags in the period of interest
    all_hashtags = []
    for hashtags_str in filtered_df['hashtags']:
        # Remove unwanted characters and split to get list of hashtags
        hashtags = [tag.strip("[]'") for tag in hashtags_str.split(', ')]
        all_hashtags.extend(hashtags)
    
    # Remove any special characters from the hashtags
    all_hashtags = [tag.strip("#") for tag in all_hashtags]
    
    # Count the frequency of unique hashtags
    hashtags_frequency = pd.Series(all_hashtags).value_counts()
    
    # Specifies the number of most influential hashtags to include
    number_hashtags_influent = 150
    
    # Get the n most influential hashtags
    hashtags_influent = hashtags_frequency.head(number_hashtags_influent).index.tolist()
    
    # Remove hashtags from the list hashtags_influent_first
    for hashtag in array:
        if hashtag in hashtags_influent:
            hashtags_influent.remove(hashtag)
    
    # Print the n most influential hashtags
    print(f"The {number_hashtags_influent} most influential hashtags in the period of interest:")
    print(hashtags_influent)


In [77]:
# Function to print tweets with certain hashtags
def see_tweet_on_hashtag(df, list_hashtag):
    for hashtag in list_hashtag:
        # Filters the DataFrame for tweets with the specified hashtag and negative sentiment
        tweets_hashtags_negative = df[
            (df['hashtags'].apply(lambda x: hashtag in x)) & 
            (df['Sentiment'] == 'Negative')
        ]

         # Displays the tweet count for the current hashtag
        num_tweets = len(tweets_hashtags_negative)
        print(f"Hashtag: {hashtag}")
        print(f"Number of associated tweets: {num_tweets}\n")
        
        # View the text of tweets for the current hashtag
        print(f"Hashtag: {hashtag}\n")
        for index, tweet in tweets_hashtags_negative.iterrows():
            print(f"Tweet ID: {index}")
            print(f"Text: {tweet['text']}")
            print("\n")
        print("="*50 + "\n")  # Add a dividing line between hashtags

#### Hashtag and tweets first period

In [78]:
# Select the time window of the first period
start_date = pd.to_datetime('2022-05-11').date()
end_date = pd.to_datetime('2022-05-14').date()

# Filters the DataFrame for the specified dates
filtered_df = df_sentiment[(df_sentiment['date'] >= start_date) & (df_sentiment['date'] <= end_date) & (
            df_sentiment['Sentiment'] == 'Negative')]

# Control column type hashtag
filtered_df['hashtags'] = filtered_df['hashtags'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['hashtags'] = filtered_df['hashtags'].astype(str)


#### Most influential hashtags

In [79]:
# Call the function for print influent hashtag
hashtagInfluent_period(filtered_df)

The 150 most influential hashtags in the period of interest:
['Cryptocrash', 'LUNA', 'BitcoinCrash', 'terraluna', 'UST', 'luna', 'DCA', 'Luna', 'TradingSignals', 'Terra', 'Lunacrash', 'HODL', 'Blockchain', 'lunacoin', 'altcoins', 'doge', 'nfts', 'Coinbase', 'news', 'Tether', 'Altcoins', 'Cardano', 'cryptocrash', 'LUNAtics', 'NFA', 'Tesla', 'investing', 'buythedip', 'CryptocurrencyNews', 'bearmarket', 'TerraUSD', 'gold', 'ust', 'coinbase', 'ElonMusk', 'metaverse', 'RsiBitcoin', 'xrp', 'cryptocurrecy', 'LUNAUSDT', 'memecoin', 'SavageBeauty', 'INHLOSO', 'web3', 'elon', 'bitcoinnews', 'Cryptos', 'nftcollector', 'sanji', 'bscgem', 'sanjiinu', 'opensea', 'P2E', 'solana', 'stockmarketcrash', 'cryptogems', 'token', 'bscgemalert', 'stocks', 'next100xgem', 'Bscgems', 'nftcommunity', 'terra', 'shiba', 'art', 'defi', 'StockMarket', 'investment', 'markets', 'PancakeSwap', 'finance', 'inflation', 'Bitcoin2022', 'business', 'Gold', 'FGW', 'FunnyGameWorld', 'BTCUSD', 'Bollinger', 'RSİ', 'forex', 'Doge

#### Tweets containing the hashtags

In [80]:
# List of hashtags of interest
list_hashtag_interest = ['bearmarket', 'stockmarketcrash', 'Bollinger']

In [81]:
# Call the function with the list of hashtags
see_tweet_on_hashtag(filtered_df, list_hashtag_interest)

Hashtag: bearmarket
Number of associated tweets: 70

Hashtag: bearmarket

Tweet ID: 875711
Text: You know # crypto markets fucked 's core coins green # stablecoins # luna # Cryptocrash # cryptocurrencies # buythedip # bearmarket # Bitcoin # BitcoinCrash https : //t.co/q4QxE3XenM


Tweet ID: 879391
Text: Time accept sad truth `` Bear Market '' ! # Bitcoin # bearmarket # cryptocurrency


Tweet ID: 880940
Text: I got heart attack looking portfolio . -25 % past 24 hours , I 50 % loss , general , That 's ... scary lot ? How guys living life # bearmarket ? 😢 # buythedip # bitcoin


Tweet ID: 881844
Text: The market bad . Are buying dip ? # stockmarketcrash # StockMarket # Cryptocrash # cryptotrading # BTC # BitcoinCrash # Bitcoin # Ethereum # ETH # ETHNFTs # NFTs # buythedip # HODL # APE # SOL # Solana # fintech # news # bearmarket # AMC # GME https : //t.co/mF4LO5Wq6E


Tweet ID: 882324
Text: Do n't Worry The Only Chart Matter This One Bitcoin Will Always Be The Winner In Long Term # crypto

#### Hashtag and tweets second period

In [82]:
# Select the time window of the first period
start_date = pd.to_datetime('2022-06-14').date()
end_date = pd.to_datetime('2022-06-17').date()

# Filters the DataFrame for the specified dates
filtered_df = df_sentiment[(df_sentiment['date'] >= start_date) & (df_sentiment['date'] <= end_date) & (
            df_sentiment['Sentiment'] == 'Negative')]

# Control column type hashtag
filtered_df['hashtags'] = filtered_df['hashtags'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['hashtags'] = filtered_df['hashtags'].astype(str)


#### Most influential hashtags

In [83]:
# Call the function for print influent hashtag
hashtagInfluent_period(filtered_df)

The 150 most influential hashtags in the period of interest:
['DroverInu', 'DroverToken', 'Cryptocrash', 'Forex', 'Trading', 'BabyDogeCoin', 'Game', 'METAVERSE', 'Money', 'Cryptos', 'BitcoinCrash', 'EURUSD', 'CryptoMarket', 'bearmarket', 'Luna', 'Btc', 'CryptocurrencyNews', 'TRX', 'metaverse', 'FOMC', 'altcoins', 'Etherum', 'GBPUSD', 'Coinbase', 'Celsius', 'coinbase', 'Frzss', 'investing', 'Blockchain', 'NFTGiveaway', 'HODL', 'inflation', 'nfts', 'xrp', 'memecoin', 'Fed', 'BTCUSD', 'stocks', 'RsiBitcoin', 'defi', 'doge', 'bitcoinnews', 'CRYPTO', 'Cardano', 'cripto', 'news', 'Gamefi', 'LUNA', 'pancakeswap', 'bscgems', 'bschunter', 'birdbro', 'birdbrofamily', 'birdbrocn', 'birdbrobr', 'birdbroarmy', 'birdbrotoken', 'web3', 'birdbropt', 'P2E', 'Altcoins', 'NFTCommmunity', 'NASDAQ', 'zoomout', 'ada', 'BTCs', 'game', 'Gold', 'StockMarket', 'sol', 'cryptocrash', 'BTCUSDT', 'opensea', 'gold', 'stockmarketcrash', 'CelsiusNetwork', 'fintech', 'CyclicalInvesting', 'luna', 'cro', 'forex', 'crofam

#### Tweets containing the hashtags

In [84]:
# List of hashtags of interest
list_hashtag_interest = ['BitcoinCrash','bearmarket','inflation','cryptocrash','stockmarketcrash','Bollinger']

In [85]:
# Call the function with the list of hashtags
see_tweet_on_hashtag(filtered_df, list_hashtag_interest)

Hashtag: BitcoinCrash
Number of associated tweets: 177

Hashtag: BitcoinCrash

Tweet ID: 1206095
Text: Everything comes Cycles ! Lets hope isnt one long ones ! # Bitcoin # BitcoinCrash # BTC https : //t.co/YOgqyNSOWe


Tweet ID: 1206428
Text: Explaining reasons behind crypto crash . # Celsius # BTC # BitcoinCrash # Cryptocrash https : //t.co/nCDWYnvp6w


Tweet ID: 1206733
Text: # Democrats & amp ; # DNC failed leadership crime , food prices , gas prices , time high rents , Stocks crypto plunging , BUT ORANGE MAN GONE ! ! # BidenGasHike Blame Democrats Blame Biden # BidenWorstPresidentEver # Biden # BitcoinCrash # BTC # ETH # NFT https : //t.co/ndd6AYJFSl


Tweet ID: 1206898
Text: Elephants fight , ants crushed . # bitcoin # btc # BitcoinCrash # Crypto # CryptoMarket


Tweet ID: 1207460
Text: MicroStrategy threatened margin call due fall # BTC 👉https : //t.co/VfbSSNZo1v # BitcoinCrash # Binance https : //t.co/TKTmlyDQBl


Tweet ID: 1207597
Text: How Tuesday going ? These today 's closed