## Tweet fetching using Twint

In [None]:
import twint
import pandas as pd
import asyncio

# Search parameters. The values below are placeholders to fill in before running.
QUERIES = [
    "..."
]
START_DATE = "yyyy-mm-dd"
END_DATE = "yyyy-mm-dd"
# NOTE(review): PATH is used both as the CSV file read below and as the prefix
# of the per-state output files -- confirm that this double use is intended.
PATH = ''


def _search_config(query, city, state):
    """Build the twint configuration for one query restricted to one city.

    Results are stored as CSV in a file named after ``state``; twint appends
    to that file, so all cities of a state end up in the same CSV.
    """
    cfg = twint.Config()
    # What to search for and where
    cfg.Search = query
    cfg.Near = city
    cfg.Lang = '...'
    # Time window
    cfg.Since = START_DATE
    cfg.Until = END_DATE
    # Switched-off extras
    cfg.Location = False
    cfg.User_full = False
    cfg.Count = False
    # cfg.Profile_full = True
    # Output handling: nothing on screen, everything appended to the CSV
    cfg.Hide_output = True
    cfg.Store_csv = True
    cfg.Output = f"{PATH}{state}_tweets.csv"
    return cfg


# Only the city and state columns are needed to drive the searches.
cities = pd.read_csv(f'{PATH}')[['City', 'State']]

for index, row in cities.iterrows():
    print(f"--------This is index {index} ---------")
    # Run every query once per city.
    for query in QUERIES:
        twint.run.Search(_search_config(query, row.City, row.State))

## Sentiment Analysis with VADER

In [None]:
!pip install nltk
!pip install vaderSentiment

In [None]:
import nltk

# Models needed by the NLTK sentence tokenizer. Recent NLTK releases load the
# tokenizer from `punkt_tab` instead of `punkt`, so both are fetched; a
# resource unknown to the installed version is reported by nltk and skipped.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

### Preprocessing

In [None]:
import re
def vader_preprocess(text):
  """Clean a raw tweet before it is scored.

  Strips URLs and @-mentions, decodes the ``&amp;`` entity and replaces line
  breaks with a single space.

  Args:
    text: Raw tweet text (a string).

  Returns:
    The cleaned text.
  """
  # Remove URLs
  text = re.sub(r"http\S+", "", text)
  # Remove mentions
  text = re.sub(r"@\S+", "", text)
  # Convert `&amp;` to `&`
  text = text.replace("&amp;", "&")
  # Replace line breaks with a space. Deleting them outright would glue the
  # last word of one line to the first word of the next ("good\nday" ->
  # "goodday"), producing tokens no lexicon can match.
  text = re.sub(r"[\r\n]+", " ", text)
  return text

### Sentiment

In [None]:
from nltk import tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Module-level VADER analyzer, created once and reused by `vader_sent` for
# every sentence it scores.
analyzer = SentimentIntensityAnalyzer()

def vader_sent(text, pos_threshold=0.35, neg_threshold=-0.05):
  """Label ``text`` as "Positive", "Negative" or "Neutral".

  The text is split into sentences (NLTK sentence tokenizer, then on line
  breaks), every sentence is scored with the module-level ``analyzer`` and
  the compound scores are averaged. The average is compared against the two
  thresholds.

  Args:
    text: Text to classify.
    pos_threshold: Average compound score at or above which the text is
      labelled "Positive".
    neg_threshold: Average compound score at or below which the text is
      labelled "Negative".

  Returns:
    The sentiment label as a string. Text that yields no sentence at all is
    labelled "Neutral".
  """
  sentences = [
      piece
      for sentence in tokenize.sent_tokenize(text)
      for piece in sentence.split('\n')
      if piece
  ]

  # Nothing to score: return a plain label, the same type as every other
  # path, so callers always receive a string.
  if not sentences:
    return "Neutral"

  total = sum(analyzer.polarity_scores(s)["compound"] for s in sentences)
  # Average (not the raw sum) so that the result stays on the scale of a
  # single compound score and does not grow with the number of sentences.
  avg_sent = round(total / len(sentences), 4)

  if avg_sent >= pos_threshold:
    return "Positive"
  if avg_sent <= neg_threshold:
    return "Negative"
  return "Neutral"

### Read data and analyze

In [None]:
import os 

# Regions with fewer de-duplicated tweets than this are skipped.
NEGLIGBLE_MIN = 2000
# Suffix of the per-region tweet files; the part before it is the region name.
TWEETS_SUFFIX = '_tweets.csv'

# NOTE(review): DIRECTORY is not defined in the visible cells -- confirm it is
# set before this cell runs.
# Make sure the output folder exists before the first result is written.
os.makedirs(f"{DIRECTORY}/sentiments", exist_ok=True)

for filename in os.listdir(f"{DIRECTORY}/tweets"):
  # Only `<region>_tweets.csv` files are processed; any other file in the
  # folder is skipped instead of yielding a mangled region name.
  if not filename.endswith(TWEETS_SUFFIX):
    continue
  region = filename[:-len(TWEETS_SUFFIX)]
  print(region)

  df = pd.read_csv(f"{DIRECTORY}/tweets/{filename}")
  # Remove duplicates
  df = df.drop_duplicates(['tweet'])
  # Neglect negligible regions
  if df.shape[0] < NEGLIGBLE_MIN:
    continue

  # Extract features; .copy() gives an independent frame so the column
  # assignments below do not act on a slice of the de-duplicated data.
  df = df[['id', 'tweet', 'created_at']].copy()
  # Preprocessing
  df['vader_preprocessed'] = df['tweet'].apply(vader_preprocess)
  # Extract sentiment with vader
  df['emotion'] = df['vader_preprocessed'].apply(vader_sent)

  # Keep only the first 10 characters of `created_at` (presumably the
  # `YYYY-MM-DD` date part -- verify against the CSV) and parse them as
  # datetimes, vectorised instead of row by row.
  df['created_at'] = pd.to_datetime(df['created_at'].str[:10])

  # Save into csv
  df[['created_at', 'emotion']].to_csv(f'{DIRECTORY}/sentiments/{region}_tweet_sentiment.csv', index=False)

## Augmented Dickey-Fuller (ADF) and Granger Causality Tests

In [None]:
!pip install scipy
!pip install statsmodels

from scipy.stats import kendalltau
from statsmodels.tsa.stattools import grangercausalitytests, adfuller

def fuller(data_list):
  """Run ``adfuller`` on ``data_list`` and print a short report.

  The report consists of the test statistic, the p-value and one line per
  critical value. Nothing is returned.
  """
  outcome = adfuller(data_list)
  statistic, p_value, critical_values = outcome[0], outcome[1], outcome[4]

  print('ADF Statistic: %f' % statistic)
  print('p-value: %f' % p_value)
  print('Critical Values:')
  # One tab-indented line per significance level.
  for level, threshold in critical_values.items():
    print('\t%s: %.3f' % (level, threshold))

# numpy is used below but is not imported by any visible cell.
import numpy as np

# Order of differencing applied to both series (0 leaves them unchanged).
# Increase n if the ADF output below suggests a series is not stationary.
n = 0

# Difference each series once and reuse the result for both tests.
tweets_diff = np.diff(region_count['count'].tolist(), n=n)
nursing_diff = np.diff(beds_list_all, n=n)

print(adfuller(tweets_diff))
print(adfuller(nursing_diff))

df_granger = pd.DataFrame(data={
    'tweets': tweets_diff,
    'nursing': nursing_diff
})
# NOTE(review): per the statsmodels docs the test checks whether the second
# column helps predict the first, i.e. here 'tweets' -> 'nursing' -- confirm
# this is the intended direction.
grangercausalitytests(df_granger[['nursing', 'tweets']], maxlag=4)