In [None]:
from scipy.stats import kendalltau
from statsmodels.tsa.stattools import grangercausalitytests, adfuller

### State lists

In [None]:
# Full names of the 50 US states (the District of Columbia is not included).
# NOTE(review): `us_states` is rebound to a postal-code -> name dict in the "US"
# section before anything reads this list, so the list looks unused — confirm and drop it.
us_states = [
  "Alabama",
  "Alaska",
  "Arizona",
  "Arkansas",
  "California",
  "Colorado",
  "Connecticut",
  "Delaware",
  "Florida",
  "Georgia",
  "Hawaii",
  "Idaho",
  "Illinois",
  "Indiana",
  "Iowa",
  "Kansas",
  "Kentucky",
  "Louisiana",
  "Maine",
  "Maryland",
  "Massachusetts",
  "Michigan",
  "Minnesota",
  "Mississippi",
  "Missouri",
  "Montana",
  "Nebraska",
  "Nevada",
  "New Hampshire",
  "New Jersey",
  "New Mexico",
  "New York",
  "North Carolina",
  "North Dakota",
  "Ohio",
  "Oklahoma",
  "Oregon",
  "Pennsylvania",
  "Rhode Island",
  "South Carolina",
  "South Dakota",
  "Tennessee",
  "Texas",
  "Utah",
  "Vermont",
  "Virginia",
  "Washington",
  "West Virginia",
  "Wisconsin",
  "Wyoming"
]

In [None]:
# The 27 Brazilian subdivision names used as lookup keys in the "Brazil" section.
# Two entries ("Maranhão", "São Paulo") still carry accents here; the Brazil section
# passes every name through unidecode before it is compared against the data.
brazil_states = [
  "Acre",
  "Alagoas",
  "Amazonas",
  "Amapa",
  "Bahia",
  "Ceara",
  "Federal District", # the beds table's "Distrito Federal" is renamed to this in the Brazil section
  "Espirito Santo",
  "Goias",
  "Maranhão",
  "Minas Gerais",
  "Mato Grosso do Sul",
  "Mato Grosso",
  "Para",
  "Paraiba",
  "Parana",
  "Pernambuco",
  "Piaui", # NOTE(review): earlier comment read: old "Piaui" (same as current) — presumably the accented "Piauí"; confirm
  "Rio de Janeiro",
  "Rio Grande do Norte",
  "Rondonia",
  "Roraima",
  "Rio Grande do Sul",
  "Santa Catarina",
  "Sergipe",
  "São Paulo",
  "Tocantins"]

### Code

In [None]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [None]:
import pandas as pd
from unidecode import unidecode
import numpy as np

In [None]:
# India inputs, read from the current working directory.
# Columns used later in this notebook: 'state', 'date_time', 'neg' from the tweet table and
# 'location_name', 'date', 'icu_beds_mean' from the resources table.
df_india_tweets = pd.read_csv('india_states_tweet_count.csv')
df_india_shortage = pd.read_csv('resources_data_used_for_india.csv')

In [None]:
# Indian states and union territories to analyse. Names are matched exactly against the
# 'state' / 'location_name' columns, so a misspelling silently yields "no data".
# The order is preserved because it determines the row order of the result tables.
india_subdivisions = [
    'Haryana',
    'Madhya Pradesh',
    'Lakshadweep',
    'Tamil Nadu',
    'Andaman and Nicobar Islands',
    'Andhra Pradesh',
    'Uttarakhand',
    'Gujarat',
    'Manipur',
    'Himachal Pradesh',
    'Punjab',
    'Karnataka',
    'Jharkhand',
    'Bihar',
    'Dadra and Nagar Haveli',
    'Arunachal Pradesh',
    'Sikkim',
    'Mizoram',
    'Chandigarh',
    'Goa',
    'Assam',
    'Kerala',
    'West Bengal',
    'Maharashtra',
    'Tripura',
    'Daman and Diu',
    'Delhi',
    'Puducherry',
    'Uttar Pradesh',
    'Rajasthan',
    'Nagaland',
    'Jammu and Kashmir',
    'Odisha',
    'Telangana',
    'Chhattisgarh',
    'Meghalaya',  # was misspelled 'Meghala', which can never match the data
]

In [None]:
def get_state_dataseries_merged(df_tweets, df_resources, state_name, start_date, end_date):
    """Join one state's tweet rows with its resource rows on the date.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Needs the columns 'state' and 'date_time'.
    df_resources : pandas.DataFrame
        Needs the columns 'location_name' and 'date'.
    state_name : str
        Compared for exact equality with both 'state' and 'location_name'.
    start_date, end_date
        Bounds of the half-open window ``start_date <= date < end_date``. They are
        compared directly with the date columns, so they must be of a comparable
        type and format.

    Returns
    -------
    pandas.DataFrame or None
        The inner join of the two selections (tweet 'date_time' == resource 'date'),
        sorted by 'date' and without the 'date_time' column. ``None`` when either
        source has no rows for the state in the window, or when the two sources
        have no date in common.
    """
    # take the data from one state and within time range
    tweets_in_window = df_tweets.loc[
        (df_tweets['state'] == state_name) &
        (df_tweets['date_time'] >= start_date) &
        (df_tweets['date_time'] < end_date)
    ].sort_values('date_time')
    resources_in_window = df_resources.loc[
        (df_resources['location_name'] == state_name) &
        (df_resources['date'] >= start_date) &
        (df_resources['date'] < end_date)
    ].sort_values('date')

    if tweets_in_window.empty or resources_in_window.empty:
        return None

    # merge the two tables
    df_merged = pd.merge(tweets_in_window, resources_in_window,
                         left_on='date_time', right_on='date',
                         how='inner').drop('date_time', axis=1).sort_values('date')

    # Both sources can have rows yet share no dates; report that as "no data" too,
    # so callers never receive an empty series.
    if df_merged.empty:
        return None

    return df_merged

In [None]:
# Analysis window, applied by get_state_dataseries_merged as start_date <= date < end_date.
# The bounds are plain strings — presumably compared lexicographically with
# 'YYYY-MM-DD' strings in the data; confirm the date column formats.
start_date = '2020-02-01'
end_date   = '2021-04-01'

In [None]:
# Augmented Dickey-Fuller test per Indian subdivision, run on the twice-differenced
# negative-tweet series and ICU-bed series. 'Reject null hypothesis' is 'Y' only when
# both series have p <= 0.05.

# Left over from a CCM experiment that is currently disabled; nothing below fills it.
results_per_states = {}

state_names = []
adf_stats_tweets, p_tweets, lag_used_tweets = [], [], []
adf_stats_beds, p_beds, lag_used_beds = [], [], []
is_adf = []

sci = "{:.3e}".format  # statistics and p-values are stored as scientific-notation text

for subdivision in india_subdivisions:
    df_merged = get_state_dataseries_merged(
        df_india_tweets, df_india_shortage, subdivision, start_date, end_date
    )
    if df_merged is None:
        print('no data for: ', subdivision)
        continue

    state_names.append(subdivision)

    # X: negative tweets, Y: ICU beds — both differenced n times before testing
    n = 2
    X = np.diff(df_merged['neg'].to_numpy(), n=n)
    Y = np.diff(df_merged['icu_beds_mean'].to_numpy(), n=n)

    # adfuller result: [0] test statistic, [1] p-value, [2] number of lags used
    result = adfuller(X)
    adf_stats_tweets.append(sci(result[0]))
    p_tweets.append(sci(result[1]))
    lag_used_tweets.append(result[2])

    result2 = adfuller(Y)
    adf_stats_beds.append(sci(result2[0]))
    p_beds.append(sci(result2[1]))
    lag_used_beds.append(result2[2])

    both_reject = result[1] <= 0.05 and result2[1] <= 0.05
    is_adf.append('Y' if both_reject else 'N')

df = pd.DataFrame({
    'Subdivision Name': state_names,
    'ADF Statistic - Beds': adf_stats_beds,
    'p value - Beds': p_beds,
    'Lag used - Beds': lag_used_beds,
    'ADF Statistic - Tweets': adf_stats_tweets,
    'p value - Tweets': p_tweets,
    'Lag used - Tweets': lag_used_tweets,
    'Reject null hypothesis': is_adf,
})
df

no data for:  Lakshadweep
no data for:  Tamil Nadu
no data for:  Andaman and Nicobar Islands
no data for:  Dadra and Nagar Haveli
no data for:  Chandigarh
no data for:  Assam
no data for:  Daman and Diu
no data for:  Puducherry
no data for:  Jammu and Kashmir
no data for:  Meghala


Unnamed: 0,Subdivision Name,ADF Statistic - Beds,p value - Beds,Lag used - Beds,ADF Statistic - Tweets,p value - Tweets,Lag used - Tweets,Reject null hypothesis
0,Haryana,-6.632,5.7e-09,5,-9.455,4.506e-16,18,Y
1,Madhya Pradesh,-3.435,0.009809,5,-10.35,2.514e-18,16,Y
2,Andhra Pradesh,-14.62,3.964e-27,3,-9.709,1.026e-16,18,Y
3,Uttarakhand,-4.632,0.0001126,16,-10.33,2.804e-18,18,Y
4,Gujarat,-2.951,0.03974,9,-10.03,1.601e-17,18,Y
5,Manipur,-9.108,3.469e-15,11,-9.666,1.314e-16,17,Y
6,Himachal Pradesh,-6.107,9.536e-08,14,-10.37,2.284e-18,17,Y
7,Punjab,-12.5,2.81e-23,21,-12.25,9.826999999999999e-23,21,Y
8,Karnataka,-5.145,1.139e-05,9,-12.93,3.68e-24,17,Y
9,Jharkhand,-4.534,0.0001707,6,-8.752,2.823e-14,18,Y


In [None]:
# Save the India ADF table to the working directory; index=False omits the row index.
df.to_csv('adfuller_india.csv', index=False)

In [None]:
# Granger-causality test per Indian subdivision on the twice-differenced series.
# grangercausalitytests checks whether the SECOND column helps predict the FIRST, so
# ['beds', 'tweets'] asks whether negative tweets Granger-cause ICU-bed demand.
# For each subdivision the table reports the smallest lag whose SSR F-test has
# p <= alpha, or the largest lag tested when none does.
# NOTE(review): taking the first significant lag out of max_lag tests applies no
# multiple-comparison correction — confirm this is intended.
max_lag = 10
alpha = 0.05

p = []
lag = []
is_granger = []
state_names = []

for subdivision in india_subdivisions:

    df_merged = get_state_dataseries_merged(df_india_tweets, df_india_shortage, subdivision, start_date, end_date)

    if df_merged is None:
        print('no data for: ', subdivision)
        continue

    state_names.append(subdivision)

    # X: negative tweets, Y: ICU beds — both differenced n times
    n = 2
    X = np.diff(df_merged['neg'].to_numpy(), n=n)
    Y = np.diff(df_merged['icu_beds_mean'].to_numpy(), n=n)

    df_granger = pd.DataFrame(data={
        'tweets': X, 'beds': Y
    })

    result = grangercausalitytests(df_granger[['beds', 'tweets']], maxlag=max_lag, verbose=False)

    # Scan lags in increasing order and stop at the first significant one.
    for i in range(1, max_lag + 1):
        p_val = result[i][0]['ssr_ftest'][1]
        if p_val <= alpha:
            is_granger.append('Y')
            break
    else:
        is_granger.append('N')

    # i / p_val now hold the first significant lag, or the last lag tested.
    p.append("{:.3e}".format(p_val))
    lag.append(i)

df = pd.DataFrame()
df['Subdivision name'] = state_names
df['p value'] = p
df['Number of lags'] = lag
df['Reject null hypothesis'] = is_granger
df



no data for:  Lakshadweep
no data for:  Tamil Nadu
no data for:  Andaman and Nicobar Islands








no data for:  Dadra and Nagar Haveli



no data for:  Chandigarh

no data for:  Assam


no data for:  Daman and Diu

no data for:  Puducherry



no data for:  Jammu and Kashmir



no data for:  Meghala


Unnamed: 0,Subdivision name,p value,Number of lags,Reject null hypothesis
0,Haryana,0.9981,10,N
1,Madhya Pradesh,0.5967,10,N
2,Andhra Pradesh,1.0,10,N
3,Uttarakhand,0.9996,10,N
4,Gujarat,0.8607,10,N
5,Manipur,0.00344,10,Y
6,Himachal Pradesh,0.8277,10,N
7,Punjab,0.9741,10,N
8,Karnataka,0.9979,10,N
9,Jharkhand,0.318,10,N


In [None]:
# Save the India Granger table to the working directory; index=False omits the row index.
df.to_csv('granger_india.csv', index=False)


### US

In [None]:
# US inputs, read from the current working directory.
# Columns used later in this notebook: 'state', 'date_time', 'neg' from the tweet table and
# 'state', 'date', 'inpatient_beds_used_covid' from the hospital-capacity table.
df_us_tweets = pd.read_csv('us_states_tweet_count.csv')
df_us_shortage = pd.read_csv('COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_State_Timeseries__RAW_.csv')

In [None]:
# Rewrite the hospital table's '/'-separated dates with '-' — presumably so they compare
# correctly with the '-'-separated start_date/end_date strings and the tweet dates (confirm
# the raw format). The vectorised .str accessor replaces the per-element lambda and leaves
# missing values untouched instead of raising; regex=False makes it a literal replacement.
df_us_shortage['date'] = df_us_shortage['date'].str.replace('/', '-', regex=False)

In [None]:
# Two-letter code -> full name for the 50 states plus the District of Columbia.
# The US get_state_dataseries_merged below matches the keys against the hospital table's
# 'state' column and the values against the tweet table's 'state' column.
# NOTE(review): this rebinds `us_states`, which the "State lists" section defined as a
# plain list of names.
us_states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}

In [None]:
def get_state_dataseries_merged(df_tweets, df_resources, state_code, start_date, end_date):
    """Join one US state's tweet rows with its hospital rows on the date.

    This US variant replaces the earlier definition of the same name: the tweet table
    is filtered by the state's full name and the hospital table by its two-letter code.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Needs the columns 'state' (full state name) and 'date_time'.
    df_resources : pandas.DataFrame
        Needs the columns 'state' (two-letter code) and 'date'.
    state_code : str
        A key of the module-level ``us_states`` dict.
    start_date, end_date
        Bounds of the half-open window ``start_date <= date < end_date``. They are
        compared directly with the date columns, so they must be of a comparable
        type and format.

    Returns
    -------
    pandas.DataFrame or None
        The inner join of the two selections (tweet 'date_time' == resource 'date'),
        sorted by 'date' and without the 'date_time' column. ``None`` when either
        source has no rows for the state in the window, or when the two sources
        have no date in common.

    Raises
    ------
    KeyError
        If ``state_code`` is not a key of ``us_states``.
    """
    state_name = us_states[state_code]

    # take the data from one state and within time range
    tweets_in_window = df_tweets.loc[
        (df_tweets['state'] == state_name) &
        (df_tweets['date_time'] >= start_date) &
        (df_tweets['date_time'] < end_date)
    ].sort_values('date_time')
    resources_in_window = df_resources.loc[
        (df_resources['state'] == state_code) &
        (df_resources['date'] >= start_date) &
        (df_resources['date'] < end_date)
    ].sort_values('date')

    if tweets_in_window.empty or resources_in_window.empty:
        return None

    # merge the two tables
    df_merged = pd.merge(tweets_in_window, resources_in_window,
                         left_on='date_time', right_on='date',
                         how='inner').drop('date_time', axis=1).sort_values('date')

    # Both sources can have rows yet share no dates; report that as "no data" too,
    # so callers never receive an empty series.
    if df_merged.empty:
        return None

    return df_merged

In [None]:
# Analysis window for the US section, applied by get_state_dataseries_merged as
# start_date <= date < end_date. The bounds are plain strings — presumably compared
# lexicographically with 'YYYY-MM-DD' strings in the data; confirm the column formats.
start_date = '2020-02-01'
end_date   = '2021-04-01'

In [None]:
# Augmented Dickey-Fuller stationarity test per US state, run on the twice-differenced
# negative-tweet series and COVID inpatient-bed series. 'Reject null hypothesis' is 'Y'
# only when both series have p <= 0.05.
state_names = []
adf_stats_tweets, p_tweets, lag_used_tweets = [], [], []
adf_stats_beds, p_beds, lag_used_beds = [], [], []
is_adf = []

sci = "{:.3e}".format  # statistics and p-values are stored as scientific-notation text

# us_states is keyed by two-letter code; the table is labelled with the full name
for subdivision in us_states:
    df_merged = get_state_dataseries_merged(
        df_us_tweets, df_us_shortage, subdivision, start_date, end_date
    )
    if df_merged is None:
        print('no data for: ', subdivision)
        continue

    state_names.append(us_states[subdivision])

    X = df_merged['neg'].to_numpy()
    Y = df_merged['inpatient_beds_used_covid'].to_numpy()

    # Remove the rows whose bed count is missing from BOTH series so they stay aligned.
    # NOTE(review): the surviving rows may no longer be consecutive days — confirm that
    # differencing across such gaps is acceptable.
    observed = ~np.isnan(Y)
    X, Y = X[observed], Y[observed]

    print(X.shape, Y.shape)

    n = 2
    X = np.diff(X, n=n)
    Y = np.diff(Y, n=n)

    # adfuller result: [0] test statistic, [1] p-value, [2] number of lags used
    result = adfuller(X)
    adf_stats_tweets.append(sci(result[0]))
    p_tweets.append(sci(result[1]))
    lag_used_tweets.append(result[2])

    result2 = adfuller(Y)
    adf_stats_beds.append(sci(result2[0]))
    p_beds.append(sci(result2[1]))
    lag_used_beds.append(result2[2])

    both_reject = result[1] <= 0.05 and result2[1] <= 0.05
    is_adf.append('Y' if both_reject else 'N')

df = pd.DataFrame({
    'Subdivision Name': state_names,
    'ADF Statistic - Beds': adf_stats_beds,
    'p value - Beds': p_beds,
    'Lag used - Beds': lag_used_beds,
    'ADF Statistic - Tweets': adf_stats_tweets,
    'p value - Tweets': p_tweets,
    'Lag used - Tweets': lag_used_tweets,
    'Reject null hypothesis': is_adf,
})
df

(366,) (366,)
(387,) (387,)
(378,) (378,)
(387,) (387,)
(393,) (393,)
(375,) (375,)
(371,) (371,)
no data for:  DC
(362,) (362,)
(377,) (377,)
(388,) (388,)
(417,) (417,)
(388,) (388,)
(376,) (376,)
(400,) (400,)
(417,) (417,)
(417,) (417,)
(388,) (388,)
(388,) (388,)
(362,) (362,)
(388,) (388,)
(388,) (388,)
(388,) (388,)
(417,) (417,)
(395,) (395,)
(399,) (399,)
(416,) (416,)
(417,) (417,)
(388,) (388,)
(388,) (388,)
(362,) (362,)
(388,) (388,)
(375,) (375,)
(417,) (417,)
(375,) (375,)
(388,) (388,)
(388,) (388,)
(394,) (394,)
(388,) (388,)
(386,) (386,)
(387,) (387,)
(364,) (364,)
(373,) (373,)
(417,) (417,)
(369,) (369,)
(387,) (387,)
(370,) (370,)
(390,) (390,)
(379,) (379,)
(394,) (394,)
(388,) (388,)


Unnamed: 0,Subdivision Name,ADF Statistic - Beds,p value - Beds,Lag used - Beds,ADF Statistic - Tweets,p value - Tweets,Lag used - Tweets,Reject null hypothesis
0,Alaska,-10.39,2.089e-18,12,-9.263,1.394e-15,17,Y
1,Alabama,-10.6,6.238e-19,10,-10.27,3.961e-18,17,Y
2,Arkansas,-9.361,7.802e-16,12,-9.23,1.688e-15,17,Y
3,Arizona,-7.786,8.173e-12,17,-9.378,7.082e-16,16,Y
4,California,-12.18,1.381e-22,8,-9.96,2.384e-17,17,Y
5,Colorado,-10.28,3.752e-18,12,-10.34,2.762e-18,15,Y
6,Connecticut,-9.304,1.095e-15,12,-8.903,1.159e-14,17,Y
7,Delaware,-9.201,2.008e-15,13,-9.992,1.981e-17,17,Y
8,Florida,-7.876,4.856e-12,12,-9.682,1.197e-16,14,Y
9,Georgia,-7.588,2.576e-11,16,-9.766,7.34e-17,17,Y


In [None]:
# Save the US ADF table to the working directory; index=False omits the row index.
df.to_csv('adfuller_us.csv', index=False)

In [None]:
# Granger-causality test per US state on the twice-differenced series.
# grangercausalitytests checks whether the SECOND column helps predict the FIRST, so
# ['beds', 'tweets'] asks whether negative tweets Granger-cause COVID inpatient-bed use.
# For each state the table reports the smallest lag whose SSR F-test has p <= alpha,
# or the largest lag tested when none does.
# NOTE(review): taking the first significant lag out of max_lag tests applies no
# multiple-comparison correction — confirm this is intended.
max_lag = 10
alpha = 0.05

p = []
lag = []
is_granger = []
state_names = []

for subdivision in us_states:
    df_merged = get_state_dataseries_merged(df_us_tweets, df_us_shortage, subdivision, start_date, end_date)

    if df_merged is None:
        print('no data for: ', subdivision)
        continue

    state_names.append(us_states[subdivision])

    X = df_merged['neg'].to_numpy()
    Y = df_merged['inpatient_beds_used_covid'].to_numpy()

    # Remove the rows whose bed count is missing from BOTH series so they stay aligned.
    has_nan = np.isnan(Y)
    if has_nan.any():
        not_nan = np.logical_not(has_nan)
        X = X[not_nan]
        Y = Y[not_nan]

    n = 2
    X = np.diff(X, n=n)
    Y = np.diff(Y, n=n)

    df_granger = pd.DataFrame(data={
        'tweets': X, 'beds': Y
    })

    result = grangercausalitytests(df_granger[['beds', 'tweets']], maxlag=max_lag, verbose=False)

    # Scan lags in increasing order and stop at the first significant one.
    for i in range(1, max_lag + 1):
        p_val = result[i][0]['ssr_ftest'][1]
        if p_val <= alpha:
            is_granger.append('Y')
            break
    else:
        is_granger.append('N')

    # i / p_val now hold the first significant lag, or the last lag tested.
    # Scientific notation keeps very small p-values distinguishable (rounding to three
    # decimals would show them as 0.0) and matches the India and Brazil tables.
    p.append("{:.3e}".format(p_val))
    lag.append(i)

df = pd.DataFrame()
df['Subdivision name'] = state_names
df['p value'] = p
df['Number of lags'] = lag
df['Reject null hypothesis'] = is_granger
df

In [None]:
# Save the US Granger table to the working directory; index=False omits the row index.
df.to_csv('granger_us.csv', index=False)

### Brazil

In [None]:
# Brazil bed data; later cells read its 'location_name', 'date' and 'icu_beds_mean' columns.
# Loaded relative to the working directory, like the India and US inputs, rather than from
# a hard-coded '/content/' path that only exists on Colab.
df_brazil_shortage = pd.read_csv('brazil_beds_only.csv')

In [None]:
# Brazil tweet counts; later cells read its 'state', 'date_time' and 'neg' columns.
# Loaded relative to the working directory, like the India and US inputs, rather than from
# a hard-coded '/content/' path that only exists on Colab.
df_brazil_tweets = pd.read_csv('brazil_states_tweet_count.csv')

In [None]:
# Transliterate every state name to plain ASCII so the lookup list, the tweet table and
# the beds table can be compared by exact equality.
brazil_states = list(map(unidecode, brazil_states))
df_brazil_tweets['state'] = df_brazil_tweets['state'].map(unidecode)

# The beds table uses the Portuguese "Distrito Federal"; rename it to the English label
# that brazil_states uses.
ascii_locations = df_brazil_shortage['location_name'].map(unidecode)
df_brazil_shortage['location_name'] = ascii_locations.where(
    ascii_locations != 'Distrito Federal', 'Federal District'
)

In [None]:
def get_state_dataseries_merged(df_tweets, df_resources, state_name, start_date, end_date):
    """Join one state's tweet rows with its resource rows on the date.

    Redefined here because the US section rebinds this name to a variant that takes
    a state code; this version filters both tables by the state's name.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Needs the columns 'state' and 'date_time'.
    df_resources : pandas.DataFrame
        Needs the columns 'location_name' and 'date'.
    state_name : str
        Compared for exact equality with both 'state' and 'location_name'.
    start_date, end_date
        Bounds of the half-open window ``start_date <= date < end_date``. They are
        compared directly with the date columns, so they must be of a comparable
        type and format.

    Returns
    -------
    pandas.DataFrame or None
        The inner join of the two selections (tweet 'date_time' == resource 'date'),
        sorted by 'date' and without the 'date_time' column. ``None`` when either
        source has no rows for the state in the window, or when the two sources
        have no date in common.
    """
    # take the data from one state and within time range
    tweets_in_window = df_tweets.loc[
        (df_tweets['state'] == state_name) &
        (df_tweets['date_time'] >= start_date) &
        (df_tweets['date_time'] < end_date)
    ].sort_values('date_time')
    resources_in_window = df_resources.loc[
        (df_resources['location_name'] == state_name) &
        (df_resources['date'] >= start_date) &
        (df_resources['date'] < end_date)
    ].sort_values('date')

    if tweets_in_window.empty or resources_in_window.empty:
        return None

    # merge the two tables
    df_merged = pd.merge(tweets_in_window, resources_in_window,
                         left_on='date_time', right_on='date',
                         how='inner').drop('date_time', axis=1).sort_values('date')

    # Both sources can have rows yet share no dates; report that as "no data" too,
    # so callers never receive an empty series.
    if df_merged.empty:
        return None

    return df_merged

In [None]:
# Sanity check: build the merged series for every Brazilian state and report the ones
# that have no data in the window. (Previously "Acre" was passed on every iteration, so
# the loop variable was ignored.)
# NOTE(review): the intent of this cell is inferred — confirm it is meant as a check.
for state in brazil_states:
    df_merged = get_state_dataseries_merged(df_brazil_tweets, df_brazil_shortage, state, '2020-02-01', '2021-01-01')
    if df_merged is None:
        print('no data for: ', state)

In [None]:
# Augmented Dickey-Fuller stationarity test per Brazilian state, run on the
# twice-differenced negative-tweet series and ICU-bed series. 'Reject null hypothesis'
# is 'Y' only when both series have p <= 0.05.

# Left over from a CCM experiment that is currently disabled; nothing below fills it.
results_per_states = {}

adf_stats_tweets = []
p_tweets = []
lag_used_tweets = []
adf_stats_beds = []
p_beds = []
lag_used_beds = []
is_adf = []

state_names = []

for subdivision in brazil_states:
    df_merged = get_state_dataseries_merged(df_brazil_tweets, df_brazil_shortage, subdivision, '2020-02-01', '2021-01-01')

    if df_merged is None:
        print('no data for: ', subdivision)
        continue

    # Record the name only once the state is known to have data; otherwise a skipped
    # state would leave state_names longer than the result lists and building the
    # table below would fail with a length mismatch.
    state_names.append(subdivision)

    # extract the X (negative tweets) and Y (ICU beds)
    X = df_merged['neg'].to_numpy()
    Y = df_merged['icu_beds_mean'].to_numpy()

    print(X.shape, Y.shape)

    n = 2
    X = np.diff(X, n=n)
    Y = np.diff(Y, n=n)

    # adfuller result: [0] test statistic, [1] p-value, [2] number of lags used
    result = adfuller(X)
    adf_stats_tweets.append("{:.3e}".format(result[0]))
    p_tweets.append("{:.3e}".format(result[1]))
    lag_used_tweets.append(result[2])

    result2 = adfuller(Y)
    adf_stats_beds.append("{:.3e}".format(result2[0]))
    p_beds.append("{:.3e}".format(result2[1]))
    lag_used_beds.append(result2[2])

    is_adf.append('Y' if result[1] <= 0.05 and result2[1] <= 0.05 else 'N')

df = pd.DataFrame()
df['Subdivision Name'] = state_names
df['ADF Statistic - Beds'] = adf_stats_beds
df['p value - Beds'] = p_beds
df['Lag used - Beds'] = lag_used_beds
df['ADF Statistic - Tweets'] = adf_stats_tweets
df['p value - Tweets'] = p_tweets
df['Lag used - Tweets'] = lag_used_tweets
df['Reject null hypothesis'] = is_adf
df

(311,) (311,)
(323,) (323,)
(323,) (323,)
(305,) (305,)
(324,) (324,)
(322,) (322,)
(322,) (322,)
(318,) (318,)
(319,) (319,)
(315,) (315,)
(324,) (324,)
(313,) (313,)
(319,) (319,)
(320,) (320,)
(318,) (318,)
(324,) (324,)
(323,) (323,)
(312,) (312,)
(324,) (324,)
(317,) (317,)
(308,) (308,)
(318,) (318,)
(323,) (323,)
(322,) (322,)
(316,) (316,)
(324,) (324,)
(316,) (316,)


Unnamed: 0,Subdivision Name,ADF Statistic - Beds,p value - Beds,Lag used - Beds,ADF Statistic - Tweets,p value - Tweets,Lag used - Tweets,Reject null hypothesis
0,Acre,-9.118,3.264e-15,6,-8.901,1.17e-14,15,Y
1,Alagoas,-7.708,1.289e-11,13,-8.744,2.96e-14,15,Y
2,Amazonas,-5.907,2.697e-07,12,-9.596,1.973e-16,15,Y
3,Amapa,-8.5,1.245e-13,7,-8.966,8.007e-15,16,Y
4,Bahia,-10.17,7.007e-18,7,-9.854,4.413e-17,16,Y
5,Ceara,-7.255,1.744e-10,12,-8.373,2.628e-13,17,Y
6,Federal District,-7.101,4.165e-10,13,-9.502,3.431e-16,14,Y
7,Espirito Santo,-7.007,7.099e-10,12,-10.21,5.716e-18,16,Y
8,Goias,-9.882,3.74e-17,7,-8.576,7.945e-14,15,Y
9,Maranhao,-10.71,3.366e-19,4,-8.812,1.98e-14,15,Y


In [None]:
# Save the Brazil ADF table to the working directory; index=False omits the row index.
df.to_csv('adfuller_brazil.csv', index=False)

In [None]:
# Granger-causality test per Brazilian state on the twice-differenced series.
# grangercausalitytests checks whether the SECOND column helps predict the FIRST, so
# ['beds', 'tweets'] asks whether negative tweets Granger-cause ICU-bed demand.
# For each state the table reports the smallest lag whose SSR F-test has p <= alpha,
# or the largest lag tested when none does.
# NOTE(review): taking the first significant lag out of max_lag tests applies no
# multiple-comparison correction — confirm this is intended.
max_lag = 10
alpha = 0.05

p = []
lag = []
is_granger = []
state_names = []

for subdivision in brazil_states:
    df_merged = get_state_dataseries_merged(df_brazil_tweets, df_brazil_shortage, subdivision, '2020-02-01', '2021-01-01')

    if df_merged is None:
        print('no data for: ', subdivision)
        continue

    # Record the name only once the state is known to have data; otherwise a skipped
    # state would leave state_names longer than the result lists and building the
    # table below would fail with a length mismatch.
    state_names.append(subdivision)

    # X: negative tweets, Y: ICU beds — both differenced n times
    n = 2
    X = np.diff(df_merged['neg'].to_numpy(), n=n)
    Y = np.diff(df_merged['icu_beds_mean'].to_numpy(), n=n)

    df_granger = pd.DataFrame(data={
        'tweets': X, 'beds': Y
    })

    result = grangercausalitytests(df_granger[['beds', 'tweets']], maxlag=max_lag, verbose=False)

    # Scan lags in increasing order and stop at the first significant one.
    for i in range(1, max_lag + 1):
        p_val = result[i][0]['ssr_ftest'][1]
        if p_val <= alpha:
            is_granger.append('Y')
            break
    else:
        is_granger.append('N')

    # i / p_val now hold the first significant lag, or the last lag tested.
    p.append("{:.3e}".format(p_val))
    lag.append(i)

df = pd.DataFrame()
df['Subdivision name'] = state_names
df['p value'] = p
df['Number of lags'] = lag
df['Reject null hypothesis'] = is_granger
df
























Unnamed: 0,Subdivision name,p value,Number of lags,Reject null hypothesis
0,Acre,0.9925,10,N
1,Alagoas,0.8954,10,N
2,Amazonas,0.2812,10,N
3,Amapa,1.0,10,N
4,Bahia,0.9999,10,N
5,Ceara,2.847e-09,9,Y
6,Federal District,0.9902,10,N
7,Espirito Santo,0.5516,10,N
8,Goias,0.5988,10,N
9,Maranhao,0.0406,6,Y


In [None]:
# Save the Brazil Granger table to the working directory; index=False omits the row index.
df.to_csv('granger_brazil.csv', index=False)
