In [1]:
import pandas as pd
import glob

pd.options.display.max_colwidth = 300

In [2]:
def json_tweets_to_pd(jsonfile):
    '''
    read paginated json file to df
    input (string): .json file created by searchtweets module
    output (pd.DataFrame)
    '''
    raw_df = pd.read_json(jsonfile, lines=True)
    df = pd.DataFrame()
    for page in raw_df['data']:
        df = pd.concat([df, pd.DataFrame(page)])
    return df

In [3]:
tweets_files = glob.glob("tweets_data/*.json")

In [4]:
df = pd.concat((json_tweets_to_pd(f) for f in tweets_files))

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 285545 entries, 0 to 10
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          285545 non-null  object
 1   author_id   285545 non-null  object
 2   text        285545 non-null  object
 3   created_at  285545 non-null  object
 4   withheld    2 non-null       object
dtypes: object(5)
memory usage: 13.1+ MB


In [6]:
# drop tweets withheld by the twitter accounts
df = df.drop(df[df.withheld.notnull()].index)
df = df.drop(columns='withheld')

# drop duplicates
df = df.drop_duplicates(subset=['text', 'author_id'])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186204 entries, 0 to 10
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          186204 non-null  object
 1   author_id   186204 non-null  object
 2   text        186204 non-null  object
 3   created_at  186204 non-null  object
dtypes: object(4)
memory usage: 7.1+ MB


In [8]:
df.to_csv('../data/tweets_dow.csv', index=False, quoting=1)