This notebook queries Twitter with a simple search query and stores the results in a JSON file.

In [None]:
import datetime
import json
import os

import pandas as pd
from twarc import Twarc2, expansions

In [None]:
# Read the API bearer token from the first line of a local file so the secret
# is not hard-coded in the notebook; the `with` block closes the file handle.
with open('bearer_token.txt') as token_file:
    bearer_token = token_file.readline().strip()

# Client shared by the search helpers defined further down.
client = Twarc2(bearer_token=bearer_token)

In [None]:
# Display the client's `auth_type` attribute as the cell output
# (presumably a quick check of which authentication mode is active — confirm).
client.auth_type

In [None]:
def create_query(query, lang = False, no_rt = False, no_reply = False, has_geo = False, radius = '15km'):
    """Build a Twitter search query string from a seed plus optional filters.

    Parameters
    ----------
    query : str or list or tuple
        Either a ready-made query string, or a sequence whose first two
        elements are inserted, in order, into a
        ``point_radius:[<first> <second> <radius>]`` operator. Twitter's
        operator syntax is ``[longitude latitude radius]``, so pass the
        longitude first.
    lang : str or False
        Language code; when truthy a ``lang:<code>`` clause is added.
    no_rt : bool
        When truthy, add ``-is:retweet``.
    no_reply : bool
        When truthy, add ``-is:reply``.
    has_geo : bool
        When truthy, add ``has:geo``.
    radius : str
        Radius used for the ``point_radius`` operator (default ``'15km'``).
        Ignored when `query` is a string.

    Returns
    -------
    str
        All clauses joined by single spaces.

    Raises
    ------
    TypeError
        If `query` is neither a string nor a list/tuple.
    """
    if isinstance(query, (list, tuple)):
        clauses = [f'point_radius:[{query[0]} {query[1]} {radius}]']
    elif isinstance(query, str):
        # Strip surrounding whitespace and drop an empty seed so the joined
        # result never starts with, or contains, stray spaces.
        seed = query.strip()
        clauses = [seed] if seed else []
    else:
        raise TypeError(
            f'query must be a str or a list/tuple of two coordinates, got {type(query).__name__}'
        )

    if lang:
        clauses.append(f'lang:{lang}')
    if no_rt:
        clauses.append('-is:retweet')
    if no_reply:
        clauses.append('-is:reply')
    if has_geo:
        clauses.append('has:geo')

    # Operators must be separated by whitespace; gluing them together would
    # merge them into the preceding term.
    return ' '.join(clauses)
        

def retrieve_tweets(query, start_time, end_time, max_results = 10):
    """Run a search through the module-level `client` and collect the tweets.

    Parameters
    ----------
    query : str
        Search query string passed to ``client.search_all``.
    start_time, end_time : datetime.datetime
        Bounds of the search window, forwarded unchanged to ``search_all``.
    max_results : int
        Forwarded to ``search_all`` as ``max_results`` (presumably the page
        size per request — confirm against the twarc documentation).
        Defaults to 10.

    Returns
    -------
    list
        Every item yielded by ``expansions.flatten`` for every result page,
        in the order received.
    """
    tweets = []

    search_results = client.search_all(query=query,
                                       start_time=start_time,
                                       end_time=end_time,
                                       max_results=max_results)

    # Each iteration yields one page of results; flatten it and keep the items.
    for page in search_results:
        tweets.extend(expansions.flatten(page))
    return tweets

In [None]:
def create_df(query):
    """Load the tweets saved for `query`, tabulate them and export a CSV.

    Reads ``tweets/{query}.json``, prints the number of tweets found, builds
    a DataFrame with one row per tweet and writes it to
    ``tweets/{query}.csv``.

    Parameters
    ----------
    query : str
        Query string used as the file stem of both the input JSON file and
        the output CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``username``, ``id``, ``author_id``, ``created_at``, ``lang``,
        ``text``, ``possibly_sensitive`` and ``user_location`` (an empty
        string when the author record has no ``'location'`` key). Returned so
        the caller can keep working with the frame in memory.

    Raises
    ------
    KeyError
        If a tweet lacks ``'author'``, the author's ``'username'`` or one of
        the tabulated fields.
    """
    tweets = []
    with open(f'tweets/{query}.json','r') as infile:
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            loaded = json.loads(line)
            if isinstance(loaded, dict):
                # One tweet object per line: extending with a dict would add
                # its keys, so append the object itself.
                tweets.append(loaded)
            else:
                tweets += loaded
    print(len(tweets))

    clmns = ['id', 'author_id', 'created_at', 'lang', 'text', 'possibly_sensitive']
    records = [
        [tweet['author']['username']]
        + [tweet[clmn] for clmn in clmns]
        + [tweet['author'].get('location', '')]
        for tweet in tweets
    ]

    df = pd.DataFrame.from_records(records, columns = ['username'] + clmns + ['user_location'])
    df.to_csv(f'tweets/{query}.csv')
    return df

In [None]:
# Search window, expressed as timezone-aware UTC datetimes.
# Note: the window stops at midnight at the start of 30 January, so
# 30 and 31 January are not covered.
start_time = datetime.datetime(2022, 1, 1, tzinfo=datetime.timezone.utc)
end_time = datetime.datetime(2022, 1, 30, tzinfo=datetime.timezone.utc)

In [None]:
# Seed search terms. With an empty seed the query consists of the filters only.
seed = ''

query = create_query(seed, no_rt = True)

tweets = retrieve_tweets(query, start_time, end_time)

# Make sure the output folder exists before writing; otherwise open() fails
# with FileNotFoundError on a fresh checkout.
os.makedirs('tweets', exist_ok=True)
with open(f'tweets/{query}.json','w') as outfile:
    json.dump(tweets, outfile)

# Tabulate the saved tweets and export them as CSV next to the JSON file.
create_df(query)

In [None]:
# Centre of the geographic search.
lat = 48.13
long = 11.57

# Twitter's point_radius operator takes the longitude first, then the
# latitude, and create_query inserts the two values in the order given.
query = create_query([long, lat])

tweets = retrieve_tweets(query, start_time, end_time)

# Make sure the output folder exists before writing; otherwise open() fails
# with FileNotFoundError on a fresh checkout.
os.makedirs('tweets', exist_ok=True)
with open(f'tweets/{query}.json','w') as outfile:
    json.dump(tweets, outfile)


In [None]:
def clean(s):
    """Remove @mentions and URLs from a text and trim surrounding whitespace.

    Parameters
    ----------
    s : str
        Raw text, e.g. the ``text`` field of a tweet.

    Returns
    -------
    str
        `s` without ``@name`` mentions and without ``http(s)://...`` or
        ``www....`` links, stripped of leading and trailing whitespace.
        Inner whitespace is left untouched, and a retweet prefix is not
        removed (``'RT @user: hi'`` becomes ``'RT : hi'``).
    """
    import re  # kept local so the function is self-contained
    # Raw strings keep the regex escapes (\w, \S, \.) away from Python's own
    # string-escape processing.
    s = re.sub(r'@\w+', '', s)
    s = re.sub(r'https?://\S+|www\.\S+', '', s)
    return s.strip()


# Add a `cleaned_text` column by applying clean() to every entry of `text`.
# NOTE(review): no visible cell assigns a module-level `df` — confirm where it
# is meant to come from; on a fresh kernel this line presumably raises NameError.
df['cleaned_text'] = df.text.apply(clean)