In [15]:
import pandas as pd

# Location of the tab-separated tweet dump read by the cells below.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on a machine where the file exists at exactly this location.
file_path = '/Users/ayushyadav/Documents/Projects/POTATO/correct_twitter_202102.tsv'

def read_data_in_chunks(file_path, chunk_size=10000):
    """Open a tab-separated file for chunked reading.

    Args:
        file_path: Path of the TSV file. It must contain a ``created_at``
            column, which is parsed as dates.
        chunk_size: Number of rows per yielded DataFrame. Defaults to 10000,
            the value that was previously hard-coded.

    Returns:
        The chunk iterator produced by ``pd.read_csv(..., chunksize=...)``;
        each ``next()`` call yields a DataFrame of up to ``chunk_size`` rows.
        The caller owns the reader: it keeps the file open until it is
        exhausted or explicitly closed.
    """
    return pd.read_csv(
        file_path,
        delimiter='\t',
        chunksize=chunk_size,
        parse_dates=['created_at'],
    )

# Pull a single chunk from the reader and show its first five rows, so the
# preview does not require loading the whole file.
chunks = read_data_in_chunks(file_path)
first_chunk = next(chunks)
print(first_chunk.head(n=5))

                    id           event                               ts1  \
0  1358322479136178177  britney_202102  2022-03-01 09:06:52.403595-05:00   
1  1358323016736796677  britney_202102  2022-01-05 08:34:04.477789-05:00   
2  1358322996696465409  britney_202102  2022-03-01 09:06:52.399008-05:00   
3  1358322976769286151  britney_202102  2022-03-01 09:06:52.399205-05:00   
4  1368923802260889606  britney_202102  2022-03-01 07:26:05.505986-05:00   

                                ts2  from_stream  directly_from_stream  \
0  2022-03-01 09:06:52.403595-05:00         True                  True   
1  2022-03-01 09:06:52.398778-05:00         True                  True   
2  2022-03-01 09:06:52.399008-05:00         True                  True   
3  2022-03-01 09:06:52.399205-05:00         True                  True   
4  2022-03-01 07:26:05.505986-05:00         True                  True   

   from_search  directly_from_search  from_quote_search  \
0        False                 False   

In [24]:
import pandas as pd
from collections import defaultdict
import numpy as np

# Input TSV for the keyword analysis below (same value as assigned in the earlier cell).
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# file exists at exactly this location.
file_path = '/Users/ayushyadav/Documents/Projects/POTATO/correct_twitter_202102.tsv'

def _scan_keyword_tweets(file_path, chunk_size, keyword):
    """Stream the TSV once and tally statistics for tweets containing ``keyword``."""
    daily_tweet_counts = defaultdict(int)
    daily_likes = defaultdict(list)
    places = set()
    times_of_day = defaultdict(int)
    user_tweet_counts = defaultdict(int)

    # The context manager closes the file even if a chunk raises part-way through.
    with pd.read_csv(file_path, delimiter='\t', chunksize=chunk_size) as reader:
        for chunk in reader:
            # With utc=True the result is always a tz-aware UTC column, so no
            # dtype guard is needed; unparseable values become NaT.
            created_at = pd.to_datetime(chunk['created_at'], errors='coerce', utc=True)
            mentions = chunk['text'].str.contains(keyword, case=False, na=False, regex=False)

            # Rows without a usable timestamp cannot be assigned to a day or
            # an hour, so they are excluded instead of being tallied under NaT.
            keep = mentions & created_at.notna()
            matched = chunk.loc[keep]
            if matched.empty:
                continue

            stamps = created_at[keep]
            # .tolist() yields plain Python objects, keeping dict keys and the
            # printed output free of numpy scalar types.
            rows = zip(
                stamps.dt.date.tolist(),
                stamps.dt.hour.tolist(),
                matched['author_id'].tolist(),
                matched['like_count'].tolist(),
            )
            for date, hour, author, likes in rows:
                daily_tweet_counts[date] += 1
                daily_likes[date].append(likes)
                times_of_day[hour] += 1
                user_tweet_counts[author] += 1

            places.update(matched['place_id'].dropna().tolist())

    return {
        'daily_tweet_counts': daily_tweet_counts,
        'daily_likes': daily_likes,
        'places': places,
        'times_of_day': times_of_day,
        'user_tweet_counts': user_tweet_counts,
    }


def process_data(file_path, chunk_size=10000, keyword='music'):
    """Print summary statistics for tweets whose text mentions ``keyword``.

    The file is read in chunks so it never has to fit in memory at once.
    Matching is case-insensitive and literal (not a regular expression).
    Tweets whose ``created_at`` cannot be parsed are skipped.

    Args:
        file_path: Tab-separated file with at least the columns ``created_at``,
            ``text``, ``author_id``, ``like_count`` and ``place_id``.
        chunk_size: Rows read per chunk (default 10000).
        keyword: Substring to look for in ``text`` (default ``'music'``).

    Returns:
        None. Results are printed: tweets per UTC day, number of distinct
        authors, mean likes per UTC day, the set of place ids, tweets per UTC
        hour, and the author with the most matching tweets (``None`` when no
        tweet matches).
    """
    stats = _scan_keyword_tweets(file_path, chunk_size, keyword)
    user_tweet_counts = stats['user_tweet_counts']

    # Each per-day list is created by its first append, so it is never empty.
    average_likes = {
        day: float(np.mean(likes)) for day, likes in stats['daily_likes'].items()
    }
    # max() raises ValueError on an empty mapping; report None when nothing matched.
    top_user = max(user_tweet_counts, key=user_tweet_counts.get) if user_tweet_counts else None

    # Print results
    print('Daily Tweet Counts:', dict(stats['daily_tweet_counts']))
    print('Number of Unique Users:', len(user_tweet_counts))
    print('Average Likes per Day:', average_likes)
    print('Places IDs from where Tweets Originated:', stats['places'])
    print('Tweet Counts by Hour of Day:', dict(stats['times_of_day']))
    print('Top User by Tweet Count:', top_user)

# Stream the TSV once and print the summary statistics for tweets mentioning 'music'.
process_data(file_path)

Daily Tweet Counts: {datetime.date(2021, 2, 7): 1974, datetime.date(2021, 3, 8): 395, datetime.date(2021, 3, 30): 259, datetime.date(2021, 3, 29): 220, datetime.date(2021, 3, 27): 129, datetime.date(2021, 2, 10): 1006, datetime.date(2021, 2, 2): 111, datetime.date(2021, 2, 9): 2270, datetime.date(2021, 2, 3): 67, datetime.date(2021, 3, 18): 230, datetime.date(2021, 2, 11): 630, datetime.date(2021, 2, 6): 416, datetime.date(2021, 2, 8): 2471, datetime.date(2021, 3, 25): 122, datetime.date(2021, 3, 17): 337, datetime.date(2021, 2, 1): 128, datetime.date(2021, 2, 4): 92, datetime.date(2021, 3, 10): 144, datetime.date(2021, 1, 29): 106, datetime.date(2021, 1, 28): 219, datetime.date(2021, 1, 25): 210, datetime.date(2021, 1, 23): 287, datetime.date(2021, 1, 22): 342, datetime.date(2021, 2, 25): 155, datetime.date(2021, 1, 2): 164, datetime.date(2021, 2, 19): 282, datetime.date(2021, 2, 14): 591, datetime.date(2021, 2, 17): 168, datetime.date(2021, 3, 2): 117, datetime.date(2021, 2, 28): 123