In [4]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from keras.preprocessing.sequence import TimeseriesGenerator

In [11]:
# Load the project dataset from a CSV file; the path is relative to the working directory.
df = pd.read_csv("cap4773-final-dataset.csv")

In [37]:
def overnight_tweet(hour):
    """Flag whether an hour of the day counts as overnight.

    Returns 1 when ``hour`` is one of 0-4 or equal to 23, otherwise 0.
    """
    is_overnight = hour in range(5) or hour == 23
    return 1 if is_overnight else 0

In [41]:
# Derive the binary overnight flag from each tweet's hour of day.
# The function is passed directly; wrapping it in a lambda adds nothing.
df['overnight'] = df['hour_of_day'].apply(overnight_tweet)

In [42]:
# Preview the first rows, including the derived 'overnight' column.
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,post_created,followers,friends,statuses,emotion,hour_of_day,day_of_week,year,month,day,text_length,average_word_length,follower_friend_ratio,emotion_encoded,weekend,midnight,overnight
0,0,60868031,2009-08-26 13:19:08+00:00,1768,532,5720,joy,13,2,2009,8,26,3,4.333333,3.323308,0.0,0,0,0
1,1,60868031,2009-08-26 13:19:21+00:00,1768,532,5720,joy,13,2,2009,8,26,4,5.0,3.323308,0.0,0,0,0
2,2,60868031,2009-08-26 17:16:35+00:00,1768,532,5720,joy,17,2,2009,8,26,4,8.25,3.323308,0.0,0,0,0
3,3,60868031,2009-08-26 19:51:27+00:00,1768,532,5720,joy,19,2,2009,8,26,4,7.5,3.323308,0.0,0,0,0
4,4,60868031,2009-08-27 00:18:38+00:00,1768,532,5720,joy,0,3,2009,8,27,5,6.0,3.323308,0.0,0,1,1


In [30]:
def create_tweet_sequences(df, sequence_length=10, max_days=10):
    """Slide a fixed-size window over each user's tweets to build sequences.

    Tweets are ordered by ``post_created``. For every user, the first
    ``max_days`` distinct posting dates are kept and a window spanning
    ``sequence_length`` of those dates is slid across them. Within the tweets
    posted inside each date window, every run of ``sequence_length``
    successive tweets becomes one sequence.

    Notes
    -----
    * ``sequence_length`` is used both as the number of distinct dates per
      window and as the number of tweets per sequence.
    * The dates are distinct posting dates; they are not checked for being
      adjacent calendar days.
    * With the defaults at most one date window exists per user. When
      ``sequence_length`` is smaller than the number of kept dates, the date
      windows overlap and the same run of tweets can be emitted more than once.
    * The label is the emotion of the last tweet of the sequence, which is
      also the final entry of that sequence's ``emotions`` list; drop that
      entry before using ``emotions`` as a model input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``user_id``, ``post_created``, ``emotion``,
        ``hour_of_day`` and ``overnight``.
    sequence_length : int, default 10
        Number of tweets per sequence (and of distinct dates per window).
    max_days : int or None, default 10
        How many of a user's earliest distinct posting dates to consider.
        ``None`` removes the cap.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        One row per sequence with the columns ``user_id``, ``emotions``,
        ``timestamps``, ``overnight`` and ``times_of_day`` (each of the last
        four holding a list of ``sequence_length`` values), and the matching
        labels. Both are empty, with the columns still present, when no user
        has enough data.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    # 'times_of_day' is appended last so existing column positions are unchanged.
    columns = ['user_id', 'emotions', 'timestamps', 'overnight', 'times_of_day']

    # Sort DataFrame by timestamp
    df = df.sort_values(by='post_created')

    sequences = []
    labels = []

    # sort=False keeps users in order of first appearance in the sorted frame.
    for user_id, user_data in df.groupby('user_id', sort=False):
        # Parse the posting dates once per user instead of once per window.
        tweet_dates = pd.to_datetime(user_data['post_created']).dt.date
        unique_dates = tweet_dates.unique()[:max_days]

        # Slide a window of `sequence_length` distinct dates over the kept dates.
        for i in range(len(unique_dates) - sequence_length + 1):
            start_date = unique_dates[i]
            end_date = unique_dates[i + sequence_length - 1]
            window = user_data[tweet_dates.between(start_date, end_date)]

            # Pull each column out once; the tweet windows below are list slices.
            emotions = window['emotion'].tolist()
            timestamps = window['post_created'].tolist()
            times_of_day = window['hour_of_day'].tolist()
            overnight = window['overnight'].tolist()

            # The range is empty when the window holds fewer than
            # `sequence_length` tweets, so no separate length check is needed.
            for j in range(len(window) - sequence_length + 1):
                stop = j + sequence_length
                sequences.append({
                    'user_id': user_id,
                    'emotions': emotions[j:stop],
                    'timestamps': timestamps[j:stop],
                    'overnight': overnight[j:stop],
                    'times_of_day': times_of_day[j:stop],
                })

                # Label for the sequence is the emotion of the last tweet
                labels.append(emotions[stop - 1])

    # Only force a dtype for the empty case; otherwise let pandas infer it.
    label_dtype = None if labels else object
    return (
        pd.DataFrame(sequences, columns=columns),
        pd.Series(labels, dtype=label_dtype),
    )

In [31]:
# Build the sequences and labels using the function's default sequence_length.
# create_tweet_sequences reads df['overnight'], so that column must already exist on df.
sequences_df, labels = create_tweet_sequences(df)

In [32]:
# Preview the generated sequences.
# NOTE(review): the saved output below has no 'overnight' column although
# create_tweet_sequences emits one; it appears stale, so re-run to refresh it.
sequences_df.head()

Unnamed: 0,user_id,emotions,timestamps,times_of_day
0,454311273,"[anger, joy, joy, joy, joy, joy, sadness, joy,...","[2012-12-08 20:36:17+00:00, 2012-12-08 20:37:2...","[20, 20, 20, 20, 20, 20, 20, 20, 20, 20]"
1,454311273,"[joy, joy, joy, joy, joy, sadness, joy, joy, s...","[2012-12-08 20:37:21+00:00, 2012-12-08 20:37:2...","[20, 20, 20, 20, 20, 20, 20, 20, 20, 0]"
2,454311273,"[joy, joy, joy, joy, sadness, joy, joy, sadnes...","[2012-12-08 20:37:29+00:00, 2012-12-08 20:40:0...","[20, 20, 20, 20, 20, 20, 20, 20, 0, 0]"
3,454311273,"[joy, joy, joy, sadness, joy, joy, sadness, jo...","[2012-12-08 20:40:00+00:00, 2012-12-08 20:40:5...","[20, 20, 20, 20, 20, 20, 20, 0, 0, 0]"
4,454311273,"[joy, joy, sadness, joy, joy, sadness, joy, jo...","[2012-12-08 20:40:53+00:00, 2012-12-08 20:41:2...","[20, 20, 20, 20, 20, 20, 0, 0, 0, 0]"


In [33]:
# Preview the labels (the emotion of the last tweet in each sequence).
labels.head()

0    sadness
1        joy
2        joy
3        joy
4        joy
dtype: object