In [57]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [58]:
#https://textblob.readthedocs.io/en/dev/install.html
from textblob import TextBlob
import pandas as pd
import numpy as np
import collections
import datetime
import calendar
import math
import time
import re
import os

In [59]:
def clean_txt(text):
    text = re.sub(r"(?:\@)\S+", "", text) #Removing @mentions
    text = re.sub('#', '', text) #Removing '#' hash tag
    text = re.sub('RT[\s]+', '', text) #Removing RT
    text = re.sub('https?:\/\/\S+', '', text) #Removing hyperlink
    text = text.strip() #Removing leading and trailing spaces 
    text = text.lower() #Make tweet text lowercase
    return text

In [60]:
def analyze_sentiment(row, tol):
    tb = TextBlob(row.tweet)
    row['polarity'] = tb.sentiment.polarity
    row['subjectivity'] = tb.sentiment.subjectivity
    if abs(row['polarity']) <= tol:
        row['tone'] = 'Neutral'	    
    elif row['polarity'] < 0: 
        row['tone'] = 'Negative'
    else:
        row['tone'] = 'Positive'
    return row

In [61]:
def aggregate(df, symbol, granularity, features=None, tone_tolerance=0, one_hot_encode=False, **get_dummies_kwargs):
    """
    Parameter(s):
    -------------
    df : DataFrame
        A dataframe of tweet data. The dataframe is assumed to be generated by twint.
    symbol : string
        String representing the company's stock symbol (Ex: 'AAPL').
    granularity : string
        Pandas date offset frequency string. 'H' for by the hour, "T" for by the minute, "S" for by the second.
    features : array of strings or None
        String array containing the names of the 21 possible features.
        ['date', 'username', 'tweet', 'replies_count', 'retweets_count',
        'likes_count', 'subjectivity', 'polarity', 'tone', 'num_tweets',
        'replies_sum', 'retweets_sum', 'likes_sum','subjectivity_sum', 
        'polarity_sum', 'replies_avg', 'retweets_avg', 'likes_avg', 
        'subjectivity_avg', 'polarity_avg', 'tone_most_common']
        If None is passed in (the default), all features are used.
    tone_tolerance : float
        If abs(polarity score) <= tone_tolerance, label the tweet as neutral.
    one_hot_encode : bool
        If True, one-hot encodes any categorical features specified using Pandas' `get_dummies()` function.
    get_dummies_kwargs : dict
        Keyword arguments for Pandas' `get_dummies(...)` function. Only has an affect if `one_hot_encode` is True.

    Returns:
    --------
    DataFrame grouped by the given granularity with the columns specified in features.
    If features is the empty list, returns None.
    """

    # https://docs.python-guide.org/writing/gotchas/
    if features is None:
        features = ['date', 'username', 'tweet', 'cashtags', 'replies_count', 
                    'retweets_count', 'likes_count', 'subjectivity', 'polarity',
                    'tone', 'num_tweets','replies_sum', 'retweets_sum', 'likes_sum', 
                    'subjectivity_sum', 'polarity_sum', 'replies_avg', 'retweets_avg',
                    'likes_avg', 'subjectivity_avg', 'polarity_avg', 'tone_most_common']
    else:
        if len(features) == 0:
            return None
        features = features[:]
    
    #If empty dataframe, return an empty dataframe
    if len(df) == 0:
      print('EMPTY CSV FILE')
      return pd.DataFrame(columns = features)

    # Only keep rows where the tweet language is in English
    df = df.loc[df['language'] == 'en',:]
    
    # Drop rows that are NAN for all columns
    keep_cols = ["date", "username", "tweet", "cashtags", "replies_count", "retweets_count", "likes_count"]
    df = df.copy(deep=True).loc[:, keep_cols].dropna(how='all')

    # Replace any NaN values in numerical columns with 0 and in non-numerical columns with ''
    df['username'].fillna('', inplace=True)
    df['tweet'].fillna('', inplace=True)
    df['replies_count'].fillna(0, inplace=True)
    df['retweets_count'].fillna(0, inplace=True)
    df['likes_count'].fillna(0, inplace=True)

    # Clean the tweet text
    df['tweet'] = df['tweet'].apply(clean_txt)

    # Only keep rows where the company's stock symbol is in the cleaned tweet text
    df = df[df['tweet'].str.contains(symbol.lower())].reset_index(drop=True)

    # Convert columns to correct type
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['replies_count'] = df['replies_count'].astype(int)
    df['retweets_count'] = df['retweets_count'].astype(int)
    df['likes_count'] = df['likes_count'].astype(int)

    #SENTIMENT ANALYSIS
    # Create three new columns: 'Subjectivity', 'Polarity', and 'Tone' with TextBlob sentiment analysis
    df = df.apply(lambda row: analyze_sentiment(row, tone_tolerance), axis=1)

    #GROUP BY TIME
    # Group the data based on the granularity requested + sort it
    df.set_index('date',inplace=True) #.resample() requires the date to be the index
    grouped_twitter = df.resample(granularity).agg({'username':list,
                                                    'tweet':list,
                                                    'cashtags':list,
                                                    'replies_count':list,
                                                    'retweets_count':list,
                                                    'likes_count':list,
                                                    'subjectivity':list,
                                                    'polarity':list,
                                                    'tone':list
                                                   })
    grouped_twitter.reset_index(inplace=True)
    grouped_twitter.sort_values(by=['date'], inplace=True)

    # Get features for _ per time
    if 'num_tweets' in features:
        grouped_twitter['num_tweets'] = [len(x) for x in grouped_twitter['tweet']]
    if 'replies_sum' in features:
        grouped_twitter['replies_sum'] = [sum(x) for x in grouped_twitter['replies_count']]
    if 'retweets_sum' in features:
        grouped_twitter['retweets_sum'] = [sum(x) for x in grouped_twitter['retweets_count']]
    if 'likes_sum' in features:
        grouped_twitter['likes_sum'] = [sum(x) for x in grouped_twitter['likes_count']]
    if 'subjectivity_sum' in features:
        grouped_twitter['subjectivity_sum'] = [sum(x) for x in grouped_twitter['subjectivity']]
    if 'polarity_sum' in features:
        grouped_twitter['polarity_sum'] = [sum(x) for x in grouped_twitter['polarity']]
    if 'replies_avg' in features:
        grouped_twitter['replies_avg'] = [np.mean(x) for x in grouped_twitter['replies_count']]
    if 'retweets_avg' in features:
        grouped_twitter['retweets_avg'] = [np.mean(x) for x in grouped_twitter['retweets_count']]
    if 'likes_avg' in features:
        grouped_twitter['likes_avg'] = [np.mean(x) for x in grouped_twitter['likes_count']]
    if 'subjectivity_avg' in features:
        grouped_twitter['subjectivity_avg'] = [np.mean(x) for x in grouped_twitter['subjectivity']]
    if 'polarity_avg' in features:
        grouped_twitter['polarity_avg'] = [np.mean(x) for x in grouped_twitter['polarity']]
    if 'tone_most_common' in features:
        most_common = pd.Series([max(set(lst), key=lst.count) if len(lst)>0 else 'Neutral' for lst in grouped_twitter['tone']])
        if one_hot_encode:
            encoded = pd.get_dummies(most_common, **get_dummies_kwargs)
            grouped_twitter = pd.concat([grouped_twitter, encoded], axis=1)
            features.extend(encoded.columns.tolist())
            features.remove('tone_most_common')
        else:
            grouped_twitter['tone_most_common'] = most_common
        
    return grouped_twitter.loc[:, features]

In [62]:
def combine_files(month_files):
    dfs = []
    for file in month_files:
        current_df = pd.read_csv(file, compression='gzip',engine='python')
        dfs.append(current_df)
    final_df = pd.concat(dfs)
    return final_df

In [117]:
#os.listdir() -> list of files within a directory
#os.path.join(path,file) ->  Combines two paths
#os.mkdir() -> creates a new file
#cp! "" copies a file to drive

#Every time you run, make sure to update the stock symbol at the end of the following file path
current_stock_symbol = 'DASH'
stock_symbol_location = '/content/gdrive/MyDrive/ieor142_twitter/twitter_data/Dash'

In [118]:
final_df = pd.DataFrame(columns=['date', 'num_tweets', 'replies_sum', 'retweets_sum', 'likes_sum', 'subjectivity_sum', 'polarity_sum', 'replies_avg', 'retweets_avg', 'likes_avg', 'subjectivity_avg', 'polarity_avg', 'tone_most_common'])

for year in os.listdir(stock_symbol_location):
  print(year)
  month_path = os.path.join(stock_symbol_location,year)
  for month in os.listdir(month_path):
    print(month)
    day_path = os.path.join(stock_symbol_location,year, month)
    for day in os.listdir(day_path):
      print(day)
      file_path = os.path.join(stock_symbol_location,year, month, day)

      #Read in the compressed csv file for the day
      day_df = pd.read_csv(file_path, compression='gzip',engine='python')
        
      #Clean + Combine (Make sure to update with corresponding stock symbol)
      day_results = aggregate(day_df, current_stock_symbol, 'D',features=['date', 'num_tweets', 'replies_sum', 'retweets_sum', 'likes_sum', 'subjectivity_sum', 'polarity_sum', 'replies_avg', 'retweets_avg', 'likes_avg', 'subjectivity_avg', 'polarity_avg', 'tone_most_common'])

      #Append to final results
      final_df = final_df.append(day_results)

tweets2021
03
2021-03-20-tweets-%24DASH-compressed.csv
2021-03-27-tweets-%24DASH-compressed.csv
2021-03-18-tweets-%24DASH-compressed.csv
2021-03-28-tweets-%24DASH-compressed.csv
2021-03-21-tweets-%24DASH-compressed.csv
2021-03-17-tweets-%24DASH-compressed.csv
2021-03-19-tweets-%24DASH-compressed.csv
2021-03-05-tweets-%24DASH-compressed.csv
2021-03-15-tweets-%24DASH-compressed.csv
2021-03-29-tweets-%24DASH-compressed.csv
2021-03-22-tweets-%24DASH-compressed.csv
2021-03-06-tweets-%24DASH-compressed.csv
2021-03-25-tweets-%24DASH-compressed.csv
2021-03-02-tweets-%24DASH-compressed.csv
2021-03-26-tweets-%24DASH-compressed.csv
2021-03-16-tweets-%24DASH-compressed.csv
2021-03-03-tweets-%24DASH-compressed.csv
2021-03-14-tweets-%24DASH-compressed.csv
2021-03-24-tweets-%24DASH-compressed.csv
2021-03-04-tweets-%24DASH-compressed.csv
2021-03-30-tweets-%24DASH-compressed.csv
2021-03-07-tweets-%24DASH-compressed.csv
2021-03-11-tweets-%24DASH-compressed.csv
2021-03-31-tweets-%24DASH-compressed.csv
20

In [119]:
final_df['date'] = pd.to_datetime(final_df["date"])
final_df_sorted = final_df.sort_values(by=["date"])
final_df_sorted.head()

Unnamed: 0,date,num_tweets,replies_sum,retweets_sum,likes_sum,subjectivity_sum,polarity_sum,replies_avg,retweets_avg,likes_avg,subjectivity_avg,polarity_avg,tone_most_common
0,2021-01-01,667,752,1995,7606,212.009402,61.974469,1.127436,2.991004,11.403298,0.317855,0.092915,Neutral
0,2021-01-02,252,94,190,1041,72.948675,22.756153,0.373016,0.753968,4.130952,0.289479,0.090302,Neutral
0,2021-01-03,215,183,550,1810,69.69721,30.378971,0.851163,2.55814,8.418605,0.324173,0.141298,Neutral
0,2021-01-04,294,190,315,1594,84.568408,35.793351,0.646259,1.071429,5.421769,0.287648,0.121746,Neutral
0,2021-01-05,188,67,115,706,67.062876,25.948826,0.356383,0.611702,3.755319,0.356717,0.138026,Positive


In [120]:
#Update with current stock symbol
file_name = current_stock_symbol + ".csv"
csv_file = final_df_sorted.to_csv(file_name)
!cp DASH.csv '/content/gdrive/MyDrive/ieor142_twitter'