# Data Preparation


In [47]:
# Import the required libraries
import pandas as pd
import numpy as np
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
 

In [48]:
# Functions 
usa_states_fullname_regex = '(ALABAMA|ALASKA|ARIZONA|ARKANSAS|CALIFORNIA|COLORADO|CONNECTICUT|DELAWARE|FLORIDA|GEORGIA|HAWAII|' \
                            'IDAHO|ILLINOIS|INDIANA|IOWA|KANSAS|KENTUCKY|LOUISIANA|MAINE|MARYLAND|MASSACHUSETTS|MICHIGAN|MINNESOTA|MISSISSIPPI|MISSOURI|MONTANA|'\
                            'NEBRASKA|NEVADA|NEW\sHAMPSHIRE|NEWSJERSEY|NEW\sMEXICO|NEW\sYORK|NORTH\sCAROLINA|NORTH\sDAKOTA|'\
                            'OHIO|OKLAHOMA|OREGON|PENNSYLVANIA RHODE\sISLAND|'\
                            'SOUTH\sCAROLINA|SOUTH\sDAKOTA|TENNESSEE|TEXAS|UTAH|VERMONT|VIRGINIA|WASHINGTON|WEST\sVIRGINIA|WISCONSIN|WYOMING|USA)'


usa_states_regex = ',\s{1}(A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|P[AR]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])'

#  Functions for Duplicate checks 
def get_exact_dups(df):
    '''
    Returns duplicates
    '''
    dups = df[df.duplicated()]
    return dups

def get_tweet_dups(df, col_names):
    '''
    Returns duplicates based on given column name
    '''
    dups = df[df.duplicated(subset=col_names)]
    return dups

def get_is_us_loc(loc_string):
    matches_abbrev = bool(re.search(usa_states_regex, loc_string.upper()))
    if not matches_abbrev:
        matches_full_name = bool(re.search(usa_states_fullname_regex, loc_string.upper())) 
    return (matches_abbrev or matches_full_name)

# Functions to call sentiment tools
def get_vader_sentiment(analyzer, tweet):
    tweet = tweet.replace('#','')  # we want things like #fail to be included in text
    vader_scores = analyzer.polarity_scores(tweet)
    compound_score = vader_scores['compound']
    vader_sentiment = None
    # using thresholds from VADER developers/researchers
    if (compound_score >= 0.05):
        vader_sentiment = 'positive'
    elif (compound_score < 0.05 and compound_score > -0.05):
        vader_sentiment = 'neutral'
    elif (compound_score <= -0.05):
        vader_sentiment = 'negative'
    return vader_sentiment

def get_text_blob_sentiment(tweet):
    polarity = TextBlob(tweet).sentiment.polarity
    # The polarity score is a float within the range [-1.0, 1.0]. 
    textblob_sentiment = None
    if (polarity > 0):
        textblob_sentiment = 'positive'
    elif (polarity == 0):
        textblob_sentiment = 'neutral'
    elif (polarity < 0):
        textblob_sentiment = 'negative'
    return textblob_sentiment  
    

def get_tools_match(vader_sentiment, tb_sentiment):
    return vader_sentiment == tb_sentiment

def get_tool_and_human_match(vader_sentiment, tb_sentiment, human_sentiment):
    return ((human_sentiment == vader_sentiment) or (human_sentiment == tb_sentiment))


In [104]:
external_data = pd.read_csv('../external_data/tweets_raw.csv', index_col='Unnamed: 0')
print(external_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202645 entries, 0 to 202644
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0.1   202645 non-null  int64 
 1   Content        202645 non-null  object
 2   Location       155123 non-null  object
 3   Username       202645 non-null  object
 4   Retweet-Count  202645 non-null  int64 
 5   Favorites      202645 non-null  int64 
 6   Created at     202645 non-null  object
dtypes: int64(3), object(4)
memory usage: 12.4+ MB
None


In [105]:
#rename columns
external_data.rename(columns = {'Unnamed: 0.1':'the_id', 'Content':'content', 'Location':'user_loc', 'Username':'user_screen_name', 
    'Retweet-Count':'retweet_count', 'Favorites':'fav_count', 'Created at': 'created_at'}, inplace = True) 
external_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202645 entries, 0 to 202644
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   the_id            202645 non-null  int64 
 1   content           202645 non-null  object
 2   user_loc          155123 non-null  object
 3   user_screen_name  202645 non-null  object
 4   retweet_count     202645 non-null  int64 
 5   fav_count         202645 non-null  int64 
 6   created_at        202645 non-null  object
dtypes: int64(3), object(4)
memory usage: 12.4+ MB


In [109]:
# Drop the rows with null user_location and Duplicated content
external_data.dropna(subset=['user_loc'], inplace=True)
external_data.drop_duplicates(subset=['content'], inplace=True)
print(external_data.shape)

(139410, 7)


In [113]:
# Check location for a US state (using regex). We only want to use Tweets with a US location
external_data['is_us_loc'] = external_data.apply(lambda row: get_is_us_loc(row['user_loc']), axis=1)
us_only_data = external_data[external_data['is_us_loc'] == True]
us_only_data.shape
us_only_data.drop('is_us_loc', axis=1, inplace=True)

In [114]:
# Get the char count
us_only_data['char_count'] = us_only_data.apply(lambda row: len(row['content']), axis=1)
us_only_data.tail()

Unnamed: 0,the_id,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count
202634,2676,Told my kids this was real #virtuallearning #s...,"Knoxville, TN",donnyr71,0,5,2020-08-06 00:47:55,84
202635,2677,Attention ALL students transitioning to colleg...,"Atlanta, GA",TeeTaylorMade,0,1,2020-08-06 00:47:24,277
202636,2678,Attention ALL students transitioning to colleg...,"Chicago, IL",EZTAYLORFDN,0,1,2020-08-06 00:47:09,277
202637,2679,I work with the best English 10 teacher in the...,"Roanoke, VA",abchitwood,0,2,2020-08-06 00:42:25,253
202639,2681,Join us on August 21st at 12 pm CDT for Virtua...,"Tulsa, OK",LindaJatJCG,0,0,2020-08-06 00:30:58,128


In [116]:
# Keep only the tweets with 100 and above characters
us_only_data = us_only_data[us_only_data['char_count'] >= 100]
us_only_data.shape

(58941, 8)

In [124]:
# Get the VADER and TextBlob sentiments
analyzer = SentimentIntensityAnalyzer()
us_only_data['vader_sentiment'] = us_only_data.apply(lambda row: get_vader_sentiment(analyzer, row['content']), axis=1)
us_only_data['text_blob_sentiment'] = us_only_data.apply(lambda row: get_text_blob_sentiment(row['content']), axis=1)

# Mark where the two tools agree
us_only_data['tools_match'] = us_only_data.apply(lambda row: get_tools_match(row['vader_sentiment'], row['text_blob_sentiment']), axis=1)

# keep only the tweets where the tools agree 
us_only_data = us_only_data[us_only_data['tools_match']]
us_only_data.head(3)

Unnamed: 0,the_id,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count,sentiment_method,sentiment,vader_sentiment,text_blob_sentiment,tools_match
9,9,“Instructional Considerations for the 2020-21 ...,"Illinois, USA",Erik_Youngman,0,2,2020-08-02 00:10:26,276,tools,neutral,neutral,neutral,True
10,10,With all the uncertainty of what September wil...,"Lyndhurst, NJ",Renee_LoBue,0,0,2020-08-01 23:57:31,264,tools,positive,positive,positive,True
11,11,Check this out on Wakelet - Digital learning a...,"Cary, NC",SupriyaVasu,0,0,2020-08-01 23:20:38,133,tools,neutral,neutral,neutral,True


In [125]:
# Now create cols for sentiment and sentiment method
us_only_data['sentiment_method'] = 'tools'
us_only_data['sentiment'] = us_only_data.apply(lambda row: row['vader_sentiment'], axis=1)

# delete the now un-needed columns
us_only_data.drop('vader_sentiment', axis=1, inplace=True)
us_only_data.drop('text_blob_sentiment', axis=1, inplace=True)
us_only_data.drop('tools_match', axis=1, inplace=True)
us_only_data.head(3)

Unnamed: 0,the_id,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count,sentiment_method,sentiment
9,9,“Instructional Considerations for the 2020-21 ...,"Illinois, USA",Erik_Youngman,0,2,2020-08-02 00:10:26,276,tools,neutral
10,10,With all the uncertainty of what September wil...,"Lyndhurst, NJ",Renee_LoBue,0,0,2020-08-01 23:57:31,264,tools,positive
11,11,Check this out on Wakelet - Digital learning a...,"Cary, NC",SupriyaVasu,0,0,2020-08-01 23:20:38,133,tools,neutral
12,12,Happy Friendship Day!\n#rdnums #nagaland #kohi...,"Kohima, India",rdnums,2,1,2020-08-01 23:17:09,264,tools,positive
13,13,Beat the summer heat with over 400 cool games ...,"Providence, RI",ABCyaGames,0,2,2020-08-01 23:00:00,146,tools,positive


In [126]:
# SAVE to file
us_only_data.to_csv('../data/us_only_external_data_tweets_TOOL_labeled.csv', index='the_id')

## Now prepare the data I collected in January.  
### Only keep tweets of 100 characters or more (same criteria applied to external dataset of tweets)
### I labeled 356 tweets for sentiment. For time contraints, use the tools to labels the rest - use sentiment from tools where both agree.

### Tweets from Query in January - HUMAN sentiment label

In [71]:
human_labeled_q_tweets =  pd.read_csv('../data/jan_queried_tweets_HUMAN_labeled.csv', index_col='_id')
human_labeled_q_tweets.head(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,sentiment
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5ffde71b5e4953000d99dc7c,1.348996e+18,These really are critical.\n2yrs ago I took 9 ...,"Texas, USA",summers_llm,0,0,2021-01-12 14:11:21,neutral
5ffde71b5e4953000d99dc7d,1.348986e+18,ConnectEd After School Lesson Grades K-2\nThur...,"North Dakota, USA",ncecnd,0,0,2021-01-12 13:30:04,neutral
5ffde71b5e4953000d99dc7e,1.348978e+18,Don't forget to register for our FREE Remote a...,New York City,ReadWorks,0,0,2021-01-12 13:01:36,neutral


In [72]:
# Keep only the tweets 100 chars and over
human_labeled_q_tweets['char_count'] = human_labeled_q_tweets.apply(lambda row: len(row['content']), axis=1)
human_labeled_q_tweets = human_labeled_q_tweets[human_labeled_q_tweets['char_count'] >= 100]

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,sentiment,char_count
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5fff3c1ae4a2323a8dcaedb0,1.348694e+18,"To view the ""COVID-19 Fact Sheet K-12 School Q...","Carlisle, OH",CarlisleLocal,1,2,2021-01-11 18:11:22,neutral,156
5fff3c1ae4a2323a8dcaedb2,1.348672e+18,Teachers K-12 are now eligible to get vaccinat...,Utah,UtahCoronavirus,10,39,2021-01-11 16:43:02,neutral,257
5fff3c1be4a2323a8dcaedb4,1.348653e+18,Some good news: my county (Loudoun) has offici...,"Ashburn, Virginia",SueFliess,0,7,2021-01-11 15:29:35,positive,196


In [79]:
# Flag these tweets as having their sentiment set by a human
human_labeled_q_tweets['sentiment_method'] = 'human'
human_labeled_q_tweets.tail(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,sentiment,char_count,sentiment_method
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5fff3c1ae4a2323a8dcaedb0,1.348694e+18,"To view the ""COVID-19 Fact Sheet K-12 School Q...","Carlisle, OH",CarlisleLocal,1,2,2021-01-11 18:11:22,neutral,156,human
5fff3c1ae4a2323a8dcaedb2,1.348672e+18,Teachers K-12 are now eligible to get vaccinat...,Utah,UtahCoronavirus,10,39,2021-01-11 16:43:02,neutral,257,human
5fff3c1be4a2323a8dcaedb4,1.348653e+18,Some good news: my county (Loudoun) has offici...,"Ashburn, Virginia",SueFliess,0,7,2021-01-11 15:29:35,positive,196,human


In [80]:
human_labeled_q_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 348 entries, 5ffde71b5e4953000d99dc7c to 5fff3c1be4a2323a8dcaedb4
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id_str            348 non-null    float64
 1   content           348 non-null    object 
 2   user_loc          348 non-null    object 
 3   user_screen_name  348 non-null    object 
 4   retweet_count     348 non-null    int64  
 5   fav_count         348 non-null    int64  
 6   created_at        348 non-null    object 
 7   sentiment         348 non-null    object 
 8   char_count        348 non-null    int64  
 9   sentiment_method  348 non-null    object 
dtypes: float64(1), int64(3), object(6)
memory usage: 29.9+ KB


In [81]:
# save to file 
human_labeled_q_tweets.to_csv('../data/jan_2021_queried_tweets_HUMAN_labeled.csv', index_label='_id')

### Tweets from query in January - not labeled yet


In [135]:
q_tweets =  pd.read_csv('../data/jan_queried_tweets_NO_label.csv', index_col='_id')
q_tweets.head(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5ffde71f5e4953000d99dd55,1.348735e+18,Virtual learning has caused a fair share of fr...,Wisconsin,Edficiency,0,0,2021-01-11 20:55:39
5ffde71f5e4953000d99dd58,1.348704e+18,Preparing for high school is an exciting time!...,"St. Augustine, Florida",SJCSD,0,0,2021-01-11 18:51:41
5ffde71f5e4953000d99dd59,1.348685e+18,Updated list of Wi-Fi Hotspots in Accomack Cou...,"Oak Hall, Virginia",AMSPANTHERS,2,2,2021-01-11 17:33:40


In [136]:
q_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 222 entries, 5ffde71f5e4953000d99dd55 to 5fff3c11e4a2323a8dcaed8b
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id_str            222 non-null    float64
 1   content           222 non-null    object 
 2   user_loc          222 non-null    object 
 3   user_screen_name  222 non-null    object 
 4   retweet_count     222 non-null    int64  
 5   fav_count         222 non-null    int64  
 6   created_at        222 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 13.9+ KB


In [137]:
# Keep only the tweets 100 chars and over
q_tweets['char_count'] = q_tweets.apply(lambda row: len(row['content']), axis=1)
q_tweets = q_tweets[q_tweets['char_count'] >= 100]
q_tweets.tail(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5ffdfb71d17caa7111610ee9,1.347343e+18,@Eliana67045140 @UFT @ChalkbeatNY @NYCMayor @N...,NYC,EducationMatt17,0,0,2021-01-08 00:44:09,362
5ffe2b47fcdca70acef8117a,1.346892e+18,Learn 3 easy ways to take #RemoteLearning to t...,"Boulder, Colorado",wcet_info,0,0,2021-01-06 18:52:00,156
5fff3c11e4a2323a8dcaed8b,1.34902e+18,My husband (a middle school music teacher) and...,"Albany, NY",K_J_Waite,1,6,2021-01-12 15:47:13,156


In [138]:
# Get the VADER and TextBlob sentiments
analyzer = SentimentIntensityAnalyzer()
q_tweets['vader_sentiment'] = q_tweets.apply(lambda row: get_vader_sentiment(analyzer, row['content']), axis=1)
q_tweets['text_blob_sentiment'] = q_tweets.apply(lambda row: get_text_blob_sentiment(row['content']), axis=1)

# Mark where the two tools agree
q_tweets['tools_match'] = q_tweets.apply(lambda row: get_tools_match(row['vader_sentiment'], row['text_blob_sentiment']), axis=1)

# keep only the tweets where the tools agree 
q_tweets = q_tweets[q_tweets['tools_match']]
print(q_tweets.shape)
q_tweets.head(3)


(130, 11)


Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count,vader_sentiment,text_blob_sentiment,tools_match
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5ffde71f5e4953000d99dd55,1.348735e+18,Virtual learning has caused a fair share of fr...,Wisconsin,Edficiency,0,0,2021-01-11 20:55:39,272,positive,positive,True
5ffde71f5e4953000d99dd58,1.348704e+18,Preparing for high school is an exciting time!...,"St. Augustine, Florida",SJCSD,0,0,2021-01-11 18:51:41,221,positive,positive,True
5ffde71f5e4953000d99dd5d,1.348672e+18,The Nature-based 4K Parent Night for enrollmen...,"Newburg, WI",RiveredgeNC,0,0,2021-01-11 16:42:40,276,positive,positive,True


In [139]:
# Now create cols for sentiment and sentiment method
q_tweets['sentiment_method'] = 'tools'
q_tweets['sentiment'] = q_tweets.apply(lambda row: row['vader_sentiment'], axis=1)

# delete the now un-needed columns
q_tweets.drop('vader_sentiment', axis=1, inplace=True)
q_tweets.drop('text_blob_sentiment', axis=1, inplace=True)
q_tweets.drop('tools_match', axis=1, inplace=True)
q_tweets.head(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count,sentiment_method,sentiment
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5ffde71f5e4953000d99dd55,1.348735e+18,Virtual learning has caused a fair share of fr...,Wisconsin,Edficiency,0,0,2021-01-11 20:55:39,272,tools,positive
5ffde71f5e4953000d99dd58,1.348704e+18,Preparing for high school is an exciting time!...,"St. Augustine, Florida",SJCSD,0,0,2021-01-11 18:51:41,221,tools,positive
5ffde71f5e4953000d99dd5d,1.348672e+18,The Nature-based 4K Parent Night for enrollmen...,"Newburg, WI",RiveredgeNC,0,0,2021-01-11 16:42:40,276,tools,positive


In [140]:
# save to file 
q_tweets.to_csv('../data/jan_2021_queried_tweets_TOOL_labeled.csv', index_label='_id')

### Now prepare the Tweets that I collected via Streaming over several days in January


In [141]:
s_tweets =  pd.read_csv('../data/jan_streaming_tweets_NO_label.csv', index_col='_id')
s_tweets.head(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5ffde42364e25f9f26125929,1.349054e+18,Open Forum!\n\nVirtual Parent Hangout - Januar...,"Indianapolis, IN",ISDHoosiers,0,0,2021-01-12 18:02:06
5ffde4c064e25f9f2612592a,1.349055e+18,@besf0rt Never forget in 2000 profiling a Japa...,Florida hellmouth,ImperialeNancy,0,0,2021-01-12 18:04:43
5ffde54d64e25f9f2612592b,1.349055e+18,"“This year, the “mothers and others” are turni...","Suburban DC, Maryland",gunsensemelissa,0,0,2021-01-12 18:07:04


In [142]:
s_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 386 entries, 5ffde42364e25f9f26125929 to 6000ae13ea6b89c2fd8a27be
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id_str            386 non-null    float64
 1   content           386 non-null    object 
 2   user_loc          386 non-null    object 
 3   user_screen_name  386 non-null    object 
 4   retweet_count     386 non-null    int64  
 5   fav_count         386 non-null    int64  
 6   created_at        386 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 24.1+ KB


In [143]:
# Keep only the tweets 100 chars and over
s_tweets['char_count'] = s_tweets.apply(lambda row: len(row['content']), axis=1)
s_tweets = s_tweets[s_tweets['char_count'] >= 100]
s_tweets.tail(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6000aafbea6b89c2fd8a27ba,1.349817e+18,"For teaching at K-12, you need at least a bach...",USA,JIMMYESL,0,0,2021-01-14 20:35:02,306
6000abcdea6b89c2fd8a27bc,1.349818e+18,Pennyrile Health Department Expects COVID Vacc...,"Princeton, KY",wpkyonline,0,0,2021-01-14 20:38:32,243
6000ae13ea6b89c2fd8a27be,1.349821e+18,"@joml76 My husband is a K-12 school employee, ...","Michigan, USA",tvfan201,0,0,2021-01-14 20:48:14,192


In [144]:
# Get the VADER and TextBlob sentiments
analyzer = SentimentIntensityAnalyzer()
s_tweets['vader_sentiment'] = s_tweets.apply(lambda row: get_vader_sentiment(analyzer, row['content']), axis=1)
s_tweets['text_blob_sentiment'] = s_tweets.apply(lambda row: get_text_blob_sentiment(row['content']), axis=1)

# Mark where the two tools agree
s_tweets['tools_match'] = s_tweets.apply(lambda row: get_tools_match(row['vader_sentiment'], row['text_blob_sentiment']), axis=1)

# keep only the tweets where the tools agree 
s_tweets = s_tweets[s_tweets['tools_match']]
print(s_tweets.shape)
s_tweets.head(3)

(193, 11)


Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count,vader_sentiment,text_blob_sentiment,tools_match
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5ffde60d64e25f9f2612592c,1.349056e+18,"If a student is missing class a bunch, their g...","Provo, UT",avatargrace,0,0,2021-01-12 18:10:16,275,negative,negative,True
5ffdf9c180d443513a191637,1.349077e+18,"@JamesTodaroMD @ConservMama17 I went to proms,...",Mountains of California,AudreyJeanne,0,0,2021-01-12 19:34:19,308,positive,positive,True
5ffdfcbbed845d2d2c79e966,1.34908e+18,Last weeks blog by @RachelJTeaches provides an...,"Arlington, VA",intellispark,0,0,2021-01-12 19:47:01,200,positive,positive,True


In [145]:
# Now create cols for sentiment and sentiment method
s_tweets['sentiment_method'] = 'tools'
s_tweets['sentiment'] = s_tweets.apply(lambda row: row['vader_sentiment'], axis=1)
s_tweets.head(3)

# delete the now un-needed columns
s_tweets.drop('vader_sentiment', axis=1, inplace=True)
s_tweets.drop('text_blob_sentiment', axis=1, inplace=True)
s_tweets.drop('tools_match', axis=1, inplace=True)
s_tweets.head(3)

Unnamed: 0_level_0,id_str,content,user_loc,user_screen_name,retweet_count,fav_count,created_at,char_count,sentiment_method,sentiment
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5ffde60d64e25f9f2612592c,1.349056e+18,"If a student is missing class a bunch, their g...","Provo, UT",avatargrace,0,0,2021-01-12 18:10:16,275,tools,negative
5ffdf9c180d443513a191637,1.349077e+18,"@JamesTodaroMD @ConservMama17 I went to proms,...",Mountains of California,AudreyJeanne,0,0,2021-01-12 19:34:19,308,tools,positive
5ffdfcbbed845d2d2c79e966,1.34908e+18,Last weeks blog by @RachelJTeaches provides an...,"Arlington, VA",intellispark,0,0,2021-01-12 19:47:01,200,tools,positive


In [146]:
# save to file 
s_tweets.to_csv('../data/jan_2021_streaming_tweets_TOOL_labeled.csv', index_label='_id')