In [4]:
'''
Sentiment analysis of tweets using the Afinn library.  Sentiment is determined for individual tweets as well as in aggregate
for each day on which tweets are available.


Created on Jun 18, 2020

@author: mark
'''
import os
import numpy as np
from os import listdir
import collections
from collections import Counter
from nltk.tokenize import word_tokenize 
import datetime
import csv

from afinn import Afinn
from sklearn.metrics.cluster.tests.test_supervised import score_funcs


class Sentiment:
    """
    Scores tweets with the Afinn library and writes the results as CSV files.

    For every CSV file found in the 'modified' folder three files are written
    to the 'sentiment' folder: the 100 most common tokens, the per-tweet
    scores, and per-day aggregate statistics.
    """

    # the Afinn sentiment library (class attribute shared by all instances)
    afinn = Afinn()

    def get_affinity_score(self, tweet):
        """
        Determine the affinity (sentiment) score of a tweet.

        The Afinn score of the text is divided by the number of characters in
        the tweet.

        @param tweet- the tweet text to score
        @return score- the sentiment score; 0 when the tweet is empty
        """
        # nothing to score (this guard also avoids a division by zero)
        if not tweet:
            return 0

        return self.afinn.score(tweet) / len(tweet)

    def loadData(self):
        """
        Load the csv data from the modified folder and write all outputs.

        The project root is the part of the current working directory that
        precedes the first occurrence of "src" (the whole working directory
        when "src" does not occur in it). Input is read from <root>/modified
        and output is written to <root>/sentiment, which is created when it
        does not exist yet.

        A file that cannot be read or written is reported and skipped so that
        the remaining files are still processed.
        """
        # get the path for the modified and sentiment directories
        pn = os.path.abspath("")
        pn = pn.split("src")[0]
        directory = os.path.join(pn, 'modified')
        output_directory = os.path.join(pn, 'sentiment')

        try:
            filenames = listdir(directory)

            # the writers below need an existing output folder
            os.makedirs(output_directory, exist_ok=True)
        except OSError as err:
            print("Could not access data directories:", err)
            return

        # iterate through the directory
        for f in filenames:

            # skip non-csv files
            if not f.endswith('.csv'):
                continue

            try:
                self._process_file(directory, output_directory, f)
            except OSError:
                # report the path; no file handle may exist at this point
                print("Could not read file:", os.path.join(directory, f))

    def _process_file(self, directory, output_directory, f):
        """
        Score one input csv file and write its three output files.

        @param directory- the folder that contains the input file
        @param output_directory- the folder the results are written to
        @param f- the name of the input csv file
        """
        # rows that are written back out with the added score
        rows = []

        # all tokens of all tweets in the file
        texts = []

        # per-day containers: sentiment scores, tweet texts and retweet counts
        time = {}
        day = {}
        retwts = {}

        # newline='' is what the csv module expects for its file objects
        with open(os.path.join(directory, f), 'r', newline='',
                  encoding='utf-8') as csvfile:

            # read the rows
            for row in csv.DictReader(csvfile):

                # get the tweet text
                text = row['Text']

                # keep only the date part (year-month-day) of the timestamp
                date_part = row['Datetime'].split(" ")[0]
                tweet_date = datetime.datetime.strptime(date_part, '%Y-%m-%d').date()

                # put date object in place of datetime in data
                row['Datetime'] = tweet_date

                # get retweets
                retweets = int(row['Retweets'])

                # get sentiment score of tweet
                score = self.get_affinity_score(text)

                # group score, text and retweets by day
                time.setdefault(tweet_date, []).append(score)
                day.setdefault(tweet_date, []).append(text)
                retwts.setdefault(tweet_date, []).append(retweets)

                # raw sentiment score for a tweet
                row['Score'] = score

                # row data are output as individual tweet data later on
                rows.append(row)

                # tokenize words
                texts.extend(word_tokenize(text))

        # find the 100 most common words for all the data
        top_terms = Counter(texts).most_common(100)

        # most common term output goes to the sentiment output directory
        self.most_common_output(
            top_terms, os.path.join(output_directory, 'common_100' + "_" + f))

        # individual tweet scores
        self.output(rows, os.path.join(output_directory, 'sentiment' + "_" + f))

        # call the date-based sentiment output
        self.doTimeBasedOutput(time, output_directory, day, retwts, f)

    def doTimeBasedOutput(self, date, output_directory, day, retwts, f):
        """
        Create the time-based (one row per day) output for tweets.

        Each row holds the mean, median and standard deviation of the scores,
        the number of tweets, the sum of retweets and the 15 most common
        tokens of that day.

        @param date- dictionary from day to the list of sentiment scores
        @param output_directory- the output directory (the sentiment directory)
        @param day- dictionary from day to the list of tweet texts
        @param retwts- dictionary from day to the list of retweet counts
        @param f- the input file name; the output is 'sentiment_over_time_' + f
        """
        # fieldnames to output
        fieldnames = ['Date', 'Mean Score', 'Median Score', 'Tweets', 'Retweets',
                      'Standard Deviation', 'Top 15']

        # the file output path
        fileOutput = os.path.join(output_directory, 'sentiment_over_time' + "_" + f)

        with open(fileOutput, 'wt', newline='', encoding='utf-8') as csvf:

            # write the output file, starting with the header
            writer = csv.DictWriter(csvf, fieldnames=fieldnames)
            writer.writeheader()

            for t in date:

                # scores, texts and retweets that belong to this day
                inpt = date[t]
                dd = day[t]
                rtweets = retwts[t]

                # get the number of tweets
                n = len(dd)

                # tokens of all tweets of the day
                words = [w for tweet_text in dd for w in word_tokenize(tweet_text)]

                # top 15 terms for the day (terms only, counts dropped)
                tz = [term for term, _count in Counter(words).most_common(15)]

                # get the mean, standard deviation and median of the scores
                mean = np.mean(inpt)
                std = np.std(inpt)
                median = np.median(inpt)

                # get the sum of retweets
                rts = np.sum(rtweets)

                writer.writerow({'Date': str(t),
                                 'Mean Score': str(mean), 'Median Score': str(median),
                                 'Tweets': str(n), 'Retweets': str(rts),
                                 'Standard Deviation': str(std), 'Top 15': str(tz)})

    def most_common_output(self, t, fileOutput):
        """
        Output the most common terms.

        The terms form the header row and their counts the single data row.

        @param t- the most common terms as (term, count) pairs
        @param fileOutput- the file to output the results to
        """
        # fieldnames for the output file, in the order the terms were given
        fieldnames = [term for term, _count in t]

        # the output data organized by the most common terms
        output = dict(t)

        # write the output
        with open(fileOutput, 'wt', newline='', encoding='utf-8') as csvf:
            writer = csv.DictWriter(csvf, fieldnames=fieldnames)

            writer.writeheader()
            writer.writerow(output)

    def output(self, data, fileOutput):
        """
        Output the sentiment for individual tweets.

        @param data- the rows (dictionaries) of the scored tweets; every row
                     needs all of the fieldnames listed below as keys
        @param fileOutput- the file to output data
        """
        # the fieldnames in the output file
        fieldnames = ['Datetime', 'ID', 'Score', 'Link', 'Text', 'Username',
                      'Retweets', 'Hashtags', 'Geolocation']

        with open(fileOutput, 'wt', newline='', encoding='utf-8') as csvf:

            # write the output
            writer = csv.DictWriter(csvf, fieldnames=fieldnames)
            writer.writeheader()

            # iterate through data rows and write out the listed fields as text
            for row in data:
                writer.writerow({name: str(row[name]) for name in fieldnames})

    def run(self):
        """
        Run the sentiment analysis.
        """
        # load the data and run analysis
        self.loadData()

        # finished
        print('Finished')

if __name__ == "__main__":
    # script entry point: build the analyser and process all input files
    s = Sentiment()
    s.run()

Finished


In [8]:
pip install pytest


Collecting pytest
  Using cached pytest-5.4.3-py3-none-any.whl (248 kB)
Collecting more-itertools>=4.0.0
  Downloading more_itertools-8.4.0-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 455 kB/s eta 0:00:01
Collecting py>=1.5.0
  Downloading py-1.9.0-py2.py3-none-any.whl (99 kB)
[K     |████████████████████████████████| 99 kB 1.2 MB/s eta 0:00:01
[?25hCollecting pluggy<1.0,>=0.12
  Using cached pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Collecting packaging
  Using cached packaging-20.4-py2.py3-none-any.whl (37 kB)
[31mERROR: fastai 1.0.61 requires bottleneck, which is not installed.[0m
[31mERROR: fastai 1.0.61 requires numexpr, which is not installed.[0m
[31mERROR: cherrypy 18.6.0 requires cheroot>=8.2.1, which is not installed.[0m
[31mERROR: cherrypy 18.6.0 requires jaraco.collections, which is not installed.[0m
[31mERROR: cherrypy 18.6.0 requires portend>=2.1.1, which is not installed.[0m
[31mERROR: cherrypy 18.6.0 requires zc.lockfile, which

0 10
1 9
2 8
3 7
4 6
5 5
6 4
7 3
8 2
9 1


There's nothing quite like sleeping under the stars. https://t.co/kLkIDjQcmm
Really enjoy the Robert Pattinson bender I &amp; the entire rest of the Internet are on right now
RT @erspamer_matt: ‘90s kids remember 😍 https://t.co/53swJWJQOK
When your friend calls to say they just texted you a clip from an Al Pacino movie https://t.co/EJz80tJryM
RT @longdrivesouth: Journalist gets tear-gas direct in the face by #Bolivia police, reports right through it. https://t.co/iy2ysgoh2Q
RT @birth_marxist: anything but making transit free. https://t.co/Wtgqmgfvuy
RT @degendering: Hello here is your daily reminder that “TERF” is not a synonym for “transphobic person” or “transmisogynist.” 
Hillary Cli…
RT @NoTotally: Anyway this sounds a lot like "rational" centrism, which his administration also was, and the mistake of centrism as an ideo…
RT @waifujaq: me: *shopping in the men’s section @ the thrift store*
grown white men: 👁👁
RT @Kurtisaj: Playing Sims 4 University while I’m behind on my actual deg

RT @thehill: AG Bill Barr: "Immediately after President Trump won election, opponents inaugurated what they call 'The Resistance' and they…
RT @marklevinshow: Attorney General Bill Barr slams the constant efforts to undermine the president in a speech to the Federalist Society…
#NewHoaxSameSwamp https://t.co/GIg9R9Txe0
https://t.co/YvlBpfsiTa
THANK YOU! #MAGA #KAG https://t.co/Pcq7IbehVp
https://t.co/8h6ZmdGlPf https://t.co/shaijXMXli
RT @RepLeeZeldin: History is going to be so MASSIVELY unkind towards Pelosi, Schiff and the rest of Congressional Democrats who continue su…
RT @RepMarkMeadows: The Democrats second day impeachment witness, Ambassador Yovanovitch, has no information on any of the relevant questio…
RT @EliseStefanik: Obama’s own State Dept. was so concerned about conflicts of interest from Hunter Biden’s role at Burisma that they raise…
RT @EliseStefanik: The facts are clear, confirmed by our witness, Ambassador Yovanovitch: defensive lethal aid was provided to Ukraine not

Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at,source,reply_to_status,reply_to_user,retweets,favorites
0,1195582934578798593,"RT @thehill: AG Bill Barr: ""Immediately after ...",0,5602,2019-11-16 06:02:39,Twitter for iPhone,,,5602,0
1,1195582806212186112,RT @marklevinshow: Attorney General Bill Barr ...,0,6198,2019-11-16 06:02:08,Twitter for iPhone,,,6198,0
2,1195531896433848320,#NewHoaxSameSwamp https://t.co/GIg9R9Txe0,43007,17312,2019-11-16 02:39:50,Twitter Media Studio,,,17312,43007
3,1195500061721026560,https://t.co/YvlBpfsiTa,77180,24375,2019-11-16 00:33:20,Twitter for iPhone,,,24375,77180
4,1195495876631957504,THANK YOU! #MAGA #KAG https://t.co/Pcq7IbehVp,43750,12050,2019-11-16 00:16:42,Twitter for iPhone,,,12050,43750
