# Data Portal
<p>
    
This notebook produces and manipulates the social media dataset used in the Text Data Climate Shock project. 

In Part One, a geo-tagged social media microblog dataset is loaded, cleaned, and translated if it contains text in a language other than English. 

In Part Two, six kinds of sentiment scores for each tweet will be calculated and [combined for a weighted sum in Pandas DataFrame].
</p>

<h5>
supported input data: Twitter, [Sina Weibo]

</h5>
<h5>
supported translator: Google Translate API, Microsoft Azure Translator Text API, [offline dictionary]
</h5>
<h5>
supported sentiment measure: AFINN, TextBlob, Hedonometer, VADER, SentiWordNet, WKWSCI Sentiment Lexicon, [LIWC 2015]

</h5>
<p>in []: Under development / Pending to use
    </p>




In [None]:
# import all libraries needed
from __future__ import division
import numpy as np
import requests, uuid
import os
import json
import pandas as pd
import pymysql
import pymysql.cursors
from textblob import TextBlob
import re
import math
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
import preprocessor as p
import emoji
import glob
import seaborn as sns
from googletrans import Translator
import csv

In [None]:
#install/download if needed
#!pip install pymysql --user
#!pip3 install --user tweet-preprocessor
#!pip3 install --user emoji
#!pip3 install --user googletrans
#!pip3 install -U --user textblob
#!pip3 install --user vaderSentiment

#nltk.download('sentiwordnet')
#!python3 -m textblob.download_corpora
#!wget http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip 


In [359]:
#DEFINE GLOBAL VARIABLES

#Paths
# Directory holding the input tweet *.json files, and directory for scored CSV output.
READ_PATH = 'archive_india/'
SAVE_PATH = 'india_score/'

# Locations of the sentiment lexicon files read by the scoring functions.
AFINN_PATH = 'sentiment_lexicon/AFINN/AFINN-111.txt'
HEDONO_PATH = "sentiment_lexicon/Data_Set_S1.txt"
WKWSCI_PATH = 'sentiment_lexicon/WKWSCISenti.tab'
#AZURE VARIABLE
# The subscription key is a secret and must not be stored in the notebook.
# Export AZURE_KEY in the environment before starting the kernel; when it is
# missing the key is an empty string and Azure requests will be rejected.
AZURE_KEY = os.environ.get('AZURE_KEY', '')


# Name of the DataFrame column whose text the sentiment scores are computed on
TEXT_FIELD = 'en_text'

# Lookup table of (two-letter language code, English language name) pairs.
# It is turned into the LANG_CODES dict below, which MapLang uses for lookups.
iso_639_choices = [('ab', 'Abkhaz'),('aa', 'Afar'),('af', 'Afrikaans'),('ak', 'Akan'),('sq', 'Albanian'),
('am', 'Amharic'),('ar', 'Arabic'),('an', 'Aragonese'),('hy', 'Armenian'),('as', 'Assamese'),('av', 'Avaric'),
('ae', 'Avestan'),('ay', 'Aymara'),('az', 'Azerbaijani'),('bm', 'Bambara'),('ba', 'Bashkir'),('eu', 'Basque'),
('be', 'Belarusian'),('bn', 'Bengali'),('bh', 'Bihari'),('bi', 'Bislama'),('bs', 'Bosnian'),('br', 'Breton'),('bg', 'Bulgarian'),
('my', 'Burmese'),('ca', 'Catalan; Valencian'),('ch', 'Chamorro'),('ce', 'Chechen'),('ny', 'Chichewa; Chewa; Nyanja'),
('zh', 'Chinese'),('cv', 'Chuvash'),('kw', 'Cornish'),('co', 'Corsican'),('cr', 'Cree'),('hr', 'Croatian'),('cs', 'Czech'),
('da', 'Danish'),('dv', 'Divehi; Maldivian;'),('nl', 'Dutch'),('dz', 'Dzongkha'),('en', 'English'),
('eo', 'Esperanto'),('et', 'Estonian'),('ee', 'Ewe'),('fo', 'Faroese'),('fj', 'Fijian'),('fi', 'Finnish'),
('fr', 'French'),('ff', 'Fula'),('gl', 'Galician'),('ka', 'Georgian'),('de', 'German'),
('el', 'Greek, Modern'),('gn', 'Guaraní'),('gu', 'Gujarati'),('ht', 'Haitian'),('ha', 'Hausa'),
('he', 'Hebrew (modern)'),('hz', 'Herero'),('hi', 'Hindi'),('ho', 'Hiri Motu'),('hu', 'Hungarian'),
('ia', 'Interlingua'),('id', 'Indonesian'),('ie', 'Interlingue'),('ga', 'Irish'),('ig', 'Igbo'),
('ik', 'Inupiaq'),('io', 'Ido'),('is', 'Icelandic'),('it', 'Italian'),('iu', 'Inuktitut'),
('ja', 'Japanese'),('jv', 'Javanese'),('kl', 'Kalaallisut'),('kn', 'Kannada'),
('kr', 'Kanuri'),('ks', 'Kashmiri'),('kk', 'Kazakh'),('km', 'Khmer'),('ki', 'Kikuyu, Gikuyu'),
('rw', 'Kinyarwanda'),('ky', 'Kirghiz, Kyrgyz'),('kv', 'Komi'),('kg', 'Kongo'),('ko', 'Korean'),
('ku', 'Kurdish'),('kj', 'Kwanyama, Kuanyama'),('la', 'Latin'),('lb', 'Luxembourgish'),('lg', 'Luganda'),
('li', 'Limburgish'),('ln', 'Lingala'),('lo', 'Lao'),('lt', 'Lithuanian'),('lu', 'Luba-Katanga'),
('lv', 'Latvian'),('gv', 'Manx'),('mk', 'Macedonian'),('mg', 'Malagasy'),('ms', 'Malay'),
('ml', 'Malayalam'),('mt', 'Maltese'),('mi', 'Māori'),('mr', 'Marathi (Marāṭhī)'),('mh', 'Marshallese'),
('mn', 'Mongolian'),('na', 'Nauru'),('nv', 'Navajo, Navaho'),('nb', 'Norwegian Bokmål'),('nd', 'North Ndebele'),
('ne', 'Nepali'),('ng', 'Ndonga'),('nn', 'Norwegian Nynorsk'),('no', 'Norwegian'),('ii', 'Nuosu'),
('nr', 'South Ndebele'),('oc', 'Occitan'),('oj', 'Ojibwe, Ojibwa'),('cu', 'Old Church Slavonic'),('om', 'Oromo'),
('or', 'Oriya'),('os', 'Ossetian, Ossetic'),('pa', 'Panjabi, Punjabi'),('pi', 'Pāli'),('fa', 'Persian'),
('pl', 'Polish'),('ps', 'Pashto, Pushto'),('pt', 'Portuguese'),('qu', 'Quechua'),('rm', 'Romansh'),
('rn', 'Kirundi'),('ro', 'Romanian, Moldavan'),('ru', 'Russian'),('sa', 'Sanskrit (Saṁskṛta)'),('sc', 'Sardinian'),
('sd', 'Sindhi'),('se', 'Northern Sami'),('sm', 'Samoan'),('sg', 'Sango'),('sr', 'Serbian'),
('gd', 'Scottish Gaelic'),('sn', 'Shona'),('si', 'Sinhala, Sinhalese'),('sk', 'Slovak'),('sl', 'Slovene'),
('so', 'Somali'),('st', 'Southern Sotho'),('es', 'Spanish; Castilian'),('su', 'Sundanese'),('sw', 'Swahili'),('ss', 'Swati'),
('sv', 'Swedish'),('ta', 'Tamil'),('te', 'Telugu'),('tg', 'Tajik'),('th', 'Thai'),('ti', 'Tigrinya'),
('bo', 'Tibetan'),('tk', 'Turkmen'),('tl', 'Tagalog'),('tn', 'Tswana'),('to', 'Tonga'),('tr', 'Turkish'),('ts', 'Tsonga'),
('tt', 'Tatar'),('tw', 'Twi'),('ty', 'Tahitian'),('ug', 'Uighur, Uyghur'),('uk', 'Ukrainian'),('ur', 'Urdu'),
('uz', 'Uzbek'),('ve', 'Venda'),('vi', 'Vietnamese'),('vo', 'Volapük'),('wa', 'Walloon'),('cy', 'Welsh'),('wo', 'Wolof'),
('fy', 'Western Frisian'),('xh', 'Xhosa'),('yi', 'Yiddish'),('yo', 'Yoruba'),('za', 'Zhuang, Chuang'),('zu', 'Zulu'),]
# code -> English name mapping built from the pairs above
LANG_CODES = dict(iso_639_choices)
# extra three-letter entry for text whose language is undetermined
# NOTE(review): presumably matches an 'und' value in the fastText_lang column - confirm
LANG_CODES['und'] = 'undefined'

    


# Weather vocabulary: one long space-separated string. The trailing backslashes
# join the physical lines, so the literal contains no newline characters.
str_weather_terms = '''aerovane air airstream altocumulus altostratus anemometer anemometers anticyclone anticyclones \
arctic arid aridity atmosphere atmospheric autumn autumnal balmy baroclinic barometer barometers \
barometric blizzard blizzards blustering blustery blustery breeze breezes breezy brisk calm \
celsius chill chilled chillier chilliest chilly chinook cirrocumulus cirrostratus cirrus climate climates \
cloud cloudburst cloudbursts cloudier cloudiest clouds cloudy cold colder coldest condensation \
contrail contrails cool cooled cooling cools cumulonimbus cumulus cyclone cyclones damp damp \
damper damper dampest dampest degree degrees deluge dew dews dewy doppler downburst \
downbursts downdraft downdrafts downpour downpours dried drier dries driest drizzle drizzled \
drizzles drizzly drought droughts dry dryline fall farenheit flood flooded flooding floods flurries \
flurry fog fogbow fogbows fogged fogging foggy fogs forecast forecasted forecasting forecasts freeze \
freezes freezing frigid frost frostier frostiest frosts frosty froze frozen gale gales galoshes gust \
gusting gusts gusty haboob haboobs hail hailed hailing hails haze hazes hazy heat heated heating \
heats hoarfrost hot hotter hottest humid humidity hurricane hurricanes ice iced ices icing icy \
inclement landspout landspouts lightning lightnings macroburst macrobursts maelstrom mercury \
meteorologic meteorologist meteorologists meteorology microburst microbursts microclimate \
microclimates millibar millibars mist misted mists misty moist moisture monsoon monsoons \
mugginess muggy nexrad nippy NOAA nor’easter nor’easters noreaster noreasters overcast ozone \
parched parching pollen precipitate precipitated precipitates precipitating precipitation psychrometer \
radar rain rainboots rainbow rainbows raincoat raincoats rained rainfall rainier rainiest \
raining rains rainy sandstorm sandstorms scorcher scorching searing shower showering showers \
skiff sleet slicker slickers slush slushy smog smoggier smoggiest smoggy snow snowed snowier \
snowiest snowing snowmageddon snowpocalypse snows snowy spring sprinkle sprinkles sprinkling \
squall squalls squally storm stormed stormier stormiest storming storms stormy stratocumulus \
stratus subtropical summer summery sun sunnier sunniest sunny temperate temperature tempest \
thaw thawed thawing thaws thermometer thunder thundered thundering thunders thunderstorm \
thunderstorms tornadic tornado tornadoes tropical troposphere tsunami turbulent twister twisters \
typhoon typhoons umbrella umbrellas vane warm warmed warming warms warmth waterspout \
waterspouts weather wet wetter wettest wind windchill windchills windier windiest windspeed \
windy winter wintery wintry'''
# Ordered list of terms (repeated words in the string are kept here) ...
LST_WEATHER_TERMS = str_weather_terms.split(' ')
# ... and a term -> 1 mapping for constant-time membership tests.
DICT_WEATHER_TERMS = dict.fromkeys(LST_WEATHER_TERMS, 1)




# Part One: Raw to English Text 

## Social Media Pull
### Sina Weibo

In [6]:
#get database connection
def getConnection():
    '''
    Open a new pymysql connection to the Weibo MySQL database.

    Connection settings come from environment variables so that credentials
    are not stored in the notebook:
        WEIBO_DB_HOST      (default '10.105.131.16')
        WEIBO_DB_USER      (default 'zliu')
        WEIBO_DB_NAME      (default 'bj_weibo_shaohu')
        WEIBO_DB_PASSWORD  (required, no default)

    Return: a pymysql connection whose cursors return rows as dicts
            (DictCursor). The caller is responsible for closing it.
    Raises: RuntimeError when WEIBO_DB_PASSWORD is not set.
    '''
    password = os.environ.get('WEIBO_DB_PASSWORD')
    if password is None:
        raise RuntimeError(
            'WEIBO_DB_PASSWORD is not set; export it before connecting to the Weibo database')

    # Connect to the database
    connection = pymysql.connect(host=os.environ.get('WEIBO_DB_HOST', '10.105.131.16'),
                                 user=os.environ.get('WEIBO_DB_USER', 'zliu'),
                                 password=password,
                                 db=os.environ.get('WEIBO_DB_NAME', 'bj_weibo_shaohu'),
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    return connection

In [7]:
def pullWeiboSamples(limit=10):
    '''
    Fetch a few sample rows from the Beijing Weibo table, print them and
    return them.

    limit: maximum number of rows to read (default 10, the value that was
           previously fixed in the query).
    Return: the rows returned by cursor.fetchall(); each row holds the
            text, created_at_int, latitude and longitude columns.
    '''
    connection = getConnection()
    try:

        with connection.cursor() as cursor:
            # The limit is passed as a query parameter instead of being
            # formatted into the SQL string.
            sql = ("SELECT `text`,`created_at_int`,`latitude`,`longitude` "
                   "FROM `travel_poi_users_weibodata_beijing` LIMIT %s;")
            cursor.execute(sql, (int(limit),))
            result = cursor.fetchall()
            print(result)
    finally:
        # always release the connection, even when the query fails
        connection.close()
    return result
#demo run to pull some records
#pullWeiboSamples()

### Twitter

Alternative data source: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5QCCUU&version=1.1

**Extract Tweets if needed**

In [5]:
# import tarfile
# tar = tarfile.open("western_europe_2012.tar.gz")
# tar.extractall()
# tar.close()

**Text cleaning**

In [8]:
def keepemoji_clean(text):
    '''
    Clean a tweet while keeping its emoji as text.

    The emoji are first rewritten by emoji.demojize, then the result is
    passed through tweet-preprocessor's clean().
    Return: the cleaned string.
    '''
    demojized = emoji.demojize(text)
    cleaned = p.clean(demojized)
    return cleaned


**Basic Stat**

First, count the characters that need to be translated.

In [None]:
def MapLang(code):
    '''
    Look up the English language name for a language code in LANG_CODES.
    Return: the language name, or 'Unknown' when the code is not listed.
    '''
    return LANG_CODES.get(code, 'Unknown')

def count_clean_char(f):
    '''
    Count the characters of cleaned non-English text in one tweet file.

    f: path of a JSON file (records orientation) with at least the columns
       'fastText_lang' and 'text'.
    Return: A tuple (number of characters to translate, number of records,
            the 'fastText_lang' column). For an empty file the tuple is
            (0, 0, None).
    '''
    df = pd.read_json(f, orient='records')
    if df.size == 0:
        print('empty: {}'.format(f))
        return 0,0,None
    # Work on the selected text directly rather than adding a column to a
    # filtered slice of df, which triggers pandas' SettingWithCopyWarning.
    foreign_text = df.loc[df['fastText_lang'] != 'en', 'text']
    clean_text = foreign_text.map(keepemoji_clean)
    # str.len() counts every character; the previous str.count('.') used a
    # regular expression that does not match newline characters.
    char_count = clean_text.str.len()
    return char_count.sum(),df.shape[0],df['fastText_lang']

def calculateStat(count_path):
    '''
    Summarise the tweet files under count_path.

    Prints the average number of characters to translate per file (over
    files that contain any), the total number of such characters and the
    total number of records, and draws a count plot of detected languages.

    count_path: directory prefix; every file matching count_path + '*.json'
                is read.
    Return: DataFrame indexed by language code for the 20 most frequent
            languages, with columns 'fastText_lang' (record count),
            'lang_code', 'lang_name' and 'portion' (share of all records).
            The frame is empty when no records were found.
    '''
    tweetfiles = glob.glob(count_path+'*.json')
    total_char = 0.0
    total_file = 0
    total_records = 0
    lang_col=[]
    for f in tweetfiles:
        m_count_char,m_records,m_lang_col = count_clean_char(f)
        total_char += m_count_char
        total_file += 1 if m_count_char > 0 else 0
        total_records += m_records
        if m_lang_col is not None:
            lang_col.append(m_lang_col)

    if not lang_col:
        # no files, or only empty files: nothing to concatenate or plot
        print('no records found under: {}'.format(count_path))
        return pd.DataFrame(columns=['fastText_lang','lang_code','lang_name','portion'])

    # computed once after the loop and guarded, so files without any foreign
    # text can no longer cause a division by zero
    avg_char = total_char/total_file if total_file else 0.0
    lang_all = pd.concat(lang_col, ignore_index=True)
    print(avg_char)
    print(total_char)
    print(total_records)
    sns.countplot(x=lang_all)

    top_counts = lang_all.value_counts()[:20]
    # Name the count column explicitly: the name pandas gives the result of
    # value_counts() differs between pandas versions.
    count_table = pd.DataFrame({'fastText_lang': top_counts.values},
                               index=top_counts.index).rename_axis(None)
    count_table['lang_code'] = count_table.index
    count_table['lang_name'] = count_table['lang_code'].map(MapLang)
    count_table['portion'] = count_table['fastText_lang']/total_records
    return count_table

### Azure Translator

In [251]:
# Module-level settings read by translate_to_en when it calls the Azure Translator API.
subscription_key = AZURE_KEY
# NOTE(review): endpoint ends with '/' and path starts with '/', so the URL built in
# translate_to_en contains '//translate' - confirm the service accepts this.
endpoint = 'https://api-nam.cognitive.microsofttranslator.com/'
path = '/translate?api-version=3.0'
def translate_to_en(ori_text):
    '''
    Translate origin foreign text to English
    Return: A tuple (translated text, detected origin language, score for language detection).
            ('', '', 0.0) is returned for empty input or an empty result.
    Raises: requests.HTTPError when the service answers with an error status
            (e.g. invalid key or exhausted quota); requests.Timeout when it
            does not answer within 30 seconds.
    '''
    if len(ori_text) == 0:
        return ('','',0.0)
    params = '&to=en'
    constructed_url = endpoint + path + params
    headers = {
    'Ocp-Apim-Subscription-Key': subscription_key,
    'Ocp-Apim-Subscription-Region':'eastus2',
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
    }
    body = [{
    'text': ori_text
    }]
    # a timeout keeps one stalled request from blocking the whole batch
    response = requests.post(constructed_url, headers=headers, json=body, timeout=30)
    # Fail with a clear HTTP error: the response body of a failed call is
    # not a list, so indexing it with [0] used to raise a KeyError.
    response.raise_for_status()
    payload = response.json()
    if not isinstance(payload, list) or not payload or not payload[0]:
        return ('','',0.0)
    first = payload[0]
    translations = first.get('translations') or [{}]
    detected = first.get('detectedLanguage') or {}
    return (translations[0].get('text', ''),
            detected.get('language', ''),
            detected.get('score', 0.0))
#unit test
#translate_to_en('你好世界！')

def translate_df_on_lang(x):
    '''
    Attach translated text, detected language, and confidence score to one
    row of the original dataframe.

    x: a row (mapping) with the keys 'fastText_lang' and 'clean_text'.
    Return: the same row with 'en_text', 'detect_lang', 'google_trans' and
            'lang_score' filled in. English rows are copied through without
            calling any translation service.
    '''
    if x['fastText_lang'] == 'en':
        x['en_text'] = x['clean_text']
        x['detect_lang'] = 'en'
        x['google_trans'] = ''
        x['lang_score'] = 1.0

    else:
        res = translate_to_en(x['clean_text'])
        x['en_text'] = res[0]
        try:
            x['google_trans'] = googleTrans(x['clean_text'])
        except Exception as err:
            # The Google translation is best-effort, so keep going, but do
            # not swallow KeyboardInterrupt/SystemExit and do report the cause.
            print('Error google trans: {}'.format(x['clean_text']))
            print('  caused by: {!r}'.format(err))
            x['google_trans'] = ''
        x['detect_lang'] = res[1]
        x['lang_score'] = res[2]
    return x

#caution: run this function will use api budget of Azure!!
def run_translation(df):
    '''
    Translate every row of df by applying translate_df_on_lang row by row.
    Return: the DataFrame produced by the row-wise apply.
    '''
    translated = df.apply(translate_df_on_lang, axis=1)
    return translated

### Google Translator

In [9]:
# Shared googletrans client, created once and reused by googleTrans
TRANSLATOR = Translator()

def googleTrans(text):
    '''
    Translate text with the shared googletrans TRANSLATOR using its default settings.
    Return: the translated text.
    '''
    return TRANSLATOR.translate(text).text
#unit test
# googleTrans('Gute Morgen')

NameError: name 'Translator' is not defined

## Weather term identification

Flag tweets that contain a weather-related term.

In [4]:
def CheckWeatherTerm(text):
    '''
    Return 1 or 0 for whether input contains any weather term

    The text is tokenized with nltk.word_tokenize and each token is looked
    up in DICT_WEATHER_TERMS, both as written and lower-cased, so that
    capitalised words such as 'Rain' match the lower-case vocabulary while
    upper-case entries such as 'NOAA' still match.
    '''
    for w in nltk.word_tokenize(text):
        if w in DICT_WEATHER_TERMS or w.lower() in DICT_WEATHER_TERMS:
            # the per-match debug print was removed: it flooded the output
            # when this function is mapped over a whole DataFrame
            return 1
    return 0

#unit test
#CheckWeatherTerm('A good weather!')

# Part Two: Compute Sentiment Score

### WKWSCI sentiment

In [10]:
# WordNet lemmatizer shared by the polarity functions
lemmatizer = WordNetLemmatizer()
def loadWkwsciDict():
    '''
    Read the tab-separated WKWSCI lexicon at WKWSCI_PATH.
    Return: dict mapping (term, POS) tuples to the value in the 'sentiment' column.
    '''
    lexicon = pd.read_csv(WKWSCI_PATH, sep='\t')
    indexed = lexicon.set_index(['term','POS'])
    return indexed['sentiment'].to_dict()
# lexicon loaded once when the cell runs
wkwsci_dict = loadWkwsciDict()

def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, N, R and V map to wn.ADJ, wn.NOUN, wn.ADV and
    wn.VERB respectively; any other tag gives None.
    """
    prefix_to_wn = (
        ('J', wn.ADJ),
        ('N', wn.NOUN),
        ('R', wn.ADV),
        ('V', wn.VERB),
    )
    for prefix, wn_pos in prefix_to_wn:
        if tag.startswith(prefix):
            return wn_pos
    return None

def WKWSCI_term_sentiment(wkwsci_dict,word,tag):
    '''
    Look up the WKWSCI sentiment of one word.

    wkwsci_dict: dict keyed by (term, POS) tuples, as built by loadWkwsciDict.
    word: the term to look up.
    tag: a WordNet POS constant (wn.ADJ, wn.NOUN, wn.VERB or wn.ADV).
    Return: the lexicon value for (word, POS), or 0 when the pair is not in
            the lexicon or the tag is not one of the four supported ones.
    '''
    # WordNet POS constant -> POS label used in the lexicon keys
    pos_by_tag = {wn.ADJ: 'adj', wn.NOUN: 'n', wn.VERB: 'v', wn.ADV: 'adv'}
    pos = pos_by_tag.get(tag)
    if pos is None:
        # previously an unsupported tag left `pos` unassigned and the lookup
        # below raised UnboundLocalError
        return 0
    return wkwsci_dict.get((word, pos), 0)
    

def WKWSCI_polarity(text,wkwsci_dict):
    """
    Average WKWSCI sentiment of the scorable words in text.

    Only nouns, adjectives and adverbs are considered. Each one is
    lemmatized, its first WordNet synset is taken, and the head word of that
    synset's name is looked up in wkwsci_dict. Words with a synset but no
    lexicon entry count with a score of 0.
    Return: sum of scores divided by the number of counted words, or 0 when
            no word was counted.
    """
    scored_pos = (wn.NOUN, wn.ADJ, wn.ADV)
    total = 0.0
    n_scored = 0

    for sentence in sent_tokenize(text):
        for token, penn_tag in pos_tag(word_tokenize(sentence)):
            wn_tag = penn_to_wn(penn_tag)
            if wn_tag not in scored_pos:
                continue

            lemma = lemmatizer.lemmatize(token, pos=wn_tag)
            candidates = wn.synsets(lemma, pos=wn_tag) if lemma else None
            if not candidates:
                continue

            # synset names look like 'word.pos.nn'; keep the word part
            head_word = candidates[0].name().split('.')[0]
            total += WKWSCI_term_sentiment(wkwsci_dict, head_word, wn_tag)
            n_scored += 1

    # nothing scorable in the text: neutral
    if not n_scored:
        return 0

    return 1.0*total/n_scored

#unittest
#WKWSCI_polarity('It is sad',wkwsci_dict)

def add_wkwsci(df):
    '''
    Add a 'wkwsci' column holding the WKWSCI polarity of df[TEXT_FIELD].
    The lexicon is loaded again on every call.
    Return: the same DataFrame, modified in place.
    '''
    lexicon = loadWkwsciDict()

    def score(text):
        return WKWSCI_polarity(text, lexicon)

    df['wkwsci'] = df[TEXT_FIELD].map(score)
    return df

### SentiWordNet

In [28]:
# NOTE(review): `lemmatizer` and `penn_to_wn` are also defined in the WKWSCI
# cell; these copies let this cell run on its own.
lemmatizer = WordNetLemmatizer()
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, N, R and V map to wn.ADJ, wn.NOUN, wn.ADV and
    wn.VERB respectively; any other tag gives None.
    """
    prefix_to_wn = (
        ('J', wn.ADJ),
        ('N', wn.NOUN),
        ('R', wn.ADV),
        ('V', wn.VERB),
    )
    for prefix, wn_pos in prefix_to_wn:
        if tag.startswith(prefix):
            return wn_pos
    return None

def swn_polarity(text):
    """
    SentiWordNet polarity of text.

    Only nouns, adjectives and adverbs are considered. Each one is
    lemmatized and its first WordNet synset (the most common sense) is
    scored as positive score minus negative score.
    Return: the sum of those scores (not divided by the word count), or 0
            when no word could be scored. Values above 0 indicate positive
            sentiment, values below 0 negative sentiment.
    """
    scored_pos = (wn.NOUN, wn.ADJ, wn.ADV)
    total = 0.0
    n_scored = 0

    for sentence in sent_tokenize(text):
        for token, penn_tag in pos_tag(word_tokenize(sentence)):
            wn_tag = penn_to_wn(penn_tag)
            if wn_tag not in scored_pos:
                continue

            lemma = lemmatizer.lemmatize(token, pos=wn_tag)
            candidates = wn.synsets(lemma, pos=wn_tag) if lemma else None
            if not candidates:
                continue

            # first sense only
            senti = swn.senti_synset(candidates[0].name())
            total += senti.pos_score() - senti.neg_score()
            n_scored += 1

    # nothing scorable in the text: neutral
    if not n_scored:
        return 0

    return total
#unit test
#swn_polarity('Nice job!') # 1 1

def add_swn(df):
    '''
    Add a 'swn' column holding the SentiWordNet polarity of df[TEXT_FIELD].
    Return: the same DataFrame, modified in place.
    '''
    df['swn'] = df[TEXT_FIELD].map(swn_polarity)
    return df

### TextBlob

In [331]:
def add_textblob(df):
    '''
    Add a 'textblob' column holding TextBlob's sentiment polarity of df[TEXT_FIELD].
    Return: the same DataFrame, modified in place.
    '''
    def polarity(text):
        return TextBlob(text).sentiment.polarity

    df['textblob'] = df[TEXT_FIELD].map(polarity)
    return df

### AFINN (Nielsen 2011)

In [330]:
def afinn_sentiment(text,afinn,pattern_split):
    """
    Returns a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative valence.

    text: the text to score; it is lower-cased before splitting.
    afinn: dict mapping a word to its integer valence.
    pattern_split: compiled regex used to split the text into words.
    The summed word valences are divided by sqrt(N), where N is the number
    of words. Text without any word scores 0.0.
    """
    # Splitting on \W+ yields empty strings at the ends of text that starts
    # or ends with punctuation; drop them so they are not counted as words.
    words = [w for w in pattern_split.split(text.lower()) if w]
    if not words:
        return 0.0
    total = sum(afinn.get(w, 0) for w in words)
    # How should you weight the individual word sentiments?
    # You could do N, sqrt(N) or 1 for example. Here sqrt(N) is used.
    return float(total)/math.sqrt(len(words))

def add_afinn(df):
    '''
    Add an 'afinn' column holding the AFINN sentiment of df[TEXT_FIELD].

    The lexicon at AFINN_PATH is read as tab-separated "word<TAB>score"
    lines; blank lines are skipped.
    Return: the same DataFrame, modified in place.
    '''
    afinn = {}
    # the with-block closes the lexicon file; it used to be left open
    with open(AFINN_PATH, encoding='utf-8') as lexicon_file:
        for line in lexicon_file:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            afinn[fields[0]] = int(fields[1])
    pattern_split = re.compile(r"\W+")
    df['afinn'] = df[TEXT_FIELD].map(lambda x:afinn_sentiment(x,afinn,pattern_split))
    return df

### Hedonometer (Dodds and Danforth 2010)

In [329]:

def load_scores(filename):
    """Takes a file from the Dodd research paper and returns a dict of
    wordscores. Note this function is tailored to the file provided
    by the Dodd paper. For other sets of word scores, a dict can be
    passed directly to HMeter.

    The file is read as tab-separated text. The first four lines are
    skipped as header; for every following row the first column is the word
    and the third column its score. Rows with fewer than three columns
    (for example blank lines) are ignored."""

    # the with-block closes the file; it used to be left open
    with open(filename, "r", newline='') as handle:
        doddfile = csv.reader(handle, delimiter='\t')
        for _ in range(4):  # strip header info
            next(doddfile, None)

        return {row[0]: float(row[2]) for row in doddfile if len(row) > 2}

class HMeter(object):
    """HMeter is the main class to prepare a text sample for scores. It
    expects a list of individual words, such as those provided by
    nltk.word_tokenize, as wordlist. It expects a dict of words as k and
    floating point wordscores as v for wordscores. deltah allows us to
    filter out the most neutral words as stop words."""

    def __init__(self, wordlist, wordscores, deltah=0.0):
        self.wordlist = wordlist
        self.wordscores = wordscores
        # assigned last: the deltah setter builds self.matchlist from the
        # two attributes above
        self.deltah = deltah

    _deltah = None
    @property
    def deltah(self):
        """Deltah determines stop words. The higher deltah the more neutral
        words are discarded from the matchlist."""
        return self._deltah

    @deltah.setter
    def deltah(self, deltah):
        """Each time deltah is set we need to regenerate the matchlist."""
        self._deltah = deltah
        # TODO Should probably raise a range error if deltah is nonsensical
        lower = 5.0 - deltah
        upper = 5.0 + deltah
        # keep every word that has a score and lies outside the neutral
        # band (lower, upper) around 5.0, as described by the Dodd paper
        self.matchlist = [
            word for word in self.wordlist
            if word in self.wordscores
            and (self.wordscores[word] >= upper or self.wordscores[word] <= lower)
        ]

    def fractional_abundance(self, word):
        """Takes a word and return its fractional abundance within
        self.matchlist. Returns 0.0 when the matchlist is empty."""
        if not self.matchlist:
            return 0.0
        return self.matchlist.count(word) / len(self.matchlist)

    def word_shift(self, comp):
        """Produces data necessary to create a word shift graph. Returns a list
        of tuples that contain each word's contribution to happiness score shift
        between two samples. So for example, assigned to a variable 'output_data'
        output_data[n] represents the data for one word where:

        output_data[n][0] the word
        output_data[n][1] the proportional contribution the word gives to overall
                          word shift
        output_data[n][2] The relative abundance of word between the two samples
        output_data[n][3] The word's happiness relative to the reference sample

        comp is the word list of the comparison sample; self is the reference.
        An empty list is returned when either sample has no scored words.
        When both samples have the same happiness score the proportional
        contribution of every word is reported as 0.0.

        Using this data, we can construct word shift graphs as described here:
        http://www.hedonometer.org/shifts.html"""

        # create our comparison object. self is the reference object.
        # It must share this object's wordscores: the scores argument was
        # previously omitted, so deltah was passed in its place.
        tcomp = HMeter(comp, self.wordscores, self.deltah)

        ref_happiness_score = self.happiness_score()
        comp_happiness_score = tcomp.happiness_score()
        if ref_happiness_score is None or comp_happiness_score is None:
            return []
        happy_diff = comp_happiness_score - ref_happiness_score

        # we want a list of all potential words, but only need each word once.
        word_shift_list = set(tcomp.matchlist + self.matchlist)

        output_data = []
        for word in word_shift_list:
            abundance = (tcomp.fractional_abundance(word) -
                         self.fractional_abundance(word))
            happiness_shift = self.wordscores[word] - ref_happiness_score
            if happy_diff:
                paper_score = (happiness_shift * abundance * 100) / happy_diff
            else:
                paper_score = 0.0
            output_data.append((word, paper_score, abundance, happiness_shift))

        # sort words by absolute value of individual word shift
        output_data.sort(key=lambda word: abs(word[1]))
        return output_data

    def happiness_score(self):
        """Returns the mean score of the words in self.matchlist, or None
        when the matchlist is empty."""
        count = len(self.matchlist)
        if count == 0:  # empty lists have no score
            return None

        happysum = 0
        for word in self.matchlist:
            happysum += self.wordscores[word]
        return happysum / count
        
def hmeter_sentiment(text,pattern_split,scores):
    """
    Hedonometer happiness score of text.

    The lower-cased text is split into words with pattern_split and handed
    to HMeter together with the word scores.
    Return: whatever HMeter.happiness_score() returns for those words.
    """
    tokens = pattern_split.split(text.lower())
    return HMeter(tokens, scores).happiness_score()
# unittest
# hmeter_sentiment('VADER is not smart, handsome, nor funny')

def add_hedono(df):
    '''
    Add a 'hedono' column holding the Hedonometer score of df[TEXT_FIELD].
    The word scores are loaded from HEDONO_PATH on every call.
    Return: the same DataFrame, modified in place.
    '''
    word_scores = load_scores(HEDONO_PATH)
    splitter = re.compile(r"\W+")

    def score(text):
        return hmeter_sentiment(text, splitter, word_scores)

    df['hedono'] = df[TEXT_FIELD].map(score)
    return df

### VADER (Hutto and Gilbert 2014)

In [328]:
def add_vader(df):
    '''
    Add a 'vader' column holding VADER's 'compound' score of df[TEXT_FIELD].
    Return: the same DataFrame, modified in place.
    '''
    analyzer = SentimentIntensityAnalyzer()

    def compound(text):
        return analyzer.polarity_scores(text)['compound']

    df['vader'] = df[TEXT_FIELD].map(compound)
    return df
#unittest
# vs = analyzer.polarity_scores("VADER is VERY SMART, handsome, and FUNNY.")
# vs

### LIWC(Pennebaker et al. 2015)

Commercial software. To be included

### Combination

In [1]:
def add_polarity_weighted(df,weights = 1):
    '''
    Placeholder for the weighted combination of the sentiment scores.
    Not implemented yet: `weights` is ignored and df is returned unchanged.
    '''
    return df

In [2]:
def add_all_sentiment(df):
    '''
    Calculate all sentiment scores for field TEXT_FIELD.

    The scorers run in a fixed order: AFINN, TextBlob, Hedonometer, VADER,
    SentiWordNet, WKWSCI, then the weighted combination.
    Return: the DataFrame returned by the last step.
    '''
    steps = (
        add_afinn,
        add_textblob,
        add_hedono,
        add_vader,
        add_swn,
        add_wkwsci,
        add_polarity_weighted,
    )
    for step in steps:
        df = step(df)
    return df

# Execute All

In [None]:
def process_all_by_file(f, out_dir=None):
    '''
    Run the whole pipeline on one tweet file: clean, translate, flag weather
    terms, compute the sentiment scores and save the result as CSV.

    f: path of a JSON file (records orientation) with the columns
       'fastText_lang', 'id', 'text', 'tweet_created_at' and 'geo'.
    out_dir: directory for the CSV. When omitted, the notebook-level
             variable `savepath` is used if it exists, otherwise SAVE_PATH.
    Return: number of characters of cleaned non-English text in the file,
            or -1 when the file holds no records.
    The CSV is named after the input file with '.csv' appended.
    '''
    df = pd.read_json(f, orient='records')
    # basename also handles the path separators glob returns on Windows
    fn = os.path.basename(f)
    if df.size == 0:
        print('empty: {}'.format(f))
        return -1
    if out_dir is None:
        # `savepath` is assigned in a later driver cell; fall back to
        # SAVE_PATH so the function also works before that cell has run
        out_dir = globals().get('savepath', SAVE_PATH)

    coordinates = df['geo'].str['coordinates']
    df['lat'] = coordinates.str[0]
    df['lng'] = coordinates.str[1]

    # copy() so the column added below is written to an independent frame
    # and not to a slice of the original (SettingWithCopyWarning)
    df = df[['fastText_lang','id','text','tweet_created_at','lat','lng']].copy()
    df['clean_text'] = df['text'].map(keepemoji_clean)
    foreign_clean = df.loc[df['fastText_lang']!='en', 'clean_text']
    # str.len() counts every character; the previous str.count('.') used a
    # regular expression that does not match newline characters
    char_count = foreign_clean.str.len()
    df = run_translation(df)
    text_col = 'en_text' if 'en_text' in df else 'clean_text'
    df['weather_term'] = df[text_col].map(CheckWeatherTerm)
    df = add_all_sentiment(df)
    df = df.drop(columns=['text','clean_text'])

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, fn+'.csv')
    print(out_file)
    df.to_csv(out_file,index=False)
    return char_count.sum()


In [None]:
# calculate statistics

# NOTE(review): this silences every warning for the rest of the session,
# not only those raised by the statistics below.
import warnings
warnings.filterwarnings('ignore')


# directory prefix of the tweet files to summarise
count_path = READ_PATH #india tweets
# prints the totals, draws the language count plot and returns the summary table
calculateStat(count_path)

In [None]:
# Process every tweet file under READ_PATH and keep a running total of the
# characters that were sent for translation.
filepath = READ_PATH #india tweets
savepath = SAVE_PATH

tweetfiles = glob.glob(filepath+'*.json')

used_char_count = 0
file_processed = 0
# the loop header was left as an unfinished slice ("tweetfiles[:"), which is
# a syntax error; iterate over all files
for f in tweetfiles:
    ret = process_all_by_file(f)
    # a negative return value marks an empty file, which is not counted
    if ret >= 0:
        used_char_count += ret
        file_processed += 1
    print(used_char_count)
    