#### Simple Approach - Sentiment Analysis
In this notebook, we apply a simple approach to analyse the sentiment of the tweets. We use the TextBlob library, which has support for 3 languages (English, French, and German), the three most frequent languages in Switzerland.

We do a simple cleaning before using the TextBlob sentiment analyzer, such as removing http(s) links, hashtags and mentions.

In [1]:
import pandas as pd
from textblob import TextBlob
import numpy as np
import  csv
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 
import re

In [2]:
# Load the tweet table; later cells rely on its `language` and `text` columns.
tweet = pd.read_csv('tweets_with_canton_language_short.csv')

In [3]:
# Total number of rows in the loaded tweet table.
len(tweet)

12579034

In [6]:
## text cleaning

# Patterns are compiled once at definition time instead of on every call.
_URL_RE = re.compile(r'https?://\S+')   # a whole http(s) link up to the next whitespace
_HASHTAG_RE = re.compile(r'#\w*')
_MENTION_RE = re.compile(r'@\w*')


def clean_up_text(text):
    '''
    Remove http(s) links, hashtags and mentions from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Missing values (``None`` or NaN, as pandas uses for
        empty cells) are treated as an empty tweet.

    Returns
    -------
    str
        The text with every link, ``#hashtag`` and ``@mention`` deleted.
        Surrounding whitespace is left untouched.
    '''
    # NaN is the only value that is not equal to itself.
    if text is None or text != text:
        return ''
    # Links go first: they may contain '#' or '@', and stripping those pieces
    # beforehand would leave the tail of the URL behind in the text.
    text = _URL_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    return text


# English

In [7]:
# Take an explicit copy of the English rows: a `polarity` column is added to
# this frame later, and writing into a filtered slice of `tweet` would raise
# pandas' SettingWithCopyWarning.
tweet_en = tweet[tweet['language'] == 'en'].copy()

In [8]:
# Number of tweets whose `language` is 'en'.
len(tweet_en)

2985994

In [10]:
# Pull the raw English tweet texts out of the frame as a plain Python list.
text_list = tweet_en['text'].tolist()

In [14]:
# Strip links, hashtags and mentions from every English tweet.
claned_text = [clean_up_text(raw_text) for raw_text in text_list]
    

In [17]:
def get_sentiment_polarity(text):
    '''
    Score one piece of text with TextBlob.

    Parameters
    ----------
    text : str
        The (already cleaned) text to analyse.

    Returns
    -------
    tuple
        ``(polarity, subjectivity)`` as reported by TextBlob.
    '''
    # `.sentiment` yields both scores from a single analyzer run, whereas
    # reading `.polarity` and `.subjectivity` separately analyses the text twice.
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity


In [18]:
def study_sentiment(polarity_list, mid_point=0):
    '''
    Summarise a collection of polarity scores.

    The polarity score is a float within the range [-1.0, 1.0].
    The subjectivity is a float within the range [0.0, 1.0]
    where 0.0 is very objective and 1.0 is very subjective.

    Parameters
    ----------
    polarity_list : sequence of float
        One score per tweet.
    mid_point : float, optional
        Threshold separating "positive" (strictly above) from "negative"
        (strictly below). Scores equal to it count as neither.

    Returns
    -------
    tuple
        ``(mean, std, frac_positive, frac_negative)``.

    Raises
    ------
    ZeroDivisionError
        If ``polarity_list`` is empty.
    '''
    polarity_array = np.array(polarity_list)
    total = float(len(polarity_array))
    mean = np.mean(polarity_array)
    std = np.std(polarity_array)
    # Count matches in NumPy; the built-in sum() would walk the boolean mask
    # one Python object at a time, which is very slow for millions of scores.
    frac_positive = np.count_nonzero(polarity_array > mid_point) / total
    frac_negative = np.count_nonzero(polarity_array < mid_point) / total
    return mean, std, frac_positive, frac_negative

In [19]:
# Analyse every cleaned tweet exactly once and keep the results in a list.
# With lazy `map` objects, `polarity_list` and `sentiment_list` would both
# draw from the same one-shot iterator: consuming the first would leave the
# second empty.
sentiment_analysis = [get_sentiment_polarity(text) for text in claned_text]
polarity_list = [pair[0] for pair in sentiment_analysis]
# Despite its name, this holds the second element of each pair (subjectivity).
sentiment_list = [pair[1] for pair in sentiment_analysis]

In [None]:
# Collect the polarity scores into a concrete list, reporting progress
# every 100000 tweets.
po = []
i = 0
for i, polarity in enumerate(polarity_list, start=1):
    po.append(polarity)
    if i % 100000 == 0:
        print(i)

In [22]:
# Number of polarity scores collected for the English tweets.
len(po)

2985994

In [23]:
# Row count of the English subset, for comparison with the number of scores.
len(tweet_en)

2985994

In [None]:
# Attach the scores to the English tweets, one score per row in order.
tweet_en['polarity'] = po

# French

In [28]:
# Take an explicit copy of the French rows: a `polarity` column is added to
# this frame later, and writing into a filtered slice of `tweet` would raise
# pandas' SettingWithCopyWarning.
tweet_fr = tweet[tweet['language'] == 'fr'].copy()

In [29]:
# Number of tweets whose `language` is 'fr'.
len(tweet_fr)

3500272

In [30]:
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer

# Reusable French blob factory: the tagger and analyzer are built once and
# shared by every blob created through `tb(...)`.
tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
# Smoke test on a short French phrase (accented character restored from
# mis-decoded UTF-8).
blob1 = tb(u"Quelle belle matinée")
blob1.sentiment




(0.8, 0.8)

In [31]:
# Raw French tweet texts, then the same texts with links, hashtags and
# mentions removed.
text_list = tweet_fr['text'].tolist()
claned_text = [clean_up_text(raw_text) for raw_text in text_list]
    

In [None]:
# Score every cleaned French tweet, reporting progress every 100000 tweets.
score = []
i = 0
for i, text in enumerate(claned_text, start=1):
    # Element 0 of the sentiment result is stored as the tweet's polarity.
    score.append(tb(text).sentiment[0])
    if i % 100000 == 0:
        print(i)


In [None]:
# Attach the scores to the French tweets, one score per row in order.
tweet_fr['polarity'] = score

# German

In [35]:
# Take an explicit copy of the German rows: a `polarity` column is added to
# this frame later, and writing into a filtered slice of `tweet` would raise
# pandas' SettingWithCopyWarning.
tweet_de = tweet[tweet['language'] == 'de'].copy()

In [36]:
# Number of tweets whose `language` is 'de'.
len(tweet_de)

1402170

In [37]:
# Raw German tweet texts, then the same texts with links, hashtags and
# mentions removed.
text_list = tweet_de['text'].tolist()
claned_text = [clean_up_text(raw_text) for raw_text in text_list]
    

In [38]:
# NOTE(review): this rebinds the name `TextBlob` (imported from `textblob` in
# the first cell) to the German implementation. Any later call to
# `get_sentiment_polarity`, which looks up `TextBlob` at call time, would
# therefore use the German class as well.
from textblob_de import TextBlobDE as TextBlob

In [None]:
# Score every German tweet, reporting progress every 10000 tweets.
# `TextBlob` must be the German `TextBlobDE` alias imported in the cell above.
score = []
i = 0
# Iterate over the cleaned texts (not the raw `text_list`) so that links,
# hashtags and mentions are excluded, as they are for English and French.
for i, text in enumerate(claned_text, start=1):
    score.append(TextBlob(text).sentiment[0])
    if i % 10000 == 0:
        print(i)


In [40]:
# Number of polarity scores collected for the German tweets.
len(score)

1402170

In [None]:
# Attach the scores to the German tweets, one score per row in order.
tweet_de['polarity'] = score

In [46]:
# Stack the three scored language subsets row-wise: English, French, German.
tweet_fr_en_de_polarity = pd.concat(
    [tweet_en, tweet_fr, tweet_de],
    axis=0,
)

In [49]:
# Renumber the rows 0..n-1. The previous index is kept as a regular column
# (drop=False is the default), so re-running this cell adds another one.
tweet_fr_en_de_polarity = tweet_fr_en_de_polarity.reset_index(drop=False)

In [51]:
# Total number of scored tweets across the three languages.
len(tweet_fr_en_de_polarity)

7888436

In [52]:
# Persist the combined frame. `index=False` is the documented way to omit the
# row index from the file (`index=None` only worked because it is falsy).
tweet_fr_en_de_polarity.to_csv('tweet_fr_en_de_polarity.csv', index=False)