In [None]:
import numpy as np
import pandas as pd
import pyspark as ps
import matplotlib.pyplot as plt
import csv
import seaborn as sns
import unicodedata

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pickle

In [None]:
def get_time_distribution(times):
    """Return the share of entries falling into each hour of the day.

    times: pandas Series whose elements expose an ``hour`` attribute

    The result is a Series indexed by hour (ascending). Each value is the
    number of entries with that hour divided by ``times.count()``, i.e. the
    number of non-null entries.
    """
    hour_of_day = times.map(lambda stamp: stamp.hour)
    hourly_share = hour_of_day.value_counts() / times.count()
    # value_counts orders by frequency; return the hours in ascending order.
    return hourly_share.sort_index()

In [None]:
def plot_daily(depressed):
    """Plot the hourly publishing distribution for each sentiment class.

    depressed: dataframe with a 'sentiment' column (the values 'NEGATIVE',
        'NEUTRAL' and 'POSITIVE' are used) and a 'published' column

    One line per sentiment is drawn with pyplot and the figure is shown.
    """
    sentiment_labels = ('NEGATIVE', 'NEUTRAL', 'POSITIVE')
    # Compute all three distributions first so nothing is drawn if one fails.
    hourly_shares = [
        get_time_distribution(depressed[depressed.sentiment == label]['published'])
        for label in sentiment_labels
    ]

    for share in hourly_shares:
        plt.plot(share)
    # Legend entries follow the plotting order above.
    plt.legend(['negative', 'neutral', 'positive'])
    plt.show()

In this notebook I look at the different temporal distributions.
First, let's look at emotion over time:

In [None]:
# Load the pickled tweet frame. The context manager closes the file handle,
# which the bare open() call left open.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("keyword_frames/english_keyworded_tweets.pkl", "rb") as pickle_file:
    df = pickle.load(pickle_file)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Keep only the rows whose 'keywords' entry is present (not null).
english = df[df['keywords'].notnull()]

In [None]:
# Per-column non-null counts for the rows whose keywords entry has at least one element.
english[english.keywords.map(lambda x: len(x) > 0)].count()

In [None]:
# Hourly publishing distribution per sentiment for the rows with a keywords entry.
plot_daily(english)

In [None]:
# "sad" = rows of `english` whose keywords entry contains at least one element.
sad = english[english.keywords.map(lambda x: len(x) > 0)]

In [None]:
# Compare the hourly publishing distribution of the keyword-matched ("sad") rows
# with that of all rows that have a keywords entry ("general").
get_time_distribution(sad.published).plot()
get_time_distribution(english.published).plot()  # not useful
plt.legend(['sad', 'general'])
plt.show()

In [None]:
# Share of each author_gender value among the "sad" rows (null genders are excluded).
sad_people = sad['author_gender'].value_counts().div(sad['author_gender'].count())
sad_people

In [None]:
# Share of each author_gender value among all rows of `english` (null genders are excluded).
all_people = english['author_gender'].value_counts().div(english['author_gender'].count())
all_people

In [None]:
# value_counts sorts by descending frequency, so .iloc[1:] leaves out the most
# common gender label in each series.
# NOTE(review): presumably that label is an "unknown"-style bucket - confirm.
all_people.iloc[1:].plot.bar()
plt.figure()  # start a second figure for the "sad" subset
sad_people.iloc[1:].plot.bar()
plt.show()

In [None]:
# Share of each sentiment value among the "sad" rows (null sentiments are excluded).
sentiment_s = sad['sentiment'].value_counts().div(sad['sentiment'].count())
sentiment_s

In [None]:
def get_yearly(times):
    """Return the share of entries falling into each month of the year.

    times: pandas Series whose elements expose a ``month`` attribute

    The result is a Series indexed by month number, sorted by month. Each
    value is the number of entries in that month divided by ``times.count()``,
    i.e. the number of non-null entries.
    """
    publishing_time = times.map(lambda x: x.month).value_counts() / times.count()
    # value_counts orders by descending frequency. Sort by month so that
    # positional consumers (e.g. line plots of the raw values) follow the
    # calendar instead of always showing a decreasing curve.
    return publishing_time.sort_index()

In [None]:
# Monthly distribution for all rows, then separately for each sentiment class.
total_y = get_yearly(df.published)
neg_y, neut_y, pos_y = (
    get_yearly(df.loc[df.sentiment == label, 'published'])
    for label in ('NEGATIVE', 'NEUTRAL', 'POSITIVE')
)

In [None]:
# The way spinner collected the tweets indicates that the frequency of tweets
# differs between months.
# sns.tsplot was deprecated in favour of lineplot and is no longer available in
# current seaborn releases, so the lines are drawn with pandas/matplotlib.
# sort_index() puts the months in calendar order on the x-axis regardless of
# the order in which get_yearly returned them.
fig, ax = plt.subplots()
neg_y.sort_index().plot(ax=ax, color='purple', label='negative')
pos_y.sort_index().plot(ax=ax, color='red', label='positive')
neut_y.sort_index().plot(ax=ax, label='neutral')
ax.set_xlabel('month')
ax.set_ylabel('share of tweets')
ax.legend()
plt.show()

How are we supposed to deal with such an unequally distributed dataset?
-> We do this by working on the big dataset.

In [None]:
# Number of rows with (True) and without (False) a missing author_user_id.
df.author_user_id.isnull().value_counts()

In [None]:
# Number of rows with (True) and without (False) a missing geo_point.
df.geo_point.isnull().value_counts()

In [None]:
# Number of rows with (True) and without (False) a missing source_location.
df.source_location.isnull().value_counts()

In [None]:
import folium
from folium.plugins import MarkerCluster

In [None]:
def process_location_data(df):
    """Parse the 'geo_point' strings of a dataframe into numeric coordinates.

    preprocessed data in format as found on cluster:
    df: dataframe, locations should be in 'geo_point', as text whose parts are
        separated either by a comma or by whitespace

    Returns a float dataframe with one row per non-null geo_point and the
    parsed parts in columns 0, 1, ... Comma-separated rows come first,
    followed by whitespace-separated rows; original index labels are kept.
    An input without any non-null geo_point yields an empty frame with
    columns 0 and 1.
    """
    points = df.loc[df.geo_point.notnull(), 'geo_point']
    if points.empty:
        # An all-null column may not have a string dtype, in which case the
        # .str accessor below would raise; return an empty result instead.
        return pd.DataFrame(columns=[0, 1], dtype=float)

    # Compute the separator mask once and reuse it for both subsets.
    has_comma = points.str.contains(',')
    comma_separated = points[has_comma].str.split(pat=',', expand=True)
    space_separated = points[~has_comma].str.split(expand=True)

    # astype(float) replaces the deprecated element-wise applymap(float).
    return pd.concat([comma_separated, space_separated]).astype(float)
    


def build_map(location):
    """Build a folium map centred on Switzerland with one clustered marker per row.

    location: dataframe, each row must be of type [lat, long], with the
        latitude under label 0 and the longitude under label 1
    note: location should not contain more than 5000 locations, otherwise buggy
    """
    swiss_center = [46.85, 8.23]  # location of switzerland
    clustered_map = folium.Map(location=swiss_center, zoom_start=8, tiles='cartodbpositron')
    cluster = MarkerCluster().add_to(clustered_map)
    # Markers go into the cluster layer rather than directly onto the map.
    for _, row in location.iterrows():
        folium.Marker([row[0], row[1]]).add_to(cluster)
    return clustered_map

In [None]:
# Parse every non-null geo_point of df into numeric coordinates.
locations = process_location_data(df)

In [None]:
# Only the first 100 parsed locations are placed on the map.
swiss_map = build_map(location=locations.head(100))

In [None]:
# Display the map as the cell output.
swiss_map

In [None]:
# One row per distinct geo_point value together with how often it occurs.
aggregated_points = (
    df.geo_point
    .value_counts()
    .rename_axis('geo_point')
    .reset_index(name='count')
)

In [None]:
# Parse the distinct geo_point values; process_location_data only reads
# 'geo_point', so the 'count' column is not carried over.
location = process_location_data(aggregated_points)

In [None]:
# One marker per distinct geo_point value.
# NOTE(review): build_map's docstring warns about more than 5000 locations - confirm the size of `location`.
ag_map = build_map(location=location)

In [None]:
# Display the aggregated map as the cell output.
ag_map