In [None]:
from pymongo import MongoClient
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import operator
from Levenshtein import *
import re
from tabulate import tabulate
import age_gender_predictor
%matplotlib inline


In [19]:
# Data Fetching, transformation and filtering


def getUsersTweets(dbName,collectionName):
    """Load every tweet of a MongoDB collection, grouped by user and timestamp.

    Connects to the MongoDB server on localhost:27017, iterates over all
    documents of ``dbName.collectionName`` and keeps a handful of fields.

    Parameters
    ----------
    dbName : str
        Name of the MongoDB database.
    collectionName : str
        Name of the collection holding the tweet documents.

    Returns
    -------
    dict
        ``{user_id: {created_at: record}}`` where ``record`` is a dict with
        the keys ``'text'``, ``'polarity'`` (1 positive, -1 negative,
        0 otherwise), ``'emotion'`` (name of the first emotion group),
        ``'emotion_2'`` (name of the second emotion group, or ``None`` when
        there is only one) and ``'ambiguous'`` (bool).  Two tweets of the same
        user with the same ``created_at`` overwrite each other.

    Raises
    ------
    KeyError, IndexError
        If a document lacks one of the expected fields or has no emotion group.
    """
    polarity_codes = {"positive": 1, "negative": -1}
    usersTweets = {}

    client = MongoClient("localhost", 27017)
    try:
        for tweet in client[dbName][collectionName].find():
            userID = tweet["user"]["id"]

            # Emotion annotation: first group is mandatory, second is optional.
            emotion_groups = tweet["emotion"]["groups"]
            emotion = emotion_groups[0]["name"]
            emotion_2 = emotion_groups[1]["name"] if len(emotion_groups) > 1 else None
            ambiguous = tweet['emotion']['ambiguous'] == 'yes'

            # Anything that is neither "positive" nor "negative" counts as neutral.
            polarity = polarity_codes.get(tweet["polarity"], 0)

            date = tweet["created_at"]
            text = tweet['text']

            record = usersTweets.setdefault(userID, {}).setdefault(date, {})
            record['text'] = text
            record['polarity'] = polarity
            record['emotion'] = emotion
            record['emotion_2'] = emotion_2
            record['ambiguous'] = ambiguous
    finally:
        # Release the connection even if a malformed document aborts the loop.
        client.close()

    return usersTweets



def timeSeriesTransform(usersEmotions):
    """Turn each user's ``{timestamp: record}`` dict into a time-indexed DataFrame.

    For every user the nested dict is converted to a DataFrame (one row per
    timestamp, missing values replaced by 0), ordered chronologically, and a
    ``'dt'`` column is added holding the gap in minutes to the *next* row.
    The last row of each frame has ``dt == 0``.

    Note: ``usersEmotions`` is modified in place, each value being replaced
    by its DataFrame.

    Parameters
    ----------
    usersEmotions : dict
        ``{user_id: {timestamp: {column: value}}}``; the timestamps must be
        datetime-like so that they can be subtracted from one another.

    Returns
    -------
    list of pandas.DataFrame
        One frame per user, in the iteration order of ``usersEmotions``.
    """
    for userID in usersEmotions:
        frame = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        # The gaps below only make sense on chronologically ordered rows, and
        # dict insertion order is not guaranteed to be time order.
        frame = frame.sort_index()

        timestamps = frame.index.values
        gaps = np.zeros(frame.shape[0], dtype=float)
        # Positional assignment: row i gets the distance to row i + 1.  (A
        # label-based ``.loc[:-1]`` slice does not mean "all but the last row"
        # on a datetime index.)
        gaps[:-1] = (timestamps[1:] - timestamps[:-1]).astype('timedelta64[s]') / np.timedelta64(60, 's')
        frame['dt'] = gaps

        usersEmotions[userID] = frame
    return list(usersEmotions.values())


def getHTTPRows(timeSeries):
    """Flag the rows whose ``'text'`` contains a web link.

    A row is flagged when its text contains ``http://`` or ``https://``.

    Parameters
    ----------
    timeSeries : pandas.DataFrame
        Frame with a string column named ``'text'``.

    Returns
    -------
    numpy.ndarray
        One entry per row, truthy where a link was found.
    """
    tweet_text = timeSeries['text']
    has_plain_link = tweet_text.str.contains('http://')
    has_secure_link = tweet_text.str.contains('https://')
    return (has_plain_link | has_secure_link).values

def userFilter(group, spam_threshold=0.5,tweets_threshold=100):
    """Spam and inactive-user filter.

    Keeps a user's time series only when both conditions hold:

    * the share of tweets containing a link (see ``getHTTPRows``) is strictly
      below ``spam_threshold``;
    * the user has strictly more than ``tweets_threshold`` tweets.

    Parameters
    ----------
    group : iterable of pandas.DataFrame
        One frame per user, each with a ``'text'`` column.
    spam_threshold : float
        Upper bound (exclusive) on the fraction of link tweets.
    tweets_threshold : int
        Lower bound (exclusive) on the number of tweets.

    Returns
    -------
    list of pandas.DataFrame
        The frames that passed both checks, in their original order.
    """
    def _is_genuine_active_user(series):
        n_tweets = series.shape[0]
        link_share = np.sum(getHTTPRows(series)) / n_tweets
        return (link_share < spam_threshold) and (n_tweets > tweets_threshold)

    return [series for series in group if _is_genuine_active_user(series)]




In [None]:
# Fetch the raw tweets of every cohort from MongoDB.
BPD_tweets =  getUsersTweets("patients","BPD_clean")
regular_tweets =  getUsersTweets("idea","regularUser_en_fixed_emotion")
bipolar_tweets = getUsersTweets("patients","bipolar_clean")
mix_tweets = getUsersTweets("patients","bb_mix")



#Transform raw tweets into timeSeries data.
BPDtimeSeries = timeSeriesTransform(BPD_tweets) 
regular_timeSeries = timeSeriesTransform(regular_tweets)
bipolar_timeSeries = timeSeriesTransform(bipolar_tweets)
mix_timeSeries = timeSeriesTransform(mix_tweets)

#Filter and clean the timeSeries data
BPD_clean = userFilter(BPDtimeSeries)
bipolar_clean = userFilter(bipolar_timeSeries)
mix_clean = userFilter(mix_timeSeries)

# Down-sample the regular users to the size of the BPD cohort so that both
# groups contain the same number of users.  A dedicated, seeded generator
# keeps the draw reproducible without touching the global random state.
SAMPLE_SEED = 0
regular_filtered = userFilter(regular_timeSeries)
regular_clean = random.Random(SAMPLE_SEED).sample(
    regular_filtered, min(len(BPD_clean), len(regular_filtered))
)


# Parallel lists: groups[i] is plotted with colors[i] and labelled group_names[i].
groups = [regular_clean,bipolar_clean, BPD_clean, mix_clean]
colors=['g','b', 'y','r']
group_names = ["Random Samples", "Bipolar", "BPD","Mix"]


In [30]:
# Summary statistics of the experimental data: users, tweets and tweets per user for each group.

headers = ["Group","Users", "Tweets", "Average tweets"]
contents = []
for group_name, group in zip(group_names, groups):
    users_num = len(group)
    tweets_num = sum(timeSeries.shape[0] for timeSeries in group)
    # A group can end up empty after filtering; report NaN instead of
    # aborting the whole table with a ZeroDivisionError.
    average_tweets_num = tweets_num / users_num if users_num else float("nan")
    contents.append([group_name, users_num, tweets_num, average_tweets_num])
    
print(tabulate(contents, headers=headers))

Group             Users    Tweets    Average tweets
--------------  -------  --------  ----------------
Randpm Samples      232    328121           1414.31
Bipolar             146    182669           1251.16
BPD                 232    256110           1103.92
Mix                  46     62509           1358.89


In [None]:
#Age and gender distribution
def _joinTweetTexts(timeSeries):
    """Concatenate every value of the ``'text'`` column, each followed by a newline."""
    # str.join builds the result in one pass instead of re-copying the
    # growing string for every tweet.
    return "".join(text + "\n" for text in timeSeries["text"].values)


def getAge(timeSeries):
    """Pass all of a user's tweet texts to ``age_gender_predictor.get_age``.

    Parameters
    ----------
    timeSeries : pandas.DataFrame
        Frame with a string column named ``'text'``.

    Returns
    -------
    Whatever ``age_gender_predictor.get_age`` returns for the newline-joined
    texts.
    """
    return age_gender_predictor.get_age(_joinTweetTexts(timeSeries))

def getGender(timeSeries):
    """Pass all of a user's tweet texts to ``age_gender_predictor.get_gender``.

    Parameters
    ----------
    timeSeries : pandas.DataFrame
        Frame with a string column named ``'text'``.

    Returns
    -------
    Whatever ``age_gender_predictor.get_gender`` returns for the newline-joined
    texts.
    """
    return age_gender_predictor.get_gender(_joinTweetTexts(timeSeries))



def renderPieChart(groups,legends = ["BPD", "Normal"], figsize=None):
    """Draw one pie chart of average emotion shares per group and print their spread.

    For each group, every time series with at least 100 rows is passed through
    ``cleanPost`` and the mean of each emotion column is taken per user.
    Non-finite per-user means are discarded, the remaining ones are averaged
    over the users of the group and shown as a pie chart.  After each chart
    the standard deviation of the per-user means is printed for every emotion.

    ``cleanPost`` is not defined in this part of the notebook; this function
    assumes it returns a frame with one numeric column per emotion name listed
    below -- TODO confirm.

    Parameters
    ----------
    groups : sequence of sequences of pandas.DataFrame
        One inner sequence of user time series per group.
    legends : sequence of str
        Group labels used in the chart titles; needs one entry per group.
    figsize : tuple of float, optional
        Figure size in inches, forwarded to matplotlib.  ``None`` uses
        matplotlib's default size.

    Returns
    -------
    None
    """
    colorTable = {"joy":"#FADB4D","sadness": "#729DC9","fear":"#35A450", "anticipation": "#F2993A", "anger":"#E43054", "trust": "#99CC33", "disgust": "#9F78BA" ,"surprise" : "#3FA5C0"}
    emotions = ['surprise', 'fear', 'sadness', 'disgust', 'trust', 'anticipation', 'anger','joy']
    colors = [colorTable[emotion] for emotion in emotions]
    
    for g, group in enumerate(groups):
        emotion_means = {emotion: [] for emotion in emotions}
        for timeSeries in group:
            if timeSeries.shape[0] < 100:
                continue
            timeSeries = cleanPost(timeSeries)
            # Average only the emotion columns: a mean over the whole frame
            # fails on current pandas when a text column is still present.
            summary = timeSeries[emotions].mean()
            for emotion in emotions:
                emotion_means[emotion].append(summary[emotion])
            
        for emotion in emotions:  
            user_means = np.array(emotion_means[emotion])
            emotion_means[emotion] = user_means[np.isfinite(user_means)]

        emotion_means_np = [np.mean(emotion_means[emotion]) for emotion in emotions]
            
        # Create the figure *before* drawing so the pie lands on it and no
        # extra empty figure is produced.
        fig, ax = plt.subplots(figsize=figsize)
        ax.set_title("{} people".format(legends[g]))
        ax.pie(emotion_means_np, labels=emotions, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90)
        plt.show()
        plt.close(fig)
        for emotion in emotions:
            print("{} std: {}".format(emotion, np.std(emotion_means[emotion])))



# Table of gender and age per group.
# NOTE(review): the headers announce gender/age columns, but the loop below
# still collects user and tweet counts (the same computation as the tweet
# statistics table) and never calls getAge/getGender -- this looks unfinished;
# confirm the intended columns before relying on the table.
# This also rebinds the module-level `headers` and `contents` names.
headers = ["Group","Male users", "Female users", "Average age"]
contents = []
for i, group in enumerate(groups):
    group_name = group_names[i]
    # Total number of rows (tweets) over all users of the group.
    tweets_num = sum([timeSeries.shape[0] for timeSeries in group])
    users_num = len(group)
    average_tweets_num = tweets_num / users_num
    contents.append([group_name, users_num, tweets_num, average_tweets_num])
    