In [33]:
from pymongo import MongoClient
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import operator
from Levenshtein import *
import re
from tabulate import tabulate
import age_gender_predictor
%matplotlib inline


In [19]:
# Data Fetching, transformation and filtering


def getUsersTweets(dbName,collectionName):
    """Read a whole MongoDB collection of tweets and group it by user and date.

    Connects to the MongoDB server on ``localhost:27017``, iterates over every
    document of ``dbName.collectionName`` and closes the connection afterwards.

    Parameters
    ----------
    dbName : str
        Name of the MongoDB database.
    collectionName : str
        Name of the collection inside that database.

    Returns
    -------
    dict
        ``{user_id: {created_at: record}}``. Each record is a dict with keys
        ``text``, ``polarity`` (1 for "positive", -1 for "negative", 0
        otherwise), ``emotion`` (name of the first emotion group),
        ``emotion_2`` (name of the second emotion group, or ``None`` when
        there is only one) and ``ambiguous`` (bool).

    Raises
    ------
    KeyError, IndexError
        If a document lacks one of the fields read below or has an empty
        ``emotion.groups`` list.
    """
    usersTweets = {}
    # The context manager closes the client (and its sockets) even if a
    # malformed document raises while we iterate.
    with MongoClient("localhost", 27017) as client:
        for tweet in client[dbName][collectionName].find():
            userID = tweet["user"]["id"]

            # Emotion annotation: the first group is the primary emotion, the
            # second one (when present) is kept as a secondary emotion.
            emotion_groups = tweet["emotion"]["groups"]
            emotion = emotion_groups[0]["name"]
            emotion_2 = emotion_groups[1]["name"] if len(emotion_groups) > 1 else None
            ambiguous = tweet['emotion']['ambiguous'] == 'yes'

            # Encode the polarity label as a signed integer.
            if tweet["polarity"] == "positive":
                polarity = 1
            elif tweet["polarity"] == "negative":
                polarity = -1
            else:
                polarity = 0

            date = tweet["created_at"]
            text = tweet['text']

            # A later tweet of the same user with the same timestamp replaces
            # the earlier record. Key order fixes the column order downstream.
            usersTweets.setdefault(userID, {})[date] = {
                'text': text,
                'polarity': polarity,
                'emotion': emotion,
                'emotion_2': emotion_2,
                'ambiguous': ambiguous,
            }
    return usersTweets



def timeSeriesTransform(usersEmotions):
    """Turn each user's ``{date: record}`` dict into a DataFrame with a ``dt`` column.

    For every user the nested dict is converted to a DataFrame indexed by the
    dict keys, missing values are replaced by 0, and a float column ``dt`` is
    added holding the gap, in minutes, between a row's index value and the
    next row's index value. The last row has no successor and keeps ``dt = 0``.

    Note: ``usersEmotions`` is modified in place (its values are replaced by
    the DataFrames), so calling this twice on the same dict is not supported.

    Parameters
    ----------
    usersEmotions : dict
        ``{user_id: {date: record_dict}}``.
        # NOTE(review): the subtraction below requires a datetime-like index
        # (e.g. datetime / Timestamp keys) - confirm the stored type.

    Returns
    -------
    list of pandas.DataFrame
        One frame per user, in the dict's iteration order.
    """
    for userID in usersEmotions:
        frame = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        minutes = np.zeros(frame.shape[0], dtype=float)
        if frame.shape[0] > 1:
            stamps = frame.index.values
            # Positional "all rows but the last": a label-based `.loc[:-1]`
            # is not a positional slice on a datetime index.
            # NOTE(review): gaps are only meaningful if rows are in
            # chronological order; no sorting is done here.
            gaps = (stamps[1:] - stamps[:-1]).astype('timedelta64[s]')
            minutes[:-1] = gaps / np.timedelta64(60, 's')
        frame['dt'] = minutes
        usersEmotions[userID] = frame
    return list(usersEmotions.values())


def getHTTPRows(timeSeries):
    """Flag the rows whose ``text`` contains a web link.

    A row is flagged when its text matches ``http://`` or ``https://``.

    Parameters
    ----------
    timeSeries : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``timeSeries``.
    """
    text_methods = timeSeries['text'].str
    has_link = text_methods.contains('http://')
    has_link = has_link | text_methods.contains('https://')
    return has_link.values

def userFilter(group, spam_threshold=0.5,tweets_threshold=100):
    """Drop spam-like and low-activity users from a group.

    A user's frame is kept only when the fraction of rows flagged by
    ``getHTTPRows`` is strictly below ``spam_threshold`` and the frame has
    strictly more than ``tweets_threshold`` rows.

    Parameters
    ----------
    group : iterable of pandas.DataFrame
        One frame per user.
    spam_threshold : float
        Upper bound (exclusive) on the fraction of link-carrying rows.
    tweets_threshold : int
        Lower bound (exclusive) on the number of rows.

    Returns
    -------
    list of pandas.DataFrame
        The frames that passed both checks, in their original order.
    """
    kept = []
    for series in group:
        total_rows = series.shape[0]
        link_ratio = np.sum(getHTTPRows(series)) / total_rows
        too_spammy = not (link_ratio < spam_threshold)
        too_quiet = not (total_rows > tweets_threshold)
        if too_spammy or too_quiet:
            continue
        kept.append(series)
    return kept




In [48]:
# Fetch the raw tweets of every cohort from MongoDB.
BPD_tweets =  getUsersTweets("patients","BPD_clean")
regular_tweets =  getUsersTweets("idea","regularUser_en_fixed_emotion")
bipolar_tweets = getUsersTweets("patients","bipolar_clean")
mix_tweets = getUsersTweets("patients","bb_mix")



# Transform raw tweets into timeSeries data.
# timeSeriesTransform replaces the values of the *_tweets dicts in place, so
# re-run the fetch above before re-running these lines.
BPDtimeSeries = timeSeriesTransform(BPD_tweets) 
regular_timeSeries = timeSeriesTransform(regular_tweets)
bipolar_timeSeries = timeSeriesTransform(bipolar_tweets)
mix_timeSeries = timeSeriesTransform(mix_tweets)

# Filter and clean the timeSeries data (drops link-heavy and low-activity users).
BPD_clean = userFilter(BPDtimeSeries)
regular_clean = userFilter(regular_timeSeries)
bipolar_clean = userFilter(bipolar_timeSeries)
mix_clean = userFilter(mix_timeSeries)


# Parallel lists: groups[i] is drawn with colors[i] and labelled group_names[i].
groups = [regular_clean,bipolar_clean, BPD_clean, mix_clean]
colors=['g','b', 'y','r']
group_names = ["Random Samples", "Bipolar", "BPD","Mix"]


In [54]:
# Summary statistics of the experimental data: number of users, number of
# tweets and average tweets per user for every group.

headers = ["Group","Users", "Tweets", "Average tweets"]
contents = []
for i, group in enumerate(groups):
    group_name = group_names[i]
    tweets_num = sum(timeSeries.shape[0] for timeSeries in group)
    users_num = len(group)
    # A group can end up empty after filtering; report NaN instead of
    # crashing with ZeroDivisionError.
    average_tweets_num = tweets_num / users_num if users_num else float("nan")
    contents.append([group_name, users_num, tweets_num, average_tweets_num])
    
print(tabulate(contents, headers=headers))

Group             Users    Tweets    Average tweets
--------------  -------  --------  ----------------
Randpm Samples      548    796957           1454.3
Bipolar             146    182669           1251.16
BPD                 232    256110           1103.92
Mix                  46     62509           1358.89


In [53]:
#Age and gender Distribution
def getAge(timeSeries):
    """Predict a user's age from all of their tweets.

    Every entry of the ``text`` column is terminated with a newline, the
    pieces are concatenated into one document and that document is passed to
    ``age_gender_predictor.get_age``.

    Parameters
    ----------
    timeSeries : pandas.DataFrame
        Frame with a ``text`` column of strings.

    Returns
    -------
    Whatever ``age_gender_predictor.get_age`` returns for the document.
    """
    texts = "".join(text + "\n" for text in timeSeries["text"].values)
    return age_gender_predictor.get_age(texts)

def getGender(timeSeries):
    """Predict a user's gender score from all of their tweets.

    Each entry of the ``text`` column gets a trailing newline; the pieces are
    concatenated into a single document that is handed to
    ``age_gender_predictor.get_gender``.

    Parameters
    ----------
    timeSeries : pandas.DataFrame
        Frame with a ``text`` column of strings.

    Returns
    -------
    Whatever ``age_gender_predictor.get_gender`` returns for the document.
    """
    lines = [tweet_text + "\n" for tweet_text in timeSeries["text"].values]
    document = "".join(lines)
    return age_gender_predictor.get_gender(document)





# Per-group gender split and average predicted age.
# Note: the "Male users" / "Female users" columns hold fractions of the
# group, not absolute user counts.
headers = ["Group","Male users", "Female users", "Average age"]
contents = []
for i, group in enumerate(groups):
    group_name = group_names[i]
    users_num = len(group)
    users_gender = [getGender(timeSeries) for timeSeries in group]
    users_age = [getAge(timeSeries) for timeSeries in group]
    if users_num:
        # Users with a negative gender score are counted in the male column.
        male_ratio = sum(1 for gender in users_gender if gender < 0) / users_num
        female_ratio = 1 - male_ratio
        average_age = sum(users_age) / users_num
    else:
        # Empty group (e.g. everything filtered out): report NaN instead of
        # raising ZeroDivisionError.
        male_ratio = female_ratio = average_age = float("nan")
    contents.append([group_name, male_ratio, female_ratio, average_age])
print(tabulate(contents, headers=headers))

def renderAgeDistribution(groups, bins=100):
    """Draw one histogram of predicted user ages per group.

    For every group the age of each user is predicted with ``getAge``,
    non-finite predictions are dropped, and the remaining ages are plotted as
    a histogram. After each plot the average number of tweets per user and
    the mean / standard deviation of the ages are printed.

    Relies on the module-level ``colors`` and ``group_names`` lists:
    ``groups[g]`` is drawn with ``colors[g]`` and titled with
    ``group_names[g]``.

    Parameters
    ----------
    groups : sequence of sequences of pandas.DataFrame
        One inner sequence per group, one frame (with a ``text`` column) per
        user.
    bins : int
        Number of histogram bins passed to ``plt.hist``.

    Returns
    -------
    None
    """
    for g, group in enumerate(groups):
        ages = np.zeros(len(group), dtype=float)
        tweets_length = np.zeros(len(group), dtype=float)
        for i, timeSeries in enumerate(group):
            ages[i] = getAge(timeSeries)
            tweets_length[i] = timeSeries.shape[0]
        # Drop NaN / inf predictions so they cannot break the histogram.
        ages = ages[np.isfinite(ages)]
        plt.hist(ages, color=colors[g], bins=bins, edgecolor='none')
        plt.ylabel('people')
        plt.xlabel('predicted age')
        plt.title(str(len(group)) + " " + group_names[g] + " people")
        plt.show()
        print("Average size of total tweets: {} std: {}".format(np.mean(tweets_length), np.std(tweets_length)))
        print(" Mean:{0:2f} STD:{1:2f}".format(np.mean(ages), np.std(ages)))
    

Group             Male users    Female users    Average age
--------------  ------------  --------------  -------------
Randpm Samples      0.427007        0.572993        25.3958
Bipolar             0.19863         0.80137         34.6311
BPD                 0.116379        0.883621        30.9009
Mix                 0.130435        0.869565        33.4076
