In [1]:
from pymongo import MongoClient
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import operator
from scipy.spatial import distance
from Levenshtein import *
import re
%matplotlib inline
import age_gender_predictor

In [41]:
#Preprocessing functions:



def stringConditionProcess(group, method = "first"):
    result = []
    for timeSeries in group:
        conditions = seriesContains(timeSeries, method)
        result.append(timeSeries[conditions])
    return result

def getUsersPolarities(dbName,collectionName):
    collection = MongoClient("localhost", 27017)[dbName][collectionName]
    usersPolarties = {}
    for tweet in collection.find():
        userID = tweet["user"]["id"]
        if tweet["polarity"] == "positive":
            polarity = 1
        elif tweet["polarity"] == "negative":
            polarity = -1
        else:
            polarity = 0
   
            
        date = tweet["created_at"]
        text = tweet['text']

        if userID not in usersPolarties:
            usersPolarties[userID] = {}
        if date not in usersPolarties[userID]:
            usersPolarties[userID][date] = {}
        usersPolarties[userID][date]['text'] = text
        usersPolarties[userID][date]['polarity'] =  polarity


    return usersPolarties


def timeSeriesTransform(usersEmotions):
    for userID in usersEmotions:
        usersEmotions[userID] = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        usersEmotions[userID]['dt'] = np.zeros(usersEmotions[userID].shape[0],dtype=float)
        usersEmotions[userID].loc[:-1,'dt'] = (usersEmotions[userID].index[1:].values - usersEmotions[userID].index[:-1].values).astype('timedelta64[s]') / np.timedelta64(60, 's')
    return list(usersEmotions.values())


def disambiguate(timeSeries):
    conditions = np.logical_not(timeSeries['ambiguous'].values)
    timeSeries = timeSeries[conditions]
    timeSeries['dt'] = np.zeros(timeSeries.shape[0],dtype=float)
    timeSeries.loc[:-1,'dt'] = (timeSeries.index[1:].values - timeSeries.index[:-1].values).astype('timedelta64[s]') / np.timedelta64(60, 's')
    return timeSeries
        
def invalid_removal(data):
    data = np.array(data)
    return data[np.isfinite(data)]
    
def reject_outliers(data, m=2):
    data = np.array(data)
    data = data[np.isfinite(data)]
    return data[abs(data - np.mean(data)) < m * np.std(data)]        
        
        
    

In [3]:
#computational functions

def getAge(timeSeries):
    texts = ""
    for text in timeSeries["text"].values:
        texts += text + "\n"
    return age_gender_predictor.get_age(texts)

def getGender(timeSeries):
    texts = ""
    for text in timeSeries["text"].values:
        texts += text + "\n"
    return age_gender_predictor.get_gender(texts)
def negative_counter(timeSeries):
    return timeSeries["polarity"].values == -1
def positive_counter(timeSeries):
    return timeSeries["polarity"].values == 1

def comboTracker(timeSeries, attribute= "polarity"):
    array = timeSeries[attribute]
    starter = array[0]
    combo = 1
    result = []
    for cursor in array[1:]:
        if starter == cursor:
            combo += 1
        else:
            if combo > 1:
                result.append((starter, combo))
            starter = cursor
            combo = 1
    if combo > 1:
         result.append((starter, combo))
    return result





def seriesContains(timeSeries,method ="first"):
    if method == "first":
        match_function = np.vectorize(firstPronuonDetect)
    elif method == "second":
        match_function = np.vectorize(secondPronuonDetect)
    elif method == "third":
            match_function = np.vectorize(thirdPronuonDetect)


    return match_function(timeSeries["text"].str.lower().str.split().values)
    


def thirdPronuonDetect(words, matcher=re.compile("@[a-z]+")):
    for word in words:
        if word == "@":
            continue
        elif matcher.search(word):
            return True
    return False
    
    
def secondPronuonDetect(words, matchers=["you"]):
    for matcher in matchers:
        if matcher in words:
            return True
    return False
    


def firstPronuonDetect(words, matchers=["i","we","i'd","i'm"]):
    for matcher in matchers:
        if matcher in words:
            return True
    return False
    

def getFlipsDurationMean(timeSeries, upperbound=np.inf, lowerbound=0):
    flips = getFlips(timeSeries)
    durations = getFlipsDuration(timeSeries, flips)
    durations = durations[np.isfinite(durations)]
    durations = durations[(durations > lowerbound) & (durations < upperbound)]
    return np.mean(durations)
    
    
    
def getFlipsDuration(timeSeries, flips):
    timeSeries = timeSeries[flips]
    timeSeries.loc[:,'dt'] = np.zeros(timeSeries.shape[0],dtype=float)
    timeSeries.loc[:-1,'dt'] = (timeSeries.index[1:].values - timeSeries.index[:-1].values).astype('timedelta64[s]') / np.timedelta64(60, 's')
    return timeSeries['dt'][:-1].values



def getFlips(timeSeries, attribute= 'polarity'):
    flips = np.zeros(timeSeries.shape[0],dtype=bool)
    polarity = timeSeries[attribute].values[:-1]
    right_elements = timeSeries[attribute].values[1:]
    flips[:-1] = (polarity * right_elements) < 0
    return flips


def userVerify(timeSeries, threshold = 0.5):
    http_rows = getHTTPRows(timeSeries)
    average_http_count = np.sum(http_rows) / timeSeries.shape[0]
    return (average_http_count < threshold) & (timeSeries.shape[0] > 100)
 

def groupFilter(group):
    new_group = []
    for timeSeries in group:
        if userVerify(timeSeries):
            new_group.append(timeSeries)
    return new_group



def cleanPost(timeSeries):
    left_text = timeSeries['text'].values[:-1]
    right_text = timeSeries['text'].values[1:]
    conditions = np.ones(timeSeries.shape[0],dtype=bool)
    edit_distance = np.vectorize(distance)
    conditions[:-1] =  conditions[:-1] & (edit_distance(left_text, right_text) > 5)
    patterns = ['http://','https://']
    
    for pattern in patterns:
        conditions = conditions & np.logical_not(timeSeries['text'].str.contains(pattern).values)
    timeSeries = timeSeries[conditions]
    timeSeries.loc[:,'dt'] = np.zeros(timeSeries.shape[0],dtype=float)
    timeSeries.loc[:-1,'dt'] = (timeSeries.index[1:].values - timeSeries.index[:-1].values).astype('timedelta64[s]') / np.timedelta64(60, 's')

    return timeSeries

def getHTTPRows(timeSeries):
    count = 0
    patterns = ['http://','https://']
    conditions = timeSeries['text'].str.contains(patterns[0])
    for pattern in patterns[1:]:
        conditions = conditions | timeSeries['text'].str.contains(pattern)

    return conditions.values
    

In [4]:
#plotting functions 

def agePlot(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    for g,group in enumerate(groups):
        ages = np.zeros(len(group),dtype=float)
        for i, timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            age = getAge(timeSeries)
            ages[i] = age
        plt.hist(ages,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('User Count')
        plt.xlabel('Age(years)')
        plt.title(legends[g])
        plt.show()
        print("Age mean:{0:2f} STD:{1:2f}".format(np.mean(ages),np.std(ages)))
 
def genderPlot(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    for g,group in enumerate(groups):
        ages = np.zeros(len(group),dtype=float)
        for i, timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            gender = getGender(timeSeries)
            ages[i] = 1 if gender >0 else -1
        plt.hist(ages,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('User Count')
        plt.xlabel('Age(years)')
        plt.title(legends[g])
        plt.show()
        print("Age mean:{0:2f} STD:{1:2f}".format(np.mean(ages),np.std(ages)))       
        
        
def comboPlotPerUser(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 2,upperbound = 100,element_value=-1):
    for g,group in enumerate(groups):
        tweets_length = np.zeros(len(group),dtype=int)
        combos_average = np.zeros(len(group),dtype=float)
        for i,timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            combo = comboTracker(timeSeries)
            filtered_combo = [hit for element, hit in combo if hit > lowerbound and hit < upperbound and element == element_value]
            tweets_length[i] = timeSeries.shape[0]
            combos_average[i] = sum(filtered_combo) / tweets_length[i]
            
        plt.hist(combos_average,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('Post count')
        plt.xlabel('Time (mins)')
        plt.title(str(len(group)) + " "+legends[g] + " people")
        plt.show()
        print("Average size of total tweets: {} std:".format(np.mean(tweets_length),np.std(tweets_length)))
        print("Average combo mean:{0:2f} STD:{1:2f}".format(np.mean(combos_average),np.std(combos_average)))
        
        
def comboPlot(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100, element_value=-1):
    for g,group in enumerate(groups):
        tweets_length = np.zeros(len(group),dtype=int)
        combos = []
        for i,timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            combo = comboTracker(timeSeries)
            tweets_length[i] = timeSeries.shape[0]
            combos += [hit for element, hit in combo if hit > lowerbound and hit < upperbound and element == element_value]
        plt.hist(combos,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('Post count')
        plt.xlabel('Combos')
        plt.title(str(len(group)) + " "+legends[g] + " people")
        plt.show()
        print("Average size of total tweets: {} std:".format(np.mean(tweets_length),np.std(tweets_length)))
        print("Combo mean:{0:2f} STD:{1:2f}".format(np.mean(combos),np.std(combos)))

def flipPlotPerUser(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    print("Flip Durations of each flip")
    for g,group in enumerate(groups):
        tweets_length = np.zeros(len(group),dtype=int)
        flips_count = np.zeros(len(group),dtype=float)

        for i,timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            flips = getFlips(timeSeries)
            delta_times = getFlipsDuration(timeSeries, flips)
            tweets_length[i] = timeSeries.shape[0]
            flips_count[i] = np.sum((delta_times < upperbound) & (delta_times > lowerbound)) / tweets_length[i]
        plt.hist(flips_count,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('People')
        plt.xlabel('Flips / Tweets')
        plt.title(str(len(group)) + " "+legends[g] + " people")
        plt.show()
        print("Average size of total tweets: {} std:".format(np.mean(tweets_length),np.std(tweets_length)))
        print("Flips count mean:{0:2f} STD:{1:2f}".format(np.mean(flips_count),np.std(flips_count)))


def countPlotPerUser(groups,f, method="first", bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    for g,group in enumerate(groups):
        group = groupFilter(group)
        counts = np.zeros(len(group),dtype=float)
        tweets_length = np.zeros(len(group),dtype=int)
        for i, timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            if method is None:
                http_count = f(timeSeries)
            else:

                http_count = f(timeSeries,method)
            average_count = np.sum(http_count) / timeSeries.shape[0]
            tweets_length[i] = timeSeries.shape[0]
            counts[i] =  average_count
        counts = counts[np.isfinite(counts)]
        counts = counts[(counts>lowerbound) & (counts<upperbound)]
        counts = reject_outliers(counts)
        plt.hist(counts,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('people')
        plt.xlabel('specific post / total post ')
        plt.title(str(len(group)) + " " + legends[g] + " people")
        plt.show()
        print("Average size of total tweets: {} std:".format(np.mean(tweets_length),np.std(tweets_length)))
        print(" Mean:{0:2f} STD:{1:2f}".format(np.mean(counts),np.std(counts)))
    

def flipDurationPlotPerUser(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    print("Flip Durations of each person")
    for g,group in enumerate(groups):
        group = groupFilter(group)
        delta_times = np.zeros(len(group),dtype=float)
        for i, timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            mean_deal_time = getFlipsDurationMean(timeSeries, lowerbound=0, upperbound=upperbound)
            delta_times[i] = mean_deal_time
        delta_times = delta_times[np.isfinite(delta_times)]
        plt.hist(delta_times,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('Person count')
        plt.xlabel('Time (mins)')
        plt.title("{} {} people ".format(len(group),legends[g]))
        plt.show()
        print("Time Duration mean:{0:2f} STD:{1:2f}".format(np.mean(delta_times),np.std(delta_times)))










def flipDurationPlot(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    print("Flip Durations of each flip")
    for g,group in enumerate(groups):
        group = groupFilter(group)
        delta_times = np.array([])
        tweets_length = np.zeros(len(group),dtype=int)

        for i,timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            flips = getFlips(timeSeries)
            delta_times = np.concatenate((delta_times,getFlipsDuration(timeSeries, flips)))
            tweets_length[i] = timeSeries.shape[0]
        delta_times =delta_times[(delta_times < upperbound) & (delta_times > lowerbound)]
        plt.hist(delta_times,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('Flips count')
        plt.xlabel('Time (mins)')
        plt.title(str(len(group)) + " "+legends[g] + " people")
        plt.show()
        print("Average size of total tweets: {} std:".format(np.mean(tweets_length),np.std(tweets_length)))
        print("Time Duration mean:{0:2f} STD:{1:2f}".format(np.mean(delta_times),np.std(delta_times)))





def durationPlotPerUser(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    print("Durations of each person")
    for g,group in enumerate(groups):
        group = groupFilter(group)
        delta_times = np.zeros(len(group),dtype=float)
        for i, timeSeries in enumerate(group):
            timeSeries = cleanPost(timeSeries)
            mean_deal_time = getMeanDeltaTime(timeSeries, upperbound=upperbound, lowerbound=lowerbound)
            delta_times[i] = mean_deal_time
        delta_times = delta_times[np.isfinite(delta_times)]
        plt.hist(delta_times,color=colors[g], bins = bins, edgecolor='none' )
        plt.ylabel('Post count')
        plt.xlabel('Time (mins)')
        plt.title(legends[g])
        plt.show()
        print("Time Duration mean:{0:2f} STD:{1:2f}".format(np.mean(delta_times),np.std(delta_times)))

        
        
                
        


def durationPlot(groups, bins=100,legends = ["BPD", "Normal"], colors =['red', 'green'], lowerbound = 0,upperbound = 100):
    print("Durations of each tweets pair")
    for i,group in enumerate(groups):
        group = groupFilter(group)
        delta_times = np.array([])
        tweets_length = np.zeros(len(group),dtype=int)

        for timeSeries in group:
            timeSeries = cleanPost(timeSeries)
            delta_times = np.concatenate((delta_times,timeSeries['dt'][:-1].values))
            tweets_length[i] = timeSeries.shape[0]

        delta_times =delta_times[(delta_times < upperbound) & (delta_times > lowerbound)]
        plt.hist(delta_times,color=colors[i], bins = bins, edgecolor='none' )
        plt.ylabel('Post count')
        plt.xlabel('Time (mins)')
        plt.title(legends[i])
        plt.show()
        print("Average size of total tweets: {} std:".format(np.mean(tweets_length),np.std(tweets_length)))
        print("Time Duration mean:{0:2f} STD:{1:2f}".format(np.mean(delta_times),np.std(delta_times)))
            


    

In [44]:
#Fetching the data of BPD and random sampled Twitter user
BPDP_Polarties =  getUsersPolarities("patients","BPD_clean")
regular_Polarties =  getUsersPolarities("idea","regularUser_en_fixed_emotion")
bipolar_Polarties = getUsersPolarities("patients","bipolar_clean")
mix_Polarities = getUsersPolarities("patients","bb_mix")
#Transform raw tweets into timeSeries data.
BPDtimeSeries = timeSeriesTransform(BPD_Polarties) 
regular_timeSeries = timeSeriesTransform(regular_Polarties)
bipolar_timeSeries = timeSeriesTransform(bipolar_Polarties)
mix_timeSeries = timeSeriesTransform(mix)
#Filter and clean the timeSeries data
BPD_clean = groupFilter(BPDtimeSeries)
regular_clean = random.sample(groupFilter(regular_timeSeries), len(BPD_clean))
bipolar_clean = groupFilter(bipolar_timeSeries)
mix_clean = groupFilter(mix_timeSeries)


groups = [regular_clean,bipolar_clean, BPD_clean, mix_clean]
colors=['g','b', 'r','y']
legends = ["Normal", "Bipolar", "BPD","mix"]

NameError: name 'BPD_Polarties' is not defined

In [None]:
#Let's visualize 

In [None]:
#The X-axis is the time of one flip, the Y-axis is the flips count

flipDurationPlot(groups,legends=legends, colors=colors)

In [None]:
#The X-axis is the average flip time of user , the Y-axis is the user count
flipDurationPlotPerUser(groups, upperbound=30,colors=["r","g",'b'])

In [None]:
#The X-axis is the flips/all-tweets ratio, the Y-axis is users count
countPlotPerUser(groups, getFlips, method=None,colors=["r","g",'b'])

In [None]:
#The X-axis is the negative-post/all-tweets ratio, the Y-axis is users count
countPlotPerUser(groups, negative_counter, method=None)

In [None]:
#The X-axis is the positive-post/all-tweets ratio, the Y-axis is users count

countPlotPerUser(groups, positive_counter, method=None)

In [None]:
#The X-axis is the first-pronoun/all-tweets ratio, the Y-axis is users count

countPlotPerUser(groups, seriesContains) #First-pronoun

In [None]:
#The X-axis is the second-pronoun/all-tweets ratio, the Y-axis is users count

countPlotPerUser(groups, seriesContains, method="second") #second-pronoun

In [None]:
#The X-axis is the third-pronoun/all-tweets ratio, the Y-axis is users count

countPlotPerUser(groups, seriesContains, method = "third") #Third-pronoun

In [None]:
#The X-axis is the amount of negative combos, the Y-axis is combo counts

comboPlot(groups, upperbound=15, lowerbound=2, bins = 20)

In [None]:
#The 》-axis is the amount of positive combos, the Y-axis is combo counts

comboPlot(groups, upperbound=15, lowerbound=2, bins = 20, element_value=1)

In [None]:
#The X-axis is the ration of 3-or-higher negative-combo/all-tweets, the Y-axis is the user count 
comboPlotPerUser(groups,lowerbound=2,  upperbound=np.inf)   #   negative文章連發三次以上,    連發次數除以全部發文數

In [None]:
agePlot(groups)

In [None]:
genderPlot(groups)