In [47]:
from pymongo import MongoClient
import numpy as np
import pandas as pd

In [45]:
# Data Fetching, transformation and filtering


def getLangRatio(cursor):
    """Compute, for every user, the share of their tweets tagged as English.

    Parameters
    ----------
    cursor : iterable of dict
        Tweet documents. Each one must expose ``tweet["lang"]`` and
        ``tweet["user"]["id"]``.

    Returns
    -------
    dict
        Maps user id -> fraction (0..1) of that user's tweets whose ``lang``
        equals ``"en"``.
    """
    # Gather one 0/1 flag per tweet, bucketed by author.
    english_flags = {}
    for doc in cursor:
        is_english = 1 if doc["lang"] == "en" else 0
        english_flags.setdefault(doc["user"]["id"], []).append(is_english)

    # Collapse every bucket to the mean of its flags.
    return {
        uid: np.sum(flags) / len(flags)
        for uid, flags in english_flags.items()
    }
                                   

def getUsersTweets(dbName,collectionName, en_threshold=0.9):
    """Load tweets from MongoDB and group them per user and per timestamp.

    The collection is scanned twice: once to compute each user's English
    ratio (via ``getLangRatio``) and once to collect the tweets of the users
    whose ratio is at least ``en_threshold``.

    Parameters
    ----------
    dbName : str
        Database name on the MongoDB server at ``localhost:27017``.
    collectionName : str
        Collection holding the tweet documents.
    en_threshold : float, optional
        Users whose English ratio is below this value are skipped.

    Returns
    -------
    dict
        ``{user_id: {created_at: record}}`` where ``record`` has the keys
        ``'text'``, ``'polarity'`` (1 for "positive", -1 for "negative",
        0 otherwise), ``'emotion'`` (name of the first emotion group),
        ``'emotion_2'`` (name of the second group, or ``None``) and
        ``'ambiguous'`` (bool, True when the stored flag is ``'yes'``).
    """
    # A single client serves both passes and is always closed afterwards;
    # previously two clients were opened and neither was released.
    client = MongoClient("localhost", 27017)
    try:
        collection = client[dbName][collectionName]

        # Pass 1: per-user English ratio.
        lang_ratios = getLangRatio(collection.find())

        # Pass 2: keep only tweets from sufficiently-English users.
        usersTweets = {}
        for tweet in collection.find():
            userID = tweet["user"]["id"]
            if lang_ratios[userID] < en_threshold:
                continue

            #Processing emotions from Carlos' API
            groups = tweet["emotion"]["groups"]
            emotion = groups[0]["name"]
            emotion_2 = groups[1]["name"] if len(groups) > 1 else None
            ambiguous = tweet['emotion']['ambiguous'] == 'yes'

            if tweet["polarity"] == "positive":
                polarity = 1
            elif tweet["polarity"] == "negative":
                polarity = -1
            else:
                polarity = 0

            # A later tweet with the same user and timestamp overwrites the
            # earlier record. Key order fixes the downstream column order.
            usersTweets.setdefault(userID, {})[tweet["created_at"]] = {
                'text': tweet['text'],
                'polarity': polarity,
                'emotion': emotion,
                'emotion_2': emotion_2,
                'ambiguous': ambiguous,
            }
    finally:
        client.close()
    return usersTweets



def timeSeriesTransform(usersEmotions):
    """Convert each user's ``{timestamp: record}`` mapping into a DataFrame.

    For every user the records become rows indexed by timestamp (missing
    values filled with 0), ordered chronologically, with an extra ``'dt'``
    column holding the gap in minutes to the *next* row. The last row of
    each frame has ``dt == 0``.

    Parameters
    ----------
    usersEmotions : dict
        ``{user_id: {timestamp: {column: value}}}``. The timestamps must be
        datetime-like so that consecutive index values can be subtracted.
        The dict is updated in place: each value is replaced by its frame.

    Returns
    -------
    list of pandas.DataFrame
        One frame per user, in the dict's iteration order.
    """
    for userID in usersEmotions:
        frame = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        # Gaps are only meaningful between chronologically adjacent tweets,
        # and the source documents are not guaranteed to arrive in time order.
        frame = frame.sort_index()

        dt = np.zeros(frame.shape[0], dtype=float)
        if frame.shape[0] > 1:
            stamps = frame.index.values
            gaps = (stamps[1:] - stamps[:-1]).astype('timedelta64[s]')
            # Fill the first n-1 slots by position. The previous
            # ``.loc[:-1, 'dt']`` passed an integer slice to the label-based
            # indexer, which pandas rejects on a datetime index.
            dt[:-1] = gaps / np.timedelta64(60, 's')
        frame['dt'] = dt

        usersEmotions[userID] = frame
    return list(usersEmotions.values())


def getHTTPRows(timeSeries):
    """Flag the rows of ``timeSeries`` whose ``'text'`` contains a web link.

    A row is flagged when its text contains ``http://`` or ``https://``.

    Parameters
    ----------
    timeSeries : pandas.DataFrame
        Must have a ``'text'`` column.

    Returns
    -------
    numpy.ndarray
        One entry per row, truthy where a link prefix was found.
    """
    text_ops = timeSeries['text'].str
    mask = None
    for prefix in ('http://', 'https://'):
        found = text_ops.contains(prefix)
        # Seed the mask with the first prefix, then OR in the remaining ones.
        mask = found if mask is None else mask | found
    return mask.values

def userFilter(group, spam_threshold=0.5,tweets_threshold=100):    #Spam and inactive user filter
    """Drop users that post mostly links or have too few tweets.

    Parameters
    ----------
    group : iterable of pandas.DataFrame
        One frame per user; each needs a ``'text'`` column.
    spam_threshold : float, optional
        A user is kept only if the fraction of their rows flagged by
        ``getHTTPRows`` is strictly below this value.
    tweets_threshold : int, optional
        A user is kept only if they have strictly more rows than this.

    Returns
    -------
    list of pandas.DataFrame
        The frames that pass both checks, in their original order.
    """
    def _passes(series):
        link_share = np.sum(getHTTPRows(series)) / series.shape[0]
        return (link_share < spam_threshold) and (series.shape[0] > tweets_threshold)

    return [series for series in group if _passes(series)]

class Group(object):
    """A named collection of per-user tweet time series (pandas DataFrames).

    A group is built either from an existing list of frames (``group``) or,
    when both ``dbName`` and ``collectionName`` keyword arguments are given,
    by loading a MongoDB collection through ``getUsersTweets``,
    ``timeSeriesTransform`` and ``userFilter``.

    Parameters
    ----------
    name : str
        Label for the group.
    group : list of pandas.DataFrame, optional
        Frames to wrap when no database source is given. ``None`` yields an
        empty group.
    spam_threshold, tweets_threshold
        Forwarded to ``userFilter`` when loading from the database.
    **kwargs
        ``dbName`` and ``collectionName`` select the MongoDB source.
    """

    def __init__(self, name, group=None, spam_threshold=0.5, tweets_threshold=100, **kwargs):
        self.name = name

        dbName  = kwargs.get("dbName", None)
        collectionName  = kwargs.get("collectionName", None)

        if dbName is None or collectionName is None:
            # Fall back to an empty list so that getSize/getTexts/__add__
            # work on a group created without any data.
            self.group = group if group is not None else []
        else:
            usersTweets = getUsersTweets(dbName, collectionName)
            self.group = userFilter(
                timeSeriesTransform(usersTweets),
                spam_threshold=spam_threshold,
                tweets_threshold=tweets_threshold,
            )

    def getTexts(self, tail_k = "all"):
        """Return one newline-joined string of tweet texts per user.

        With ``tail_k="all"`` every row is used; otherwise only the last
        ``tail_k`` rows of each frame.
        """
        if tail_k == "all":
            return ["\n".join(timeSeries["text"].values) for timeSeries in self.group]
        else:
            return ["\n".join(timeSeries["text"].tail(tail_k).values) for timeSeries in self.group]

    def getName(self):
        """Return the group's label."""
        return self.name

    def getSize(self):
        """Return the number of users (frames) in the group."""
        return len(self.group)

    def __repr__(self):
        return repr(self.group)

    def __add__(self, other):
        """Return a new Group with both members' frames; keeps this group's name."""
        return Group(self.name, self.group + other.group)

    def getGroup(self,tail_k="all"):
        """Return the list of frames, or each frame's last ``tail_k`` rows."""
        if tail_k == "all":
            return self.group
        else:
            return [timeSeries.tail(tail_k) for timeSeries in self.group]

    def __iadd__(self, other):
        """Append ``other``'s frames to this group and return ``self``."""
        # Rebind to a new list rather than extending in place: the current
        # list may be shared with the caller or with another Group (getGroup
        # hands out the internal list), and must not grow behind their back.
        self.group = self.group + other.group
        return self
    



In [48]:





# One Group per MongoDB collection. Passing dbName/collectionName makes the
# constructor load the collection and apply the default spam/activity filters.
BPD_group_clean = Group("BPD", dbName="patients", collectionName="BPD_clean")
regular_group = Group(
    "Random Samples",
    dbName="idea",
    collectionName="regularUser_en_fixed_emotion",
)
bipolar_group_clean = Group("Bipolar", dbName="patients", collectionName="bipolar_clean")
mix_group = Group("Mix", dbName="patients", collectionName="bb_mix")

# Each cohort extended with the "Mix" users. The sum keeps the left-hand
# group's name.
BPD_all = BPD_group_clean + mix_group
bipolar_all = bipolar_group_clean + mix_group

In [None]:
def getUserID(dbName, collectionName, filtered_id=None):
    """Collect per-user English flags, skipping users listed in ``filtered_id``.

    NOTE(review): the draft of this cell did not parse (an ``if`` without a
    colon or body), called ``.add`` on a dict, and relied on ``filtered_id``
    and ``user_langs`` being defined elsewhere. This repair keeps the draft's
    return value and treats the ``filtered_id`` check as an exclusion list.
    Confirm that this matches the intended behaviour.

    Parameters
    ----------
    dbName : str
        Database name on the MongoDB server at ``localhost:27017``.
    collectionName : str
        Collection holding the tweet documents.
    filtered_id : iterable, optional
        User ids to leave out. ``None`` excludes nobody.

    Returns
    -------
    dict
        Maps each remaining user id to a list with one entry per tweet:
        1 when the tweet's ``lang`` is ``"en"``, otherwise 0. The keys are
        the collected user ids.
    """
    excluded = set(filtered_id) if filtered_id is not None else set()

    client = MongoClient("localhost", 27017)
    try:
        user_langs = {}
        for tweet in client[dbName][collectionName].find():
            user_id = tweet["user"]["id"]
            if user_id in excluded:
                continue
            lang = 1 if tweet["lang"] == "en" else 0
            user_langs.setdefault(user_id, []).append(lang)
    finally:
        # Release the connection even if a document is malformed.
        client.close()
    return user_langs


In [19]:
def getLangRatio(dbName, collectionName):
    """Collect, per user, one English/non-English flag for every tweet.

    Despite the name, this returns the raw flag lists rather than ratios;
    callers compute ``sum / len`` themselves.

    NOTE: this definition reuses the name of the ``getLangRatio(cursor)``
    helper defined earlier in the notebook, which ``getUsersTweets`` calls
    with a single argument. Once this cell has run, that call resolves to
    this two-argument version.

    Parameters
    ----------
    dbName : str
        Database name on the MongoDB server at ``localhost:27017``.
    collectionName : str
        Collection holding the tweet documents.

    Returns
    -------
    dict
        Maps user id -> list with 1 for each tweet whose ``lang`` is
        ``"en"`` and 0 for every other tweet.
    """
    client = MongoClient("localhost", 27017)
    try:
        user_langs = {}
        for tweet in client[dbName][collectionName].find():
            lang = 1 if tweet["lang"] == "en" else 0
            user_langs.setdefault(tweet["user"]["id"], []).append(lang)
    finally:
        # The client used to be left open; close it once the scan is done.
        client.close()
    return user_langs

In [28]:
# Per-user English flags (1 = "en", 0 = anything else) for the BPD collection.
user_langs = getLangRatio(dbName="patients", collectionName="BPD_clean")

In [33]:
# (user id, share of English tweets) for every user in `user_langs`.
BPD_lang_ratios = [
    (uid, np.sum(flags) / len(flags))
    for uid, flags in user_langs.items()
]


In [37]:
# Order users from least to most English.
BPD_lang_ratios = sorted(BPD_lang_ratios, key=lambda pair: pair[1])

# Print only the users with under 75% English tweets.
for user_id, en_ratio in BPD_lang_ratios:
    if not en_ratio < 0.75:
        continue
    print(user_id, en_ratio)

3262241903 0.0
944577823 0.119863013699
269690229 0.220713609094
2413555298 0.31308411215
3324571407 0.32
3263225837 0.333333333333
3077233757 0.380952380952
169951404 0.415136755339
244021118 0.430107526882
45229455 0.495652173913
3194839751 0.5
116439735 0.510220125786
510585906 0.550855085509
3227127058 0.551724137931
374595521 0.571428571429
2954272784 0.57215007215
2292426456 0.6
558473669 0.621212121212
202200525 0.644128113879
2936388081 0.654624277457
49450953 0.700718525461
2251489214 0.714285714286
3228645394 0.717948717949
3297759413 0.728155339806
58007127 0.731600375822
287681418 0.747753973739


In [38]:
# Same low-English check for the bipolar collection. Note that `user_langs`
# is rebound here and no longer holds the BPD flags.
user_langs = getLangRatio(dbName="patients", collectionName="bipolar_clean")
bipolar_lang_ratios = sorted(
    [(uid, np.sum(flags) / len(flags)) for uid, flags in user_langs.items()],
    key=lambda pair: pair[1],
)

# Print only the users with under 75% English tweets, least English first.
for user_id, en_ratio in bipolar_lang_ratios:
    if not en_ratio < 0.75:
        continue
    print(user_id, en_ratio)

933392432 0.0
153294609 0.0172705722994
3029977273 0.0310130944176
2827792882 0.0769230769231
809571888 0.0880503144654
2499362502 0.127009646302
3367338663 0.173913043478
1229003286 0.185840707965
2286202992 0.188405797101
2984096968 0.2
3120472413 0.210365853659
504686644 0.261935483871
2921022655 0.317073170732
368501489 0.354838709677
553466344 0.385796545106
192116423 0.39165085389
3182719408 0.4
1962364130 0.407407407407
2425691041 0.425634824667
2600004696 0.439393939394
3041204445 0.475409836066
70032674 0.480043149946
3238987287 0.5
3076152329 0.539748953975
1015217155 0.558333333333
962084311 0.561849710983
1247445650 0.591133004926
1408777951 0.593103448276
206804977 0.608804402201
88995343 0.612752721617
29587967 0.61733615222
348873140 0.617647058824
582230868 0.624203821656
101816365 0.625
2751352106 0.636363636364
2950280698 0.645161290323
770070936 0.646706586826
102816119 0.655367231638
28725721 0.661290322581
3307369534 0.666666666667
2531462476 0.666666666667
3322592

In [None]:
# NOTE(review): no module-level `lang_ratios` is assigned in the visible cells
# (the name only appears as a local inside functions), so on a fresh kernel
# this unexecuted cell presumably raises NameError. Confirm whether it is
# still needed.
lang_ratios