In [269]:
from anytree import Node, PreOrderIter, RenderTree
from anytree.importer import DictImporter

from pymongo import MongoClient

from scipy import stats

from bson import ObjectId

from deepdiff import DeepDiff

import json
import ast
import numpy as np



class Cascade:
    def create_cascade(self, file_name, ground_truth):
        # read json
        # read each cascade
        # save each cascade with it's ground truth to the database
        # data structure
        '''
        {
        'node_sequence' : []
        'ground_truth' : boolean
        }
        '''
        print('create_cascade: ', file_name, ground_truth )
        file = open(file_name)
        data = json.load(file)
        
        importer = DictImporter()
        root = importer.import_(data)
#         print(RenderTree(root))
        
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.create_database('tweets')
        records = db['cascades']
        for leaf in PreOrderIter(root, filter_ = lambda node: node.is_leaf):
            print([node.name for node in list(leaf.path)])
            try:
                records.insert_one({'node_sequence' : [node.name for node in list(leaf.path)], 'ground_truth' : ground_truth})
            except Exception as e:
                print(e)
                
    def cascade_analysis(self):
        # number of false and true cascades
        # max false and true cascades
        # mean false and true cascades
        
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.create_database('tweets')
        records = db['cascades']
        
        print('false count: ', records.count_documents({'ground_truth': False}))
        lengths = [len(record['node_sequence']) for record in records.find({'ground_truth': False})]
        print('false average number of nodes:', sum(lengths)/len(lengths))
        print('false max number of nodes', max(lengths))
        print("*"*5)
        print('True count: ', records.count_documents({'ground_truth': True}))
        lengths = [len(record['node_sequence']) for record in records.find({'ground_truth': True})]
        print('true average number of nodes:', sum(lengths)/len(lengths)) 
        print('true max number of nodes', max(lengths))
    
    def flattern(self, lis):
        result = []
        for item in lis:
            if hasattr(item, '__iter__') and not isinstance(item, str):
                result.extend(self.flattern(item))
            else:
                result.append(item)
        return result
        
    def map_node_features(self, emotional_words):
        emotional_vector = {
             'fear': 0.0,
             'anger': 0.0,
             'anticipation': 0.0,
             'trust': 0.0,
             'surprise': 0.0,
             'positive': 0.0,
             'negative': 0.0,
             'sadness': 0.0,
             'disgust': 0.0,
             'joy': 0.0
        }
        
        # return default emotional vector if the emotional_words list is empty
        if not bool(emotional_words):
            return emotional_vector
        
        all_emotional_words = self.flattern([emotional_words[key] for key in emotional_words])
        for emotion in all_emotional_words:
            emotional_vector[emotion] += 1/len(all_emotional_words) 
        return emotional_vector  
        
    def create_node(self, node_id):
        # read database
        # return all emotionl words and entities
        # populate_cascade
        '''
        node_features_sequence: [{
            'user_id': str 
            'emotional_frequecy': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
            'emotional_mean': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
            'emotional_standard_deviation': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
             'emotional_q1': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
             'emotional_q2': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy
                 ': 0.0
                 },
             'emotional_q3': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
            'followers_count': int
            'friends_count': int
            'listed_count': int
            'statuses_count': int
            'created_at': Time
        }]
        '''
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['timeline_extended']
        
        # each node is a user
        node = records.find_one({'user_id': node_id})
        emotion_timeline = []
        for tweet in node['timeline']:
            emotional_words = tweet['full_text_nlp']['emotional_words']
            emotional_words = ast.literal_eval(emotional_words)
            emotion_timeline.append(self.map_node_features(emotional_words))
        
        emotional_vector = {
             'fear': 0.0,
             'anger': 0.0,
             'anticipation': 0.0,
             'trust': 0.0,
             'surprise': 0.0,
             'positive': 0.0,
             'negative': 0.0,
             'sadness': 0.0,
             'disgust': 0.0,
             'joy': 0.0
        }
#         print(len(emotion_timeline))
        dup_emotional_vector = emotional_vector
        
        # emotional frequnecy
        for key in emotional_vector:
            emotional_vector[key] = sum(item[key] for item in emotion_timeline)
        emotional_frequency = emotional_vector
#         print(emotional_frequency)
        emotional_vector = dup_emotional_vector
        
        # emotional mean
        for key in emotional_vector:
            emotional_vector[key] = np.mean([item[key] for item in emotion_timeline])
        emotional_mean = emotional_vector
#         print(emotional_mean)
        emotional_vector = dup_emotional_vector

        # emotional standard deviation
        for key in emotional_vector:
            emotional_vector[key] = np.std([item[key] for item in emotion_timeline])
        emotional_std = emotional_vector
#         print(emotional_std)
        emotional_vector = dup_emotional_vector  
        
        # emotional q1
        for key in emotional_vector:
            emotional_vector[key] = np.quantile([item[key] for item in emotion_timeline], 0.25)
        emotional_q1 = emotional_vector
#         print(emotional_q1)
        emotional_vector = dup_emotional_vector       
        
        # emotional q2
        for key in emotional_vector:
            emotional_vector[key] = np.quantile([item[key] for item in emotion_timeline], 0.5)
        emotional_q2 = emotional_vector
#         print(emotional_q2)
        emotional_vector = dup_emotional_vector
        
        # emotional q3
        for key in emotional_vector:
            emotional_vector[key] = np.quantile([item[key] for item in emotion_timeline], 0.75)
        emotional_q3 = emotional_vector
#         print(emotional_q3)
        emotional_vector = dup_emotional_vector
        
        # create user features
        if node['timeline']:
            followers_count = node['timeline'][0]['user']['followers_count']
            friends_count = node['timeline'][0]['user']['friends_count']
            listed_count = node['timeline'][0]['user']['listed_count']
            statuses_count = node['timeline'][0]['user']['statuses_count']
            created_at = node['timeline'][0]['user']['created_at']
        
        # create a dict
        a_dict = {
            'user_id' : node_id,
            'emotional_frequency' : emotional_frequency,
            'emotional_mean' : emotional_mean,
            'emotional_std' : emotional_std,
            'emotional_q1' : emotional_q1,
            'emotional_q2' : emotional_q2,
            'emotional_q3' : emotional_q3,
            'followers_count': followers_count,
            'friends_count': friends_count,
            'listed_count': listed_count,
            'statuses_count': statuses_count,
            'created_at': created_at
        }
#         print('*'*10)
#         print(a_dict)
        
        return a_dict
    
    def populate_node_feature_sequence(self, _id):
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['cascades']
        
        _id = ObjectId(_id)
        
        node_feature_sequence = []
        
        for node in records.find_one({'_id' : _id})['node_sequence']:
            a_dict = self.get_node(node)
            node_feature_sequence.append(a_dict)
        
        print(node_feature_sequence)
        # write to database
        records.update_one({'_id': _id}, {'$set' : { 'node_feature_sequence' : node_feature_sequence}})
    
    def populate_me(self):
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['cascades']
        
        for record in records.find({}):
            id_str = record['_id']
            print(id_str)
            self.populate_node_feature_sequence(id_str)
        


class MultivariateTimeSeriesModal:
    series = []
    
    # reshape data as single dataset where each row is a timestep and each column is a seperate time series
    '''
    'matrix' : [[]]
    '''
    def create_vector(self, _id):
        
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['cascades']
        
        _id = ObjectId(_id)
        
        node_feature_sequence = records.find_one({'_id': _id})['node_feature_sequence']
#         print(len(node_feature_sequence))
        
        matrix = []
        s = node_feature_sequence
        
        for time_step in s:
            series = self.get_all_values(time_step)
            matrix.append(self.series)
#         print(matrix)
        
        # write to database
        records.update_one({'_id': _id}, {'$set' : { 'matrix' : matrix}})
        
    def get_all_values(self, d):   
        for key, value in d.items():
            if type(value) is dict:
                self.get_all_values(value)
            else:
                self.series.append(value)

    def populate_me(self):
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['cascades']
        
        for record in records.find({}):
            id_str = record['_id']
            print(id_str)
            self.create_vector(id_str)
    
    

In [270]:
objt = MultivariateTimeSeriesModal()

In [271]:
objt.populate_me()

611a751f136ce8c98ad4c242
611a751f136ce8c98ad4c243
611a751f136ce8c98ad4c244
611a751f136ce8c98ad4c245
611a751f136ce8c98ad4c246
611a751f136ce8c98ad4c247
611a751f136ce8c98ad4c248
611a751f136ce8c98ad4c249
611a751f136ce8c98ad4c24a
611a751f136ce8c98ad4c24b
611a751f136ce8c98ad4c24c
611a751f136ce8c98ad4c24d
611a751f136ce8c98ad4c24e
611a751f136ce8c98ad4c24f
611a751f136ce8c98ad4c250
611a751f136ce8c98ad4c251
611a751f136ce8c98ad4c252
611a751f136ce8c98ad4c253
611a751f136ce8c98ad4c254
611a751f136ce8c98ad4c255
611a751f136ce8c98ad4c256
611a751f136ce8c98ad4c257
611a751f136ce8c98ad4c258
611a751f136ce8c98ad4c259
611a751f136ce8c98ad4c25a
611a751f136ce8c98ad4c25b
611a751f136ce8c98ad4c25c
611a751f136ce8c98ad4c25d
611a751f136ce8c98ad4c25e
611a751f136ce8c98ad4c25f
611a751f136ce8c98ad4c260
611a751f136ce8c98ad4c261
611a751f136ce8c98ad4c262
611a751f136ce8c98ad4c263
611a751f136ce8c98ad4c264
611a751f136ce8c98ad4c265
611a751f136ce8c98ad4c266
611a751f136ce8c98ad4c267
611a751f136ce8c98ad4c268
611a751f136ce8c98ad4c269


KeyError: 'node_feature_sequence'

In [193]:
obj = Cascade()

In [160]:
# obj.populate_node_feature_sequence('611a751f136ce8c98ad4c242')

In [194]:
obj.cascade_analysis()

false count:  8847
false average number of nodes: 7.127161749745676
false max number of nodes 15
*****
True count:  316
true average number of nodes: 3.1835443037974684
true max number of nodes 10


In [76]:
li = [1,2,3,4,5]
# li = np.array(li)

In [77]:
np.std(li)

1.4142135623730951

In [168]:
for p,c in zip(li[:-1],li[1:]):
    print(c, p, p-c)

2 1 -1
3 2 -1
4 3 -1
5 4 -1


In [172]:
d1 = {'me': 5, 'l': {'l': 10}}
d2 = {'me': 3, 'l': {'l': 3}}

print(DeepDiff(d1, d2, ))

{'values_changed': {"root['me']": {'new_value': 3, 'old_value': 5}, "root['l']['l']": {'new_value': 3, 'old_value': 10}}, 'deep_distance': 0.14285714285714285}


In [211]:
l1 = np.array([1,1,1,1]).reshape(2,2)
l2 = np.array([2,2,2])

l1

array([[1, 1],
       [1, 1]])