In [70]:
from anytree import Node, PreOrderIter, RenderTree
from anytree.importer import DictImporter

from pymongo import MongoClient

from scipy import stats

from bson import ObjectId

import json
import ast
import numpy as np



class Cascade:
    def get_cascade(self, file_name, ground_truth):
        # read json
        # read each cascade
        # save each cascade with it's ground truth to the database
        # data structure
        '''
        {
        'node_sequence' : []
        'ground_truth' : boolean
        }
        '''
        print('get_cascade: ', file_name, ground_truth )
        file = open(file_name)
        data = json.load(file)
        
        importer = DictImporter()
        root = importer.import_(data)
#         print(RenderTree(root))
        
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['cascades']
        for leaf in PreOrderIter(root, filter_ = lambda node: node.is_leaf):
            print([node.name for node in list(leaf.path)])
            try:
                records.insert_one({'node_sequence' : [node.name for node in list(leaf.path)], 'ground_truth' : ground_truth})
            except Exception as e:
                print(e)
                
    def cascade_analysis(self):
        # number of false and true cascades
        # max false and true cascades
        # mean false and true cascades
        
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['cascades']
        
        print('false count: ', records.count_documents({'ground_truth': False}))
        lengths = [len(record['node_sequence']) for record in records.find({'ground_truth': False})]
        print('false average number of nodes:', sum(lengths)/len(lengths))
        print('false max number of nodes', max(lengths))
        print("*"*5)
        print('True count: ', records.count_documents({'ground_truth': True}))
        lengths = [len(record['node_sequence']) for record in records.find({'ground_truth': True})]
        print('true average number of nodes:', sum(lengths)/len(lengths)) 
        print('true max number of nodes', max(lengths))
    
    def flattern(self, lis):
        result = []
        for item in lis:
            if hasattr(item, '__iter__') and not isinstance(item, str):
                result.extend(self.flattern(item))
            else:
                result.append(item)
        return result
        
    def map_node_features(self, emotional_words):
        emotional_matrix = {
             'fear': 0.0,
             'anger': 0.0,
             'anticipation': 0.0,
             'trust': 0.0,
             'surprise': 0.0,
             'positive': 0.0,
             'negative': 0.0,
             'sadness': 0.0,
             'disgust': 0.0,
             'joy': 0.0
        }
        
        # return default emotional matrix if the emotional_words list is empty
        if not bool(emotional_words):
            return emotional_matrix
        
        all_emotional_words = self.flattern([emotional_words[key] for key in emotional_words])
        for emotion in all_emotional_words:
            emotional_matrix[emotion] += 1/len(all_emotional_words) 
        return emotional_matrix  
        
    def get_node(self, node_id):
        # read database
        # return all emotionl words and entities
        # populate_cascade
        '''
        node_features_sequence: [{
            'user_id': str 
            'emotional_frequecy': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
            'emotional_mean': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
            'emotional_standard_deviation': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
             'emotional_q1': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
             'emotional_q2': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy
                 ': 0.0
                 },
             'emotional_q3': 
                {
                 'fear': 0.14285714285714285,
                 'anger': 0.2857142857142857,
                 'anticip': 0.0,
                 'trust': 0.0,
                 'surprise': 0.0,
                 'positive': 0.0,
                 'negative': 0.42857142857142855,
                 'sadness': 0.14285714285714285,
                 'disgust': 0.0,
                 'joy': 0.0
                 },
            'followers_count': int
            'friends_count': int
            'listed_count': int
            'statuses_count': int
            'created_at': Time
        }]
        '''
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['timeline_extended']
        
        # each node is a user
        node = records.find_one({'user_id': node_id})
        emotion_timeline = []
        for tweet in node['timeline']:
            emotional_words = tweet['full_text_nlp']['emotional_words']
            emotional_words = ast.literal_eval(emotional_words)
            emotion_timeline.append(self.map_node_features(emotional_words))
        
        emotional_matrix = {
             'fear': 0.0,
             'anger': 0.0,
             'anticipation': 0.0,
             'trust': 0.0,
             'surprise': 0.0,
             'positive': 0.0,
             'negative': 0.0,
             'sadness': 0.0,
             'disgust': 0.0,
             'joy': 0.0
        }
#         print(len(emotion_timeline))
        dup_emotional_matrix = emotional_matrix
        
        # emotional frequnecy
        for key in emotional_matrix:
            emotional_matrix[key] = sum(item[key] for item in emotion_timeline)
        emotional_frequency = emotional_matrix
#         print(emotional_frequency)
        emotional_matrix = dup_emotional_matrix
        
        # emotional mean
        for key in emotional_matrix:
            emotional_matrix[key] = np.mean([item[key] for item in emotion_timeline])
        emotional_mean = emotional_matrix
#         print(emotional_mean)
        emotional_matrix = dup_emotional_matrix

        # emotional standard deviation
        for key in emotional_matrix:
            emotional_matrix[key] = np.std([item[key] for item in emotion_timeline])
        emotional_std = emotional_matrix
#         print(emotional_std)
        emotional_matrix = dup_emotional_matrix  
        
        # emotional q1
        for key in emotional_matrix:
            emotional_matrix[key] = np.quantile([item[key] for item in emotion_timeline], 0.25)
        emotional_q1 = emotional_matrix
#         print(emotional_q1)
        emotional_matrix = dup_emotional_matrix       
        
        # emotional q2
        for key in emotional_matrix:
            emotional_matrix[key] = np.quantile([item[key] for item in emotion_timeline], 0.5)
        emotional_q2 = emotional_matrix
#         print(emotional_q2)
        emotional_matrix = dup_emotional_matrix
        
        # emotional q3
        for key in emotional_matrix:
            emotional_matrix[key] = np.quantile([item[key] for item in emotion_timeline], 0.75)
        emotional_q3 = emotional_matrix
#         print(emotional_q3)
        emotional_matrix = dup_emotional_matrix
        
        # get user features
        if node['timeline']:
            followers_count = node['timeline'][0]['user']['followers_count']
            friends_count = node['timeline'][0]['user']['friends_count']
            listed_count = node['timeline'][0]['user']['listed_count']
            statuses_count = node['timeline'][0]['user']['statuses_count']
            created_at = node['timeline'][0]['user']['created_at']
        
        # create a dict
        a_dict = {
            'user_id' : node_id,
            'emotional_frequency' : emotional_frequency,
            'emotional_mean' : emotional_mean,
            'emotional_std' : emotional_std,
            'emotional_q1' : emotional_q1,
            'emotional_q2' : emotional_q2,
            'emotional_q3' : emotional_q3,
            'followers_count': followers_count,
            'friends_count': friends_count,
            'listed_count': listed_count,
            'statuses_count': statuses_count,
            'created_at': created_at
        }
#         print('*'*10)
#         print(a_dict)
        
        return a_dict
    
    def populate_node_feature_sequence(self, _id):
        # Connect to MongoDB    
        client = MongoClient('mongodb://localhost:27017/')
        db = client.get_database('tweets')
        records = db['cascades']
        
        _id = ObjectId(_id)
        
        node_feature_sequence = []
        
        for node in records.find_one({'_id' : _id})['node_sequence']:
            a_dict = self.get_node(node)
            node_feature_sequence.append(a_dict)
        
        print(node_feature_sequence)
        # write to database
        records.update_one({'_id': _id}, {'$set' : { 'node_feature_sequence' : node_feature_sequence}})
    
    def get_relationship(self):
        # get cascades and get nodes
        # return relationship features
        # populate cascade
        '''
        relationship_sequence: [{
        'node_differece':{
        
        },
        'node_mean'{
        
        },
        'cascade_mean'
        }]
        '''
        
# class SequenceClassification:
#     def model_lstm(self):

#     def model_rnn(self):
                
#     def model_cnn(self):

In [71]:
obj = Cascade()

In [72]:
# obj.get_node(1137361298067460097)

In [74]:
obj.populate_node_feature_sequence('611a751f136ce8c98ad4c242')

[{'user_id': 1379138530841034752, 'emotional_frequency': {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.1294642857142857, 'trust': 0.25, 'surprise': 0.0, 'positive': 0.4, 'negative': 0.12708333333333333, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}, 'emotional_mean': {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.1294642857142857, 'trust': 0.25, 'surprise': 0.0, 'positive': 0.4, 'negative': 0.12708333333333333, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}, 'emotional_std': {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.1294642857142857, 'trust': 0.25, 'surprise': 0.0, 'positive': 0.4, 'negative': 0.12708333333333333, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}, 'emotional_q1': {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.1294642857142857, 'trust': 0.25, 'surprise': 0.0, 'positive': 0.4, 'negative': 0.12708333333333333, 'sadness': 0.0, 'disgust': 0.0, 'joy': 0.0}, 'emotional_q2': {'fear': 0.0, 'anger': 0.0, 'anticipation': 0.1294642857142857, 'trust': 0.25, 'surprise': 0.0, 'positive': 0.4,

In [335]:
obj.cascade_analysis()

false count:  8848
false average number of nodes: 7.126695298372513
false max number of nodes 15
*****
True count:  316
true average number of nodes: 3.1835443037974684
true max number of nodes 10


In [26]:
li = [1,2,3,3]
# li = np.array(li)

In [27]:
np.std(li)

0.82915619758885

In [28]:
if not li:
    print('empty')