In [1]:
import json
import pickle
import numpy as np
import gzip
import math
import csv
import datetime
import copy
import dateutil.parser
import random
%matplotlib inline
import matplotlib.pyplot as plt
import pylab as pl
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, ttest_ind, mannwhitneyu, kruskal, ks_2samp
from statsmodels.distributions.empirical_distribution import ECDF
import location
from utils import Utils
from tweet import Tweet
from video import Video

## Run configuration used by the cells below.
## Campaign to analyze: [abo | gun | blm]
campaign = 'blm'

# presumably the bin width in days (the volume plot below is labelled "Weeks") — TODO confirm
bin_size = 7
connection_type = 'followers'
year = 2018

# Utils receives the whole configuration; Tweet and Video are both built from it.
util = Utils(campaign, bin_size, connection_type, year)

tweet = Tweet(util)
video = Video(util)


  **kwargs


## Create available early adopters, their tweets, locations, political leanings.

In [None]:
# Tweets of the available early adopters.
# presumably 0.2 is the early-adopter threshold (the same value is later
# passed to findEarlyAdopters) — TODO confirm
ea_tweets = tweet.getAvailableTweets(0.2)

# The cells further down refer to this collection as `available_tweets`, a
# name that no visible cell assigns, so they raise NameError on a fresh
# kernel. Bind it here so the notebook survives Restart & Run All.
# NOTE(review): assumes both names denote the result of getAvailableTweets
# (the commented-out helper calls in the next cell pass `ea_tweets` to the
# same methods that later receive `available_tweets`) — confirm.
available_tweets = ea_tweets

print('#tweets by available early adopters: {}'.format(len(ea_tweets)))

ea_tweets_counts = tweet.getTweetVolumeDistribution(ea_tweets)
util.plotLineChart(ea_tweets_counts, "# of tweets", "Weeks")

# Sort the binned tweet counts by date.
ea_tweets_counts_sorted_ind, dates_sorted = util.sortVolumeByDate(ea_tweets_counts)


In [None]:
## Methods for tweet-video-user analysis

# Analysis of videos by tweet volume within a specific date
# tweet.sortVideosByTweetVolumeForSpecificInterval(ea_tweets, '2018-04-15', '2018-04-22')

# Print tweets that refer to specific video within a specific week
# tweet.printTweetsByVideoId(ea_tweets, '2017-01-01', '2018-05-01', 'Rw9E_nGVXxA')

# Analyze tweets of most-tweeted user.
# tweet.analyzeTweetsOfMostTweetedUsers(ea_tweets)

In [None]:
## Find early adopters' locations

# tweet.assignUserLocations(ea_tweets)

In [None]:
## Find political leanings of seed early adopters.

# First find co-occurrences of hashtags (potential ones for expansion) with seed political hashtags.
# hashtags, ht_occur_vec, cooccur_mat = tweet.findHashtagCooccurrences()

# Then, manually analyze these hashtags, and decide which hashtags to select for political leaning expansion.
# Create 'left_political_hashtags_extended.txt' and 'right_political_hashtags_extended.txt' manually.

# Based on extended political hashtags, assign political labels to seed early adopters.
# leaning_labels = tweet.assignUserLeaningLabels()


In [None]:


#tweet.summarizeUsersTweetsInfo(available_tweets)
#tweet.summarizeFollowers()


## community summarization
#tweet.checkConnectionChanges()
#tweet.summarizeCommunities()
## (Optional after community detection) check political leanings for each community.
# predefined_community_leanings, inferred_community_leanings = tweet.checkLeaningsInCommunities()


In [None]:
## Assign User political leanings




In [None]:
# Entries returned by sortVideosByTweetVolumeForSpecificInterval for the
# 2017-01-01 .. 2018-05-01 interval.
videos_by_volume = tweet.sortVideosByTweetVolumeForSpecificInterval(available_tweets, '2017-01-01', '2018-05-01')
# The first element of each entry is what the rest of the notebook uses as the video id.
video_ids = [entry[0] for entry in videos_by_volume]
print(len(video_ids))
#pickle.dump(video_ids, open('{}_video_ids_from_available_tweets.pkl'.format(util.campaign), 'wb'))


In [None]:
## Assign Video leanings
filtered_videos = tweet.getVideoTweetsUsers(available_tweets)

# NOTE: rcParams is process-wide, so this DPI also applies to every later figure.
plt.rcParams['figure.dpi'] = 400


def plot_log10_ecdf(counts, xlabel):
    """Plot the empirical CDF of log10(counts) as a dashed line and show it.

    counts: 1-D array of per-video counts. Assumes every count is >= 1;
            a zero would become -inf under log10 — TODO confirm upstream.
    xlabel: label for the x axis.
    Returns the fitted ECDF object.
    """
    fitted = ECDF(np.log10(counts))
    plt.plot(fitted.x, fitted.y, linestyle="--", linewidth=1.5)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel("CDF", fontsize=12)
    plt.show()
    return fitted


# Draw #available_users per video
users_per_video = np.array(sorted(len(filtered_videos[vid]['uids']) for vid in filtered_videos))
ecdf = plot_log10_ecdf(users_per_video, "#users (log10)")

# Draw #available_tweets per video
tweets_per_video = np.array(sorted(len(filtered_videos[vid]['tids']) for vid in filtered_videos))
ecdf = plot_log10_ecdf(tweets_per_video, "#tweets (log10)")


video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)

video_ids = list(video_leanings_probs.keys())
print(len(video_ids))

# Per video: right-leaning probability and number of users that shared it.
popular_videos_probs_right = [video_leanings_probs[vid]['right'] for vid in video_ids]
popular_videos_probs_size = [len(filtered_videos[vid]['uids']) for vid in video_ids]

# Draw leaning probability (right) and popularity (#users sharing the video)
plt.scatter(popular_videos_probs_right, popular_videos_probs_size, c=popular_videos_probs_right, cmap="bwr")
plt.xlabel("Right leaning prob.", fontsize=12)
plt.ylabel("#users promoting the video", fontsize=12)
#plt.legend(prop={'size': 10})
plt.show()

# Draw probability distribution (right) of video leanings
N, bins, patches = pl.hist(popular_videos_probs_right, 20)
print('len_patches:', len(patches))
# One 'bwr' colour per histogram bar, in bar order.
leaning_cmap = pl.get_cmap('bwr', len(patches))
for i, patch in enumerate(patches):
    patch.set_facecolor(leaning_cmap(i))
pl.xlabel('Right leaning prob.', fontsize=12)
pl.ylabel('#videos', fontsize=12)
# Explicit show, like the plots above; otherwise the Text repr of the last
# call is echoed as the cell output.
plt.show()


## CASCADE OPERATIONS ##

In [None]:
## Create sub_followings_dict once to make the cascade computation cost efficient.
#tweet.createSubFollowingsDictForCascade()

## Calculate cascade for each video in filtered video set.
#cascades = tweet.getAllTweetCascades(available_tweets)

## Analyze cascades
# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
tweet.analyzeCascadeMeasures(video_leanings_probs)


In [None]:
## Create the min cascade tree for a specific video to plot it.
# Candidate filter: inclusive windows on #users and on leaning probability.
min_user_size = 50
max_user_size = 70
min_leaning_prob = 0.7
max_leaning_prob = 0.9

filtered_videos = tweet.getVideoTweetsUsers(available_tweets)
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
video_ids = list(video_leanings_probs.keys())

# List every video inside both windows; the left window is checked first.
for vid in filtered_videos:
    num_users = len(filtered_videos[vid]['uids'])
    if not (min_user_size <= num_users <= max_user_size):
        continue
    probs = video_leanings_probs[vid]
    if min_leaning_prob <= probs['left'] <= max_leaning_prob:
        print('Left:', vid, num_users, probs['left'])
    elif min_leaning_prob <= probs['right'] <= max_leaning_prob:
        print('Right:', vid, num_users, probs['right'])


# Hard-coded candidates; the trailing notes appear to record
# (#users, leaning prob.) for each id — TODO confirm.
# Alternative left candidate: 'ewLkgfpH6fk' (68 0.785)
left_candidate_vid = 'IYGdeiy0jEw' #(64 0.755)
right_candidate_vid = '-R6w07kSjjE' #(65 0.763)


_, cascade_left, users_immediate_neighbors_left = tweet.getTweetCascadeByVideoId(available_tweets, left_candidate_vid)
_, cascade_right, users_immediate_neighbors_right = tweet.getTweetCascadeByVideoId(available_tweets, right_candidate_vid)

# Report the size only; printing the whole cascade dict floods the cell output.
print('#users in left cascade:', len(cascade_left))


def summarize_cascade_min(cascade):
    """Return (mean, median, max) of the 'min' value stored per user in `cascade`."""
    min_values = [cascade[uid]['min'] for uid in cascade]
    return np.mean(min_values), np.median(min_values), np.amax(min_values)


print('left cascade for {}:'.format(left_candidate_vid), *summarize_cascade_min(cascade_left))
print('right cascade for {}:'.format(right_candidate_vid), *summarize_cascade_min(cascade_right))




In [None]:
# Look up early adopters for the filtered video ids.
# presumably 0.2 is the same threshold that was given to getAvailableTweets — TODO confirm
filtered_video_ids = tweet.getFilteredVideoIds()
a = tweet.findEarlyAdopters(0.2, available_tweets, filtered_video_ids)
# Keys of the findEarlyAdopters result, as a list.
ids = list(a.keys())

## ENGAGEMENT MEASURE OPERATIONS ##

In [None]:
# Calculate engagement measures for each video in filtered video set.
#engagement_measures = tweet.getAllEngagementMeasures(available_tweets)


## Analyze engagement measures for each video in filtered video set.
# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
# Name of the engagement measure handed to analyzeEngagementMeasures.
measure_type = 'num_replies'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
tweet.analyzeEngagementMeasures(measure_type, video_leanings_probs)

## STRUCTURAL MEASURE OPERATIONS ##

In [None]:
# Calculate structural measures for each video in filtered video set.
# nw_type = ['potential' | 'temporal']
nw_type = 'potential'
# Later cells (network-size split, zero-gini listing) index
# `structural_measures` directly, so it has to be bound here for a
# fresh-kernel run to succeed.
# NOTE(review): this call may be expensive — if so, cache its result on disk
# and load it here instead of commenting the assignment out.
structural_measures = tweet.getAllNetworkStructureMeasures(available_tweets, nw_type)


## Analyze structural measures for each video in filtered video set.
#video_political_leaning_thr = 0.6
# Name of the structural measure handed to analyzeStructuralMeasures.
measure_type = 'nw_size'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
tweet.analyzeStructuralMeasures(measure_type, nw_type, video_leanings_probs)


In [None]:
#### Analyze cascades w.r.t. network size
video_political_leaning_thr = 0.6
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)

# `structural_measures` is presumably the result of
# tweet.getAllNetworkStructureMeasures(...) from the structural-measures cell.
nw_sizes = [structural_measures[vid]['nw_size'] for vid in structural_measures]
median_nw_size = np.median(nw_sizes)
max_nw_size = np.amax(nw_sizes)
print(median_nw_size)
print(max_nw_size)

# Split the videos at the median network size: <= median is "small", the rest "large".
video_leanings_probs_for_small_networks = {}
video_leanings_probs_for_large_networks = {}
for vid in structural_measures:
    is_small = structural_measures[vid]['nw_size'] <= median_nw_size
    bucket = video_leanings_probs_for_small_networks if is_small else video_leanings_probs_for_large_networks
    bucket[vid] = copy.deepcopy(video_leanings_probs[vid])

# Only the large-network half is analyzed in this cell.
tweet.analyzeCascadeMeasures(video_leanings_probs_for_large_networks)


In [None]:
#### Analyze the videos with 0 gini (indegree centrality)
video_political_leaning_thr = 0.6
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)

# Lower edge of the neutral band; the upper edge is the threshold itself.
neutral_floor = 1 - video_political_leaning_thr


def has_zero_gini(vid):
    """True when the in-degree-centrality Gini stored for `vid` equals 0."""
    return structural_measures[vid]['nw_in_degree_centrality_gini'] == 0


def with_mean_centrality(vids):
    """Pair each video id with its stored mean in-degree centrality."""
    return [(vid, structural_measures[vid]['nw_in_degree_centrality_mean']) for vid in vids]


# The leaning test comes first in each filter so that `structural_measures`
# is only consulted for videos that already passed it.
left_videos = [
    vid for vid in video_leanings_probs
    if video_leanings_probs[vid]['left'] > video_political_leaning_thr and has_zero_gini(vid)
]
right_videos = [
    vid for vid in video_leanings_probs
    if video_leanings_probs[vid]['right'] > video_political_leaning_thr and has_zero_gini(vid)
]
neutral_videos = [
    vid for vid in video_leanings_probs
    if neutral_floor <= video_leanings_probs[vid]['right'] <= video_political_leaning_thr
    and has_zero_gini(vid)
]

print('left videos:', with_mean_centrality(left_videos))
print('right videos:', with_mean_centrality(right_videos))
print('neutral videos:', with_mean_centrality(neutral_videos))

## TEMPORAL MEASURE OPERATIONS ##

In [None]:
# Calculate temporal measures for each video in filtered video set.
#temporal_measures = tweet.getAllTemporalMeasures(available_tweets)

#nw_temporal_diff_wrt_first_tweet_mean, nw_temporal_diff_wrt_first_tweet_median, nw_temporal_diff_between_pairs_mean, nw_diff_speed_mnw_temporal_diff_between_pairs_median, nw_life_time, nw_temporal_diff_between_first_tweets_of_source_users_mean, nw_temporal_diff_between_first_tweets_of_source_users_median, nw_temporal_diff_between_max_indegree_user = tweet.getTemporalMeasuresByVideoId(available_tweets, 'Iqc4BrAzDio')
#print(nw_temporal_diff_wrt_first_tweet_mean, nw_temporal_diff_wrt_first_tweet_median, nw_temporal_diff_between_pairs_mean, nw_diff_speed_mnw_temporal_diff_between_pairs_median, nw_life_time, nw_temporal_diff_between_first_tweets_of_source_users_mean, nw_temporal_diff_between_first_tweets_of_source_users_median, nw_temporal_diff_between_max_indegree_user)
#'lpskBk0fp4o'
#'Iqc4BrAzDio'


## Analyze temporal measures for each video in filtered video set.
# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
# Name of the temporal measure handed to analyzeTemporalMeasures.
measure_type = 'nw_temporal_diff_between_max_indegree_user'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
tweet.analyzeTemporalMeasures(measure_type, video_leanings_probs)

## GEOGRAPHICAL MEASURE OPERATIONS ##

In [None]:
## Calculate geographical measures for each video in filtered video set.
#geo_measures = tweet.getAllNetworkGeographicalMeasures(available_tweets)

## Analyze geographical measures for each video in filtered video set.
# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
measure_type = 'nw_locs'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
left_locs, right_locs, neutral_locs = tweet.analyzeGeographicalMeasures(measure_type, video_leanings_probs)


# NOTE(review): these imports belong in the notebook's import cell.
from colour import Color
import plotly.graph_objects as go

# Blue -> red gradient with `num_colors` evenly spaced stops, stored as
# [[position, colour], ...] pairs.
# NOTE: the figure below uses the built-in "Reds" scale; this custom scale is
# built but not passed to it.
num_colors = 21
col_intervals = np.arange(0., 1.001, 1.0/(num_colors-1))
gradient_start = Color("blue")
colors = list(gradient_start.range_to(Color("red"), num_colors))
colorscale = [[col_intervals[i], str(colors[i])] for i in range(num_colors)]

state_list = list(location.getStates().keys())
# Build the lookup once instead of re-creating dict(neutral_locs) for every state.
# A state missing from neutral_locs raises KeyError rather than being plotted as 0.
neutral_by_state = dict(neutral_locs)

fig = go.Figure(data=go.Choropleth(
    locations=state_list, # Spatial coordinates
    z = [neutral_by_state[loc] for loc in state_list], # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    #colorscale = "rdbu",
    colorscale = "Reds",
    #reversescale=True,
    #colorscale='blues',
    #colorbar_title = "#users",
    colorbar_title = "Mean user participation (%)",
    #zmin=0.,
    #zmax=1.
))

fig.update_layout(
    title_text = 'User geo-participation for Neutral Videos',
    #title_text = 'Right-wing communities (Gun Control)',
    geo_scope='usa', # limit map scope to USA
    #geo_scope='north america',
)

fig.show()

#fig.write_image("abo_right_locs.png")
#pio.write_image(fig, 'images/fig1.png')




## LANGUAGE MEASURE OPERATIONS ##

In [None]:
# Calculate language measures for each video in filtered video set.
#language_liwc_measures, language_empath_measures = tweet.getAllLanguageMeasures(available_tweets)


## Analyze language measures for each video in filtered video set.
# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
# Dictionary and category handed to analyzeLanguageMeasures.
dict_type = 'empath'
measure_type = 'heroic'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
tweet.analyzeLanguageMeasures(dict_type, measure_type, video_leanings_probs)


## Analyze tweets by language category for one leaning.
# This second analysis re-binds dict_type / measure_type, so after the cell
# runs they hold the 'liwc' / 'negate' values, not the ones set above.
video_political_leaning_thr = 0.6
leaning = 'right'
dict_type = 'liwc'
measure_type = 'negate'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
tweet.analyzeTweetsByLanguageCategory(dict_type, measure_type, video_leanings_probs, available_tweets, leaning)


In [None]:
## VIDEO MEASURE ANALYSIS ##

# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
# Video property handed to analyzeVideosByProperty.
measure_type = 'duration'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
video_measures = tweet.analyzeVideosByProperty(video_leanings_probs, measure_type)


In [None]:
## VIDEO MEASURE ANALYSIS (NORMALIZED) ##

# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
# Video property handed to analyzeVideosByNormalizedProperty.
measure_type = 'commentCount'
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
# Re-binds `video_measures`, replacing whatever an earlier cell stored under that name.
video_measures = tweet.analyzeVideosByNormalizedProperty(video_leanings_probs, measure_type)


In [None]:
#util.createCandidateVideosFile()
#util.createVideosFileForManualAnnotations()
#util.createVideosFileForManualAnnotationsUsingSubtitles()

## VIDEO POLARITY, INTENSITY, DIVISIVENESS and POPULARITY ANALYSIS ##

In [None]:
# Scatter-plot analysis of divisive content, driven by the per-video leaning probabilities.
# NOTE(review): video_political_leaning_thr is assigned but not passed to any call in this cell.
video_political_leaning_thr = 0.6
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
# Re-binds `video_measures`, replacing whatever an earlier cell stored under that name.
video_measures = tweet.analyzeDivisiveContentsByScatterPlots(video_leanings_probs)



In [None]:
# Print the leaning probabilities of every video, one video per line.
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
for vid, probs in video_leanings_probs.items():
    print(vid, probs)

In [None]:
# Assortativity/leaning analysis for a single, hard-coded video id.
tweet.analyzeAssortativityLeaningRelation(available_tweets, 'tKM3UXstwa0')

In [None]:
#util.compareAgendasPopulations('num_tweets', 'gun', 'abo')

In [None]:
# Recompute the per-video leaning probabilities and hand them to separateVideosByLeaning.
video_leanings_probs = tweet.assignVideoLeaningLabels(available_tweets)
tweet.separateVideosByLeaning(video_leanings_probs)