In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
from collections import Counter
import tweepy

# Get user history IDs

In [12]:
# OAuth: the credentials file is read as a CSV whose first column is the index
# (credential name) and whose 'key' column holds the corresponding value.
keys = pd.read_csv('/usr2/mamille2/twitter/tweepy_oauth.txt', index_col=0)

# Consumer credentials go to the handler; the access token pair is attached after.
auth = tweepy.OAuthHandler(
    *(keys.loc[field, 'key'] for field in ('consumer_key', 'consumer_secret'))
)
auth.set_access_token(
    *(keys.loc[field, 'key'] for field in ('access_token', 'access_secret'))
)

# Build the API client from the authenticated handler
api = tweepy.API(auth)

In [5]:
# Load the pickled test frame, then report its columns and its row count
data = pd.read_pickle('/usr2/mamille2/twitter/data/huang2016_data/huang2016_test.pkl')
print(data.columns, len(data), sep='\n')

Index(['tweet_id', 'created_at', 'in_reply_to_status_id', 'lang',
       'retweet_count', 'user_id', 'user_screen_name', 'user_name', 'text',
       'text_no_tags', 'tags'],
      dtype='object')
26002


In [3]:
# Distinct user ids from the loaded frame, in ascending order
uids = data['user_id'].unique().tolist()
uids.sort()
len(uids)

10455

In [6]:
# Dump the user ids to a text file, one id per line
with open('/usr2/mamille2/twitter/data/huang2016_data/test_uids.txt', 'w') as f:
    f.writelines('{}\n'.format(uid) for uid in uids)

# Check for user histories

In [2]:
# Load the pickled train frame (rebinds `data`), then report its columns and row count
data = pd.read_pickle('/usr2/mamille2/twitter/data/huang2016_data/huang2016_train.pkl')
print(data.columns, len(data), sep='\n')

Index(['tweet_id', 'created_at', 'in_reply_to_status_id', 'lang',
       'retweet_count', 'user_id', 'user_screen_name', 'user_name', 'text',
       'text_no_tags', 'tags'],
      dtype='object')
215118


In [6]:
# Tally the number of rows (tweets) per user id; the displayed value is the
# number of distinct users.
user_tweet_ctr = Counter()
user_tweet_ctr.update(data['user_id'].tolist())
len(user_tweet_ctr)

30430

In [7]:
# Smallest per-user tweet count in `user_tweet_ctr`
min(user_tweet_ctr.values())

1

In [8]:
# Largest per-user tweet count in `user_tweet_ctr`
max(user_tweet_ctr.values())

1185

In [10]:
# Count of counts: for each per-user tweet count k, how many users have exactly k tweets.
counts = Counter(user_tweet_ctr.values())

# Show the 20 smallest k values as (k, number_of_users) pairs
[(k, counts[k]) for k in sorted(counts)][:20]

[(1, 12569),
 (2, 5043),
 (3, 2735),
 (4, 1732),
 (5, 1185),
 (6, 928),
 (7, 655),
 (8, 574),
 (9, 491),
 (10, 420),
 (11, 320),
 (12, 279),
 (13, 268),
 (14, 218),
 (15, 206),
 (16, 170),
 (17, 159),
 (18, 117),
 (19, 135),
 (20, 110)]

# Assemble tweepy-downloaded tweets

In [2]:
def _strip_hashtags(text, hashtag_entities):
    """Return `text` with every hashtag span in `hashtag_entities` removed.

    Each entity is a dict with an 'indices' pair [start, end]. The span is
    half-open: `start` is the position of '#', `end` is the position of the
    first character AFTER the hashtag (e.g. '#coaching', 9 characters, is
    reported as [108, 117]). Only positions start..end-1 are dropped, so the
    character following a hashtag (space, punctuation, newline) is preserved.
    """
    drop = set()
    for entity in hashtag_entities:
        start, end = entity['indices']
        drop.update(range(start, end))
    return ''.join(char for idx, char in enumerate(text) if idx not in drop)


# Column order must match the order of the values collected per tweet below.
_TWEET_COLUMNS = ['tweet_id', 'created_at', 'in_reply_to_status_id', 'lang',
                  'retweet_count', 'user_id', 'user_screen_name', 'user_name',
                  'text', 'text_no_tags', 'tags']

# Assemble one DataFrame per fold from the per-file JSON tweet dumps and pickle it.
# Alternate configuration previously kept as commented-out code: iterate over
# ['train'], read from '.../huang2016_data/tweets/{fold}' and write to
# '.../huang2016_data/huang2016_{fold}.pkl'.
# NOTE: `tweets` and `out` are left bound to the LAST file / LAST fold processed;
# later cells read them.
for fold in ['valid', 'test']:
    data_dirpath = '/usr2/mamille2/twitter/data/huang2016_data/user_histories/{}'.format(fold)

    print(fold)

    file_lens = []   # number of tweets found in each JSON file
    outlines = []    # one row (list of column values) per tweet

    for fname in tqdm(sorted(os.listdir(data_dirpath))):
        # Read the whole file first so the handle is closed before processing
        with open(os.path.join(data_dirpath, fname)) as f:
            tweets = json.load(f)
        file_lens.append(len(tweets))

        for t in tweets:
            # Hashtag texts, plus the tweet text with the hashtag spans cut out
            hashtags = t['entities']['hashtags']
            tags = [el['text'] for el in hashtags]
            short = _strip_hashtags(t['text'], hashtags)

            # Extract info from tweet
            info = [t['id'], t['created_at'], t['in_reply_to_status_id'], t['lang'], t['retweet_count'],
                    t['user']['id'], t['user']['screen_name'], t['user']['name'],
                    t['text'], short, tags]

            outlines.append(info)

    # Average number of tweets per file for this fold
    print(np.mean(file_lens))

    out = pd.DataFrame(outlines, columns=_TWEET_COLUMNS)

    out.to_pickle('/usr2/mamille2/twitter/data/huang2016_data/huang2016_user_histories_{}.pkl'.format(fold))

valid



495.504761905
test



494.99047619


In [3]:
# Rows per user id in `out` (whichever frame the assembly loop built last);
# the displayed value is the smallest such count.
c = Counter()
c.update(out['user_id'].tolist())
min(c.values())

1

In [5]:
# Count of counts over `c`: maps a per-user row count to the number of users with that count
c_c = Counter(c.values())
c_c

Counter({1: 2, 2: 3, 3: 8, 4: 43, 5: 10354})

In [9]:
# Preview the first 12 rows of the assembled frame
out.head(12)

Unnamed: 0,tweet_id,created_at,in_reply_to_status_id,lang,retweet_count,user_id,user_screen_name,user_name,text,text_no_tags,tags
0,928658511311097856,Thu Nov 09 16:20:17 +0000 2017,,en,728,12,jack,jack,We should’ve communicated faster on this (yest...,We should’ve communicated faster on this (yest...,[]
1,928401610916556800,Wed Nov 08 23:19:27 +0000 2017,,en,47,12,jack,jack,RT @fmanjoo: This is interesting https://t.co/...,RT @fmanjoo: This is interesting https://t.co/...,[]
2,928399029272166400,Wed Nov 08 23:09:12 +0000 2017,9.283705e+17,fr,1,12,jack,jack,@rsa @SquareCash Jealous,@rsa @SquareCash Jealous,[]
3,928398944966553605,Wed Nov 08 23:08:51 +0000 2017,,en,119574,12,jack,jack,RT @NASAGoddard: 🌟 . * . 🌙\n ...,RT @NASAGoddard: 🌟 . * . 🌙\n ...,[]
4,928303153010819072,Wed Nov 08 16:48:13 +0000 2017,,en,34,12,jack,jack,Another amazing conversation between @tanehisi...,Another amazing conversation between @tanehisi...,[]
5,905409290214686720,Wed Sep 06 12:36:11 +0000 2017,,en,15268,1005,nosbig,Rob,RT @Newegg: Newegg and @Intel are giving away ...,RT @Newegg: Newegg and @Intel are giving away ...,[XSeries]
6,900953560296243201,Fri Aug 25 05:30:42 +0000 2017,9.002228e+17,en,0,1005,nosbig,Rob,@thetimtracker @YouTube My wife and I would in...,@thetimtracker @YouTube My wife and I would in...,[]
7,900952773239357440,Fri Aug 25 05:27:34 +0000 2017,9.001277e+17,en,0,1005,nosbig,Rob,@TheJennTracker My wife and I wish you both th...,@TheJennTracker My wife and I wish you both th...,[]
8,790716133460611074,Tue Oct 25 00:46:31 +0000 2016,,en,0,1005,nosbig,Rob,@thetimtracker @TheJennTracker How is the bath...,@thetimtracker @TheJennTracker How is the bath...,[]
9,785612663267135488,Mon Oct 10 22:47:09 +0000 2016,,en,0,1005,nosbig,Rob,@davidcaolo Are the back catalog episodes of ...,@davidcaolo Are the back catalog episodes of ...,[]


In [6]:
# Preview the hashtag lists of the first 5 rows
out['tags'].head(5)

0    [coaching, walking]
1     [women, Diversity]
2      [Toronto, Social]
3           [FollowBack]
4          [free, eBook]
Name: tags, dtype: object

In [7]:
# Inspect the first raw tweet dict in `tweets`.
# NOTE(review): `tweets` is only bound as a leftover of the assembly loop (the last
# JSON file it loaded), so this cell depends on kernel state from that cell.
t = tweets[0]
t

{'contributors': None,
 'coordinates': None,
 'created_at': 'Mon May 02 09:08:48 +0000 2016',
 'entities': {'hashtags': [{'indices': [108, 117], 'text': 'coaching'},
   {'indices': [124, 132], 'text': 'walking'}],
  'symbols': [],
  'urls': [],
  'user_mentions': [{'id': 2799567910,
    'id_str': '2799567910',
    'indices': [3, 16],
    'name': 'Liz Walmsley',
    'screen_name': 'walmsley_liz'}]},
 'favorite_count': 0,
 'favorited': False,
 'geo': None,
 'id': 727062271499886592,
 'id_str': '727062271499886592',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': True,
 'lang': 'en',
 'place': None,
 'quoted_status_id': 727047236937498625,
 'quoted_status_id_str': '727047236937498625',
 'retweet_count': 3,
 'retweeted': False,
 'retweeted_status': {'contributors': None,
  'coordinates': None,
  'created_at': 'Mon May 02 08:49:03 +0000 2016',
  'entities