In [21]:
import os
import json
from collections import defaultdict
import requests
from PIL import Image
from io import BytesIO
import progressbar

In [8]:
# import data
directory = os.getcwd()

# Read the tweet dump inside a `with` block so the file handle is closed
# as soon as parsing finishes instead of lingering in the kernel.
with open('./data/covid_tweets_100000.json', 'r') as tweets_file:
    covid_data = json.load(tweets_file)

# Each tweet record carries its author under the 'user' key; collapse the
# authors to a list of distinct values (list order follows set iteration).
covid_users = list({d['user'] for d in covid_data})
print('Extracted {} users.'.format(len(covid_users)))

Extracted 12280 users.


In [9]:
# Peek at the first ten extracted user values.
first_ten_users = covid_users[:10]
print(first_ten_users)

[889861054850891776, 874341910671917057, 744696682265247744, 838878823747813377, 1213814764016566274, 757400969613279232, 813890744738054150, 1237135205963530241, 737356852460060672, 706924533849128960]


In [16]:
# extract the users with covid tweets

def extract_user_features(input_dir, target_user_ids=None):
    """Collect follower profiles for the users of interest.

    Every file in `input_dir` whose name ends in '.json' is read as JSON
    Lines (one user object per line). The college name is the part of the
    file name before '_followers'. Users whose 'protected' field is truthy
    are skipped. A user is stored the first time it is seen if its 'id' is
    among the target ids; once stored, each later occurrence appends that
    file's college to the user's 'colleges_followed' list.

    Args:
        input_dir: Directory containing the follower files.
        target_user_ids: Iterable of user ids to keep. When omitted, the
            notebook-level `covid_users` list is used.

    Returns:
        A tuple `(user_dict, users)` where `user_dict` maps user id to the
        user record (with the added 'colleges_followed' list) and `users`
        is `list(user_dict.values())`.
    """
    if target_user_ids is None:
        target_user_ids = covid_users
    # A set makes each membership test constant-time; scanning a list for
    # every follower record is quadratic in practice.
    wanted_ids = set(target_user_ids)

    user_dict = dict()
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith('.json'):
            continue
        college = file_name.split('_followers')[0]
        # `with` closes each follower file once it has been consumed.
        with open(os.path.join(input_dir, file_name), 'r') as follower_file:
            for line in follower_file:
                # Tolerate blank lines, which json.loads would reject.
                if not line.strip():
                    continue
                user = json.loads(line)
                if user['protected']:
                    continue
                user_id = user['id']
                if user_id in user_dict:
                    user_dict[user_id]['colleges_followed'].append(college)
                elif user_id in wanted_ids:
                    user['colleges_followed'] = [college]
                    user_dict[user_id] = user

        print('<{}> users added. Total: {}'.format(college, len(user_dict)))

    return user_dict, list(user_dict.values())

In [17]:
# Directory holding the per-college follower files. The path below is
# specific to the original machine, so allow overriding it through the
# COLLEGE_FOLLOWERS_DIR environment variable without editing the notebook.
input_dir = os.environ.get('COLLEGE_FOLLOWERS_DIR', '/media/viet/Data/college-followers')

user_dict, users = extract_user_features(input_dir)

<ASU> users added. Total: 215
<AdelphiU> users added. Total: 225
<AmericanU> users added. Total: 304
<AuburnU> users added. Total: 402
<BU_Tweets> users added. Total: 534
<BYU> users added. Total: 605
<BallState> users added. Total: 665
<Baylor> users added. Total: 781
<BelmontUniv> users added. Total: 809
<BethelU> users added. Total: 815
<BostonCollege> users added. Total: 889
<BrandeisU> users added. Total: 904
<BrownUniversity> users added. Total: 1120
<CUBoulder> users added. Total: 1207
<Caltech> users added. Total: 1329
<CarnegieMellon> users added. Total: 1407
<CatholicUniv> users added. Total: 1443
<ChapmanU> users added. Total: 1464
<ChathamU> users added. Total: 1467
<ClarkUniversity> users added. Total: 1484
<ClarksonUniv> users added. Total: 1495
<ClemsonUniv> users added. Total: 1563
<ColoradoStateU> users added. Total: 1620
<Columbia> users added. Total: 1961
<Cornell> users added. Total: 2088
<Creighton> users added. Total: 2107
<DePaulU> users added. Total: 2150
<Drake

In [24]:
# Number of distinct users collected (a dict's length is its key count).
print(len(user_dict))

12280


In [19]:
# tweepy APIs for getting recent profile pictures

import data.twitter_client_1 as twitter_client_1
import data.twitter_client_2 as twitter_client_2
import data.twitter_client_3 as twitter_client_3
import data.twitter_client_4 as twitter_client_4

# Build one API client per credentials module, in module order, and keep
# both the list and the individual api_N names available to later cells.
api_list = [
    client_module.get_twitter_client()
    for client_module in (
        twitter_client_1,
        twitter_client_2,
        twitter_client_3,
        twitter_client_4,
    )
]
api_1, api_2, api_3, api_4 = api_list

# def get_new_url(user_id, api):
#     try:
#         user = api.get_user(user_id = user_id)
#         user = user._json
#         new_profile_url = user['profile_image_url']
#         return new_profile_url
#     except Exception as e:
#         print(user_id)
#         print(e)
#         pass

In [40]:
# get user image from urls

REQUEST_TIMEOUT_SECONDS = 30


def _save_profile_image(user):
    """Download the profile picture of `user` and save it as a PNG file.

    The '_normal' marker is removed from the user's 'profile_image_url'
    before downloading (presumably this selects the full-size picture --
    TODO confirm). The image is written to
    './data/user-images/<user id>.png'.

    Raises whatever `requests` or PIL raise when the download fails, the
    server answers with an HTTP error status, or the payload is not an
    image PIL can open and save.
    """
    url = user['profile_image_url'].replace('_normal', '')
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, allow_redirects=True, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail fast on HTTP errors instead of handing an error page to PIL.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    image.save('./data/user-images/{}.png'.format(user['id']))


users_with_image = []

# Make sure the output directory exists before the first save.
os.makedirs('./data/user-images', exist_ok=True)

with progressbar.ProgressBar(max_value=len(covid_users)) as bar:
    for i, user_id in enumerate(covid_users):
        # extract_user_features drops protected accounts, so an id from
        # covid_users may have no stored profile; skip it rather than crash.
        user = user_dict.get(user_id)
        if user is None:
            print('No profile data for user {}; skipping.'.format(user_id))
        else:
            try:
                _save_profile_image(user)
                users_with_image.append(user)
            except Exception:
                # The stored URL did not yield a usable image. Fetch the
                # current profile through one of the API clients (chosen
                # round-robin) and retry with its image URL.
                try:
                    api = api_list[i % len(api_list)]
                    refreshed = api.get_user(user_id=user['id'])._json
                    refreshed['colleges_followed'] = user['colleges_followed']
                    _save_profile_image(refreshed)
                    users_with_image.append(refreshed)
                except Exception as e:
                    # Best effort: report the failure and move on.
                    print(e)

        # Advance by users processed, not by successful downloads, so the
        # bar and its ETA reflect real progress through the list.
        bar.update(i + 1)

  2% (300 of 12280) |                    | Elapsed Time: 0:01:25 ETA:   1:04:17

[{'code': 50, 'message': 'User not found.'}]


 82% (10085 of 12280) |##############    | Elapsed Time: 0:58:17 ETA:   0:15:49

[{'code': 50, 'message': 'User not found.'}]


 86% (10575 of 12280) |###############   | Elapsed Time: 1:00:57 ETA:   0:12:55

Unsupported BMP compression (1)


 97% (11948 of 12280) |################# | Elapsed Time: 1:08:40 ETA:   0:01:42

[{'code': 50, 'message': 'User not found.'}]


100% (12280 of 12280) |##################| Elapsed Time: 1:10:24 Time:  1:10:24


In [41]:
# sanity check: how many users ended up with a saved profile image
saved_image_count = len(users_with_image)
print(saved_image_count)

12276


In [52]:
# Record where each user's image was saved, then write the users out as
# JSON Lines (one JSON object per line).
with open('./data/covid_users_100000.jsonl', 'w') as out_file:
    for record in users_with_image:
        record['img_path'] = './data/user-images/{}.png'.format(record['id'])
        serialized = json.dumps(record)
        out_file.write(serialized + '\n')