In [1]:
import os
import sys
import time
import datetime
import json
import pandas as pd

import tweepy

sys.path.insert(1, '../')
from src.data import journalists as journos
from src.data.api_tweepy import connect_API
from src.data.api_user_tools import query_user_relationship, batch_request_user_info

In [2]:
api_keys_fp = '../data/twitter_credentials.json'
tw_api = connect_API(api_keys_fp)

In [9]:
trial = tw_api.show_friendship(source_screen_name='kimknilsson', target_screen_name='jlesliedata')

In [3]:
output = query_user_relationship(tw_api, 'ellen_dev', 'bobthephysicist')

{'AfollowsB': True, 'BfollowsA': True}

In [43]:
keyword = 'education'
journo_handles = journos.get_handles_by_keyword(keyword)
print(len(journo_handles))

49


In [44]:
user_trial = batch_request_user_info(tw_api, journo_handles)

In [45]:
len(user_trial)

49

In [49]:
def populate_user_df(user_data):
    '''
    Take a series of tweepy user objects and transform them into a dataframe.
    
    Params
    ------
    user_data : list
        List of tweepy.User objects 
    
    Returns
    -------
    user_df : Pandas DataFrame
        Cleaned dataframe with user data.
    '''
    df_fields = ['user_id', 'screen_name',  'name', 'location', 'user_description', 
              'user_friends_n', 'user_followers_n', 'prof_created_at', 'favourites_count', 
              'verified', 'statuses_count']
    
    API_fields = ['id', 'screen_name', 'name', 'location', 'description', 
                  'friends_count', 'followers_count', 'created_at', 'favourites_count',
                 'verified', 'statuses_count']
    # Go through tweepy user objects and pull relevant data into lists
    full_list = []
    for user in user_data:
        user_vars = vars(user)
        data_list = [user_vars[field] for field in API_fields]
        full_list.append(data_list)
    # Translate data list into dataframe
    user_df = pd.DataFrame(full_list, columns=df_fields)
    # Additional cleaning of data in dataframe
    user_df['screen_name'] = user_df['screen_name'].str.lower()
    
    return user_df

In [50]:
test_df = populate_user_df(user_trial)

In [51]:
test_df.head(10)

Unnamed: 0,user_id,screen_name,name,location,user_description,user_friends_n,user_followers_n,prof_created_at,favourites_count,verified,statuses_count
0,314483971,alicewoolley1,Alice Woolley,"London, Surrey, Oxon, Arizona",Journalist. Editor of Education Guardian. Euro...,976,20847,2011-06-10 10:12:52,26062,True,33836
1,13265,richarda,Richard Adams,United Kingdom,The Guardian's education editor - email me at ...,2577,18926,2006-11-20 15:41:59,9682,True,31175
2,46346940,sarahjewell21,Sarah Jewell,London,Commercial features editor at Guardian News an...,655,354,2009-06-11 08:39:23,14,False,1597
3,360667680,camillahmturner,Camilla Turner,London,Education Editor @Telegraph. Views are my own....,2277,4008,2011-08-23 15:35:56,29,False,2993
4,391350249,suzigodson,suzi godson,"London, England",Sex columnist for The Times newspaper. Co-foun...,4562,5143,2011-10-15 12:49:11,4191,True,4713
5,87180626,rosiedbennett,Rosemary Bennett,London,Education Editor for The Times. Unhealthy obse...,1347,9408,2009-11-03 12:40:13,4929,True,3941
6,330204167,ajack,Andrew Jack,London,Global education editor @FT Free schools acces...,1456,6781,2011-07-06 08:11:24,780,True,6101
7,127830995,kitchenbee,Bee Wilson,,I write about food. And other things. Books in...,1691,26193,2010-03-30 11:01:02,21976,False,23872
8,163242135,tamcohen,Tamara Cohen,Westminster,"Political Correspondent, Sky News. And at the ...",2174,34231,2010-07-05 22:44:21,788,True,10800
9,224592767,branwenjeffreys,branwen jeffreys,LONDON a great world city,"BBC Education Editor - runs slow, reads faster...",2223,27869,2010-12-09 11:20:55,52,False,13027


In [52]:
cyber_df = pd.read_csv('../data/raw/cyber_journalist_friends_2.csv')

In [53]:
cyber_df.head()

Unnamed: 0,screen_name,friend
0,jennystrasburg,RobaHusseini
1,jennystrasburg,HashemOsseiran
2,jennystrasburg,liveanthony
3,jennystrasburg,EliseKapNM
4,jennystrasburg,adam_tooze


In [56]:
friends = list(cyber_df['friend'].unique())

In [57]:
friends_profiles = batch_request_user_info(tw_api, friends)

In [58]:
friends_df = populate_user_df(friends_profiles)

In [59]:
friends_df.head(20)

Unnamed: 0,user_id,screen_name,name,location,user_description,user_friends_n,user_followers_n,prof_created_at,favourites_count,verified,statuses_count
0,363675945,robahusseini,Roba El Husseini ربى,Lebanon,Journalist @AFP covering Syria and Lebanon. Tw...,1244,2567,2011-08-28 14:13:43,649,False,1838
1,2546033492,hashemosseiran,Hashem Osseiran,Beirut,Beirut-based @AFP journalist | Views my own | ...,1678,2003,2014-06-04 12:41:32,493,False,2381
2,179178993,liveanthony,Anthony Galloway,"New York, NY",2x @TeamUSA triathlete | Global Head of Video ...,454,2303,2010-08-16 18:06:32,3920,True,2249
3,1325420790,elisekapnm,Elise Kaplan,"Albuquerque, New Mexico",Bearer of bad news / criminal justice reporter...,741,1566,2013-04-03 21:07:55,3552,False,3074
4,3311286493,adam_tooze,Adam Tooze,"New York, NY","History, economics, politics, climate. Columbi...",4931,95042,2015-08-10 07:35:58,6969,False,36633
5,117543420,bartongellman,Barton Gellman,New York & on the road,Staff writer @TheAtlantic | Author 'Dark Mirro...,850,44841,2010-02-25 21:55:14,73,True,8181
6,25598396,nhannahjones,Ida Bae Wells,nhannahjones@nytimes.com,Reporter @nytmag covering race from 1619-prese...,2128,426025,2009-03-20 23:26:56,35251,True,74436
7,2433239862,southeastrocu,SEROCU,South East,SEROCU is a collaboration between the Police F...,298,1905,2014-04-08 07:51:13,1270,False,1689
8,282288629,euirim,Euirim Choi,"Stanford, CA",Covering tech for @WSJ | Computer Science @Sta...,471,913,2011-04-14 22:52:08,579,False,484
9,216602554,davecolephoto,Dave Cole,"New York, NY",@WSJ Photo Editor. Pennsylvanian.,1691,2104,2010-11-17 06:05:25,12744,False,12442


In [60]:
friends_df.to_csv('../data/cleaned/cyber_user_profiles.csv', index=False)

In [69]:
friends_df['user_friends_n'].sum()

53348775

### Repeat for Education list

In [61]:
edu_df = pd.read_csv('../data/raw/education_journalist_friends.csv')

In [62]:
edu_df.head()

Unnamed: 0,screen_name,friend
0,alicewoolley1,FloraBarton
1,alicewoolley1,KateWilliamsme
2,alicewoolley1,AdviserSuper
3,alicewoolley1,SolGamsu
4,alicewoolley1,sarahcpr


In [67]:
edufriends = list(edu_df['friend'].unique())

In [68]:
edufriends_profiles = batch_request_user_info(tw_api, edufriends)

In [70]:
edufriends_df = populate_user_df(edufriends_profiles)

In [71]:
edufriends_df.head(20)

Unnamed: 0,user_id,screen_name,name,location,user_description,user_friends_n,user_followers_n,prof_created_at,favourites_count,verified,statuses_count
0,379197262,florabarton,Ms Barton,United Kingdom,Mother of two incredible children & Headteache...,1731,6682,2011-09-24 14:54:23,19008,False,10222
1,403075133,katewilliamsme,Prof Kate Williams,London,"history prof. ❤️ in Wales. On TV for @CNN, @BB...",7837,111839,2011-11-02 00:10:38,154608,True,67901
2,954089220200595460,advisersuper,Super Special Adviser,Westminster,A secret super special adviser for Gavin Willi...,1028,1500,2018-01-18 20:32:51,47,False,978
3,1169644826,solgamsu,Sol Gamsu,Newcastle/Durham,"Lecturer @durhamsociology. Mostly researching,...",5485,5080,2013-02-11 18:18:55,25232,False,18464
4,14642495,sarahcpr,Sarah Cooper,"Brooklyn, NY",writer / comedian / #blockedbytrump / wrote 10...,2990,2221914,2008-05-03 23:30:29,139163,True,37518
5,227258135,jason_arthur,Jason Arthur,"London, England",Deputy Chief Exec (Interim) @iwill_campaign | ...,1745,2999,2010-12-16 10:23:55,10439,False,6021
6,18243837,rowennadavis,Rowenna Davis,"Croydon, England","English teacher @harrisinvictus, Labour activi...",832,14580,2008-12-19 15:40:29,1258,False,5115
7,1242887487313391616,ukcovid19stats,UK COVID-19,United Kingdom,#Coronavirus dashboard for the #UnitedKingdom ...,122,54386,2020-03-25 18:54:18,1700,False,4982
8,101326157,mannyawo,Emmanuel Awoyelu,Where legends are born,Educator/SENCo/Governor - Director at @thereac...,1050,4739,2010-01-02 22:11:46,25742,False,77797
9,1123199104815964160,ybtn_uk,#BlackLivesMatter,,Connecting young black teachers in England thr...,693,1531,2019-04-30 12:15:09,464,False,3513


In [72]:
edufriends_df.to_csv('../data/cleaned/education_user_profiles.csv', index=False)

In [73]:
edufriends_df['user_friends_n'].sum()

160718652

### Repeat for Politics list

In [75]:
poli_df = pd.read_csv('../data/raw/politic_journalist_friends.csv')

In [76]:
keyword = 'political'
journo_handles = journos.get_handles_by_keyword(keyword)
print(len(journo_handles))

236


In [81]:
poli_df.shape

(875304, 2)

In [80]:
poli_df_subset = poli_df[poli_df['screen_name'].isin(journo_handles)]

In [88]:
poli_df_subset['screen_name'] = poli_df_subset['screen_name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [89]:
poli_df_subset.head()

Unnamed: 0,screen_name,friend
0,shippersunbound,danharris80
1,shippersunbound,NatalieElphicke
2,shippersunbound,KermodeMovie
3,shippersunbound,JoshNorris
4,shippersunbound,timlevell


In [83]:
polifriends = list(poli_df_subset['friend'].unique())

In [84]:
polifriends_profiles = batch_request_user_info(tw_api, polifriends)

In [85]:
polifriends_df = populate_user_df(polifriends_profiles)

In [86]:
polifriends_df.head(20)

Unnamed: 0,user_id,screen_name,name,location,user_description,user_friends_n,user_followers_n,prof_created_at,favourites_count,verified,statuses_count
0,458470994,danharris80,Dan Harris,,"Editor-in-Chief, @Fantasypros/@BettingProsNFL....",767,12864,2012-01-08 16:09:06,1322,False,6263
1,1154535918,natalieelphicke,Natalie Elphicke MP,Dover & Deal,Conservative Member of Parliament for Dover & ...,945,5214,2013-02-06 17:05:48,1293,True,1802
2,111323990,kermodemovie,Mark Kermode,last aisle seat on the left,Author HOW DOES IT FEEL? @ScalaRadio Sat. @Ob...,1130,581297,2010-02-04 14:41:34,14909,True,40754
3,19752348,joshnorris,Josh Norris,,The early bird gets first dibs at the salad ba...,1268,71388,2009-01-30 03:37:38,1062,True,22972
4,19530491,timlevell,Tim Levell,Manchester,Programme Director of @TimesRadio. Formerly @B...,4398,4230,2009-01-26 11:58:14,3960,False,4212
5,26668729,john_hudson,John Hudson,"Washington, D.C.",I cover diplomacy & national security for The ...,3111,62620,2009-03-26 02:39:24,8823,True,21137
6,1325563478,ianblackford_mp,Ian Blackford,Isle of Skye,"Member of parliament for Ross, Skye and Lochab...",383,93396,2013-04-03 22:16:28,7774,True,14649
7,46427981,meggiefoster,Meggie Foster,"London, England",I can't believe I can't believe it's not butte...,873,118936,2009-06-11 16:42:02,68,False,2876
8,1263091525417910273,yngvlgrn,MLC,,here to have a good time,1066,779,2020-05-20 12:57:48,14205,False,4543
9,1261218289797271557,journoworldcup,Journalist World Cup 2020,,"64 Journalists, only one winner.",80,1254,2020-05-15 08:54:13,0,False,72


In [93]:
polifriends_df.to_csv('../data/cleaned/political_user_profiles.csv', index=False)

In [91]:
polifriends_df['user_friends_n'].sum()

323372678

In [90]:
poli_df_subset.to_csv('../data/cleaned/political_journalist_friends.csv', index=False)

In [92]:
len(polifriends)

148408