In [2]:
import pymongo 
import json
import requests
import pandas as pd
import numpy as np
from pymongo import MongoClient
from bson.objectid import ObjectId


# pprint library is used to make the output look more pretty
from pprint import pprint

In [3]:
link = 'http://api.nobelprize.org/v1/prize.json'

In [4]:
# connect to API
response = requests.get(link)

In [5]:
print(response.status_code) # 200 means all good

200


In [6]:
print(response.json()) # Checking JSON resposnse

{'prizes': [{'year': '2019', 'category': 'chemistry', 'laureates': [{'id': '976', 'firstname': 'John', 'surname': 'Goodenough', 'motivation': '"for the development of lithium-ion batteries"', 'share': '3'}, {'id': '977', 'firstname': 'M. Stanley', 'surname': 'Whittingham', 'motivation': '"for the development of lithium-ion batteries"', 'share': '3'}, {'id': '978', 'firstname': 'Akira', 'surname': 'Yoshino', 'motivation': '"for the development of lithium-ion batteries"', 'share': '3'}]}, {'year': '2019', 'category': 'economics', 'laureates': [{'id': '982', 'firstname': 'Abhijit', 'surname': 'Banerjee', 'motivation': '"for their experimental approach to alleviating global poverty"', 'share': '3'}, {'id': '983', 'firstname': 'Esther', 'surname': 'Duflo', 'motivation': '"for their experimental approach to alleviating global poverty"', 'share': '3'}, {'id': '984', 'firstname': 'Michael', 'surname': 'Kremer', 'motivation': '"for their experimental approach to alleviating global poverty"', 's

In [7]:
type(response.json())

dict

In [8]:
nobel_dict = response.json()

In [9]:
nobel_dict.keys()

dict_keys(['prizes'])

In [10]:
# Normalize semi-structured JSON data into a flat table
from pandas.io.json import json_normalize

df_nobel = json_normalize(nobel_dict['prizes']) 

  after removing the cwd from sys.path.


In [11]:
df_nobel.head()

Unnamed: 0,year,category,laureates,overallMotivation
0,2019,chemistry,"[{'id': '976', 'firstname': 'John', 'surname':...",
1,2019,economics,"[{'id': '982', 'firstname': 'Abhijit', 'surnam...",
2,2019,literature,"[{'id': '980', 'firstname': 'Peter', 'surname'...",
3,2019,peace,"[{'id': '981', 'firstname': 'Abiy', 'surname':...",
4,2019,physics,"[{'id': '973', 'firstname': 'James', 'surname'...","""for contributions to our understanding of the..."


In [12]:
### Question 1: How many Nobel prizes were given per category?
df_nobel_groups = df_nobel.groupby('category').count()

In [13]:
df_nobel_groups

Unnamed: 0_level_0,year,laureates,overallMotivation
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chemistry,119,111,11
economics,51,51,0
literature,119,112,7
medicine,119,110,9
peace,119,100,19
physics,119,113,11


In [14]:
### Question 2: Flatten the Data Structure.
df_nobel['laureates'][99]

[{'id': '773',
  'firstname': 'Shirin',
  'surname': 'Ebadi',
  'motivation': '"for her efforts for democracy and human rights. She has focused especially on the struggle for the rights of women and children"',
  'share': '1'}]

In [15]:
type(df_nobel['laureates'][0])

list

In [16]:
df_nobel2 = df_nobel # defining a new one in case I mess with the old one
length = len(df_nobel.index)

laur_df = pd.DataFrame()
for item in range(0, length):
    try:
        new_df = pd.DataFrame(df_nobel2['laureates'][item])
        new_df['year'] = df_nobel2['year'][item]
        new_df['category'] = df_nobel2['category'][item]

        laur_df = pd.concat([laur_df, new_df])
    except ValueError: # Get a ValueError in case of NaNs
        pass

In [17]:
laur_df    

Unnamed: 0,id,firstname,surname,motivation,share,year,category
0,976,John,Goodenough,"""for the development of lithium-ion batteries""",3,2019,chemistry
1,977,M. Stanley,Whittingham,"""for the development of lithium-ion batteries""",3,2019,chemistry
2,978,Akira,Yoshino,"""for the development of lithium-ion batteries""",3,2019,chemistry
0,982,Abhijit,Banerjee,"""for their experimental approach to alleviatin...",3,2019,economics
1,983,Esther,Duflo,"""for their experimental approach to alleviatin...",3,2019,economics
...,...,...,...,...,...,...,...
0,569,Sully,Prudhomme,"""in special recognition of his poetic composit...",1,1901,literature
0,462,Henry,Dunant,"""for his humanitarian efforts to help wounded ...",2,1901,peace
1,463,Frédéric,Passy,"""for his lifelong work for international peace...",2,1901,peace
0,1,Wilhelm Conrad,Röntgen,"""in recognition of the extraordinary services ...",1,1901,physics


In [18]:
### Question 3: How many Nobel prizes were given to people called ‘Michael’?
len(laur_df[laur_df['firstname'] == 'Michael'])

4

In [19]:
### Question 4: What is the smallest relative share of a Nobel prize ever given?
laur_df.groupby(['year', 'category'])['category'].count()

year  category  
1901  chemistry     1
      literature    1
      medicine      1
      peace         2
      physics       1
                   ..
2019  economics     3
      literature    1
      medicine      3
      peace         1
      physics       3
Name: category, Length: 597, dtype: int64

In [20]:
laur_df.groupby(['year', 'category'])['category'].count().max()
# Answer: 1/3

3

In [42]:
### Question 5: Which laureates were awarded multiple prizes?
#mult = laur_df.groupby(['id', 'firstname', 'surname'])['id'].count() 
#print(mult[mult>1])

laur_df.groupby('id').agg({'id': 'count', 'firstname': 'first', 'surname': 'first'}).reset_index(drop = True).sort_values(by = 'id', ascending = False)

Unnamed: 0,id,firstname,surname
421,3,International Committee of the Red Cross,
457,2,Office of the United Nations High Commissioner...,
135,2,Frederick,Sanger
542,2,Marie,Curie
602,2,John,Bardeen
...,...,...,...
319,1,Erwin,Schrödinger
320,1,Marshall W.,Nirenberg
321,1,Max,Delbrück
322,1,Alfred D.,Hershey


In [22]:
### Question 6: Which laureates were awarded prizes in multiple categories?
mult = laur_df.groupby(['id', 'firstname', 'surname', 'category'])['id'].count() 
print(mult[mult>1])
# Curie and Pauling

id   firstname  surname  category 
222  Frederick  Sanger   chemistry    2
66   John       Bardeen  physics      2
Name: id, dtype: int64


### 2. Twitter API Exercise  ###

In [23]:
import twython
from twython import Twython, TwythonError

In [24]:
# create a json file 'credentials.json' with a key
credentials = '''{
"consumer_key" : "49Q3KrSmilOzf8lYpFTPrvgGw",
"consumer_secret" : "ap5IMDKWZ5j55uaHfjL0hPsgyrzAVgCPhRnuhdplCEJu6OTVlU",
"access_token" : "34498057-YWPCQv0pgDiVOc2L2B6Wnwu3XH2IpuaZcljiSunOx", 
"access_token_secret": "KQwyJOFmdi6MpNFgnkpFnY6zgyObuE3c29qxQAcNPlLKB"
}
'''

In [25]:
#FIRST load from strings, deserialize, get python dict
dict_from_string = json.loads(credentials)
dict_from_string

{'consumer_key': '49Q3KrSmilOzf8lYpFTPrvgGw',
 'consumer_secret': 'ap5IMDKWZ5j55uaHfjL0hPsgyrzAVgCPhRnuhdplCEJu6OTVlU',
 'access_token': '34498057-YWPCQv0pgDiVOc2L2B6Wnwu3XH2IpuaZcljiSunOx',
 'access_token_secret': 'KQwyJOFmdi6MpNFgnkpFnY6zgyObuE3c29qxQAcNPlLKB'}

In [26]:
#save as json file (dump) - will be real json
with open("credentials.json", "w") as output:
    json.dump(dict_from_string, output)

In [27]:
with open("credentials.json") as infile:
    credentials = json.load(infile)

twitter_client = Twython(credentials["consumer_key"],
                         credentials["consumer_secret"],
                         credentials["access_token"],
                         credentials["access_token_secret"])

In [28]:
result = twitter_client.get_followers_ids(screen_name="_Propulsion", count=20)
print(result)

{'ids': [750701412779188224, 2188063405, 133615537, 1017072143371259904, 1156719981511548928, 1205835843216138240, 97264922, 1431513295, 2911167775, 1179075958776717312, 30476148, 3099230117, 247361789, 325468508, 903640970553643009, 552949393, 85383218, 1194897535145512960, 784067364274995200, 857072172], 'next_cursor': 1649675268104158550, 'next_cursor_str': '1649675268104158550', 'previous_cursor': 0, 'previous_cursor_str': '0', 'total_count': None}


In [29]:
result = twitter_client.show_user(screen_name='WSJ')

In [30]:
result

{'id': 3108351,
 'id_str': '3108351',
 'name': 'The Wall Street Journal',
 'screen_name': 'WSJ',
 'location': 'New York, NY',
 'profile_location': None,
 'description': 'Breaking news and features from https://t.co/GhhR6PLfem.',
 'url': 'https://t.co/GhhR6PLfem',
 'entities': {'url': {'urls': [{'url': 'https://t.co/GhhR6PLfem',
     'expanded_url': 'http://wsj.com',
     'display_url': 'wsj.com',
     'indices': [0, 23]}]},
  'description': {'urls': [{'url': 'https://t.co/GhhR6PLfem',
     'expanded_url': 'http://wsj.com',
     'display_url': 'wsj.com',
     'indices': [32, 55]}]}},
 'protected': False,
 'followers_count': 17249745,
 'friends_count': 1123,
 'listed_count': 114582,
 'created_at': 'Sun Apr 01 06:22:13 +0000 2007',
 'favourites_count': 1197,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': True,
 'statuses_count': 298957,
 'lang': None,
 'status': {'created_at': 'Tue Feb 11 08:00:11 +0000 2020',
  'id': 1227140259114299393,
  'id_str': '12271402

In [31]:
### Question 1: Frequency of tweets (You’re free to choose a daily/weekly/monthly timeframe accordingly 
### to the case)


# function for loading extended timeline
def load_ext_user_timeline(twitter_client, screen_name):
    tweets = []
    try:
        user_timeline = twitter_client.get_user_timeline(screen_name=screen_name,count=200, tweet_mode='extended')
    except TwythonError as e:
        print(e)
    for tweet in user_timeline:
        # Add whatever you want from the tweet, here we just add the text
        tweets.append(tweet)
    # Count could be less than 200, see:
    # https://dev.twitter.com/discussions/7513
    while len(user_timeline) != 0: 
        try:
            user_timeline = twitter_client.get_user_timeline(
                screen_name=screen_name,count=200,max_id=user_timeline[len(user_timeline)-1]['id']-1, tweet_mode='extended')
        except TwythonError as e:
            print(e)
        for tweet in user_timeline:
            # Add whatever you want from the tweet, here we just add the text
            tweets.append(tweet)
    # Number of tweets the user has made
    print(len(tweets))
    return tweets

In [32]:
search_results = load_ext_user_timeline(twitter_client, screen_name='WSJ')
search_results

3229


[{'created_at': 'Tue Feb 11 08:00:11 +0000 2020',
  'id': 1227140259114299393,
  'id_str': '1227140259114299393',
  'full_text': 'The relationship between Carla DiBello, a former reality-TV producer, and the head of Saudi Arabia’s sovereign-wealth fund, has set off alarms in the kingdom, officials at the fund say https://t.co/2umJkKJfdH',
  'truncated': False,
  'display_text_range': [0, 208],
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [],
   'urls': [{'url': 'https://t.co/2umJkKJfdH',
     'expanded_url': 'https://on.wsj.com/2UFukUx',
     'display_url': 'on.wsj.com/2UFukUx',
     'indices': [185, 208]}]},
  'source': '<a href="http://www.socialflow.com" rel="nofollow">SocialFlow</a>',
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 3108351,
   'id_str': '3108351',
   'name': 'The Wall Street Journal',
   'screen_name': '

In [33]:
### Question 2: Top 5 most used hashtags
search_results[0]

{'created_at': 'Tue Feb 11 08:00:11 +0000 2020',
 'id': 1227140259114299393,
 'id_str': '1227140259114299393',
 'full_text': 'The relationship between Carla DiBello, a former reality-TV producer, and the head of Saudi Arabia’s sovereign-wealth fund, has set off alarms in the kingdom, officials at the fund say https://t.co/2umJkKJfdH',
 'truncated': False,
 'display_text_range': [0, 208],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/2umJkKJfdH',
    'expanded_url': 'https://on.wsj.com/2UFukUx',
    'display_url': 'on.wsj.com/2UFukUx',
    'indices': [185, 208]}]},
 'source': '<a href="http://www.socialflow.com" rel="nofollow">SocialFlow</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 3108351,
  'id_str': '3108351',
  'name': 'The Wall Street Journal',
  'screen_name': 'WSJ',
  'location': 'Ne

In [34]:
search_results[0]['entities'].keys()
# The key we are looking for is hashtags

dict_keys(['hashtags', 'symbols', 'user_mentions', 'urls'])

In [35]:
length = len(search_results)
list_hash = [None] * length
for tweet in range(0, length):
        if len(search_results[tweet]['entities']['hashtags']) != 0: # if list item not empty
            list_hash[tweet] = search_results[tweet]['entities']['hashtags'] # save it in list_hash
        pass

In [36]:
# Removing none values
list_hash = list(filter(None, list_hash)) 
list_hash[0] # We are interested in the "text" item of the dictionary

[{'text': 'WSJWhatsNow', 'indices': [127, 139]}]

In [55]:
type(list_hash[190][0]['text']) # which is a string

str

In [45]:
length = len(list_hash)
hashtags = [None] * length # empty list for hashtags
for i in range(0, length):
    hashtags[i] = list_hash[i][0]['text']

In [46]:
hashtags = pd.DataFrame(hashtags)
hashtags[0].value_counts().head()

WSJWhatsNow    205
DemDebate       18
Oscars          13
Grammys          8
SOTU             7
Name: 0, dtype: int64

In [47]:
### Excercise 3: Choose a relevant keyword and report how many times it has been used in the tweets 
#   that you scraped ( Ex: for Trump’s twitter you can search for “Wall” or “elections”)

In [48]:
word = "Trump"

In [49]:
search_results[0]["full_text"]

'The relationship between Carla DiBello, a former reality-TV producer, and the head of Saudi Arabia’s sovereign-wealth fund, has set off alarms in the kingdom, officials at the fund say https://t.co/2umJkKJfdH'

In [50]:
counter = 0
for i in range(0, length):
    if search_results[i]["full_text"].find(word) > -1:
        counter = counter + 1
counter

18

In [51]:
hashtags

Unnamed: 0,0
0,WSJWhatsNow
1,WSJWhatsNow
2,WSJWhatsNow
3,WSJWhatsNow
4,WSJWhatsNow
...,...
277,OnceUponaTimeinHollywood
278,goldenglobes2020
279,MeToo
280,WSJWhatsNow
