# Instagram Scraping 

# Data Analysis - Part I - Landscape of 50 most followed profiles

In [1]:
import pandas as pd
import numpy as np

In [4]:
most_followed = pd.read_csv('most_followed.csv', delimiter=';')

In [5]:
most_followed.head()

Unnamed: 0,id,account_username,real_name,link_to_profile,number_of_followers,number_of_posts,gender,real_person,profession
0,1,instagram,instagram,https://www.instagram.com/instagram/,329000000,6198,,f,
1,2,cristiano,Cristiano Ronaldo,https://www.instagram.com/cristiano/,198000000,2746,male,t,football player
2,3,arianagrande,Ariana Grande,https://www.instagram.com/arianagrande/,172000000,4480,female,t,singer
3,4,therock,Dwayne Johnson,https://www.instagram.com/therock/,169000000,4744,male,t,actor
4,5,selenagomez,Selena Gomez,https://www.instagram.com/selenagomez/,166000000,1576,female,t,singer


In [6]:
most_followed.tail()

Unnamed: 0,id,account_username,real_name,link_to_profile,number_of_followers,number_of_posts,gender,real_person,profession
45,46,maluma,Maluma,https://www.instagram.com/maluma/,48700000,7470,male,t,singer
46,47,priyankachopra,Priyanka Chopra Jonas,https://www.instagram.com/priyankachopra/,48600000,3036,female,t,actor
47,48,viratkohli,Virat Kohli,https://www.instagram.com/virat.kohli/,48100000,916,male,t,cricketer
48,49,camila_cabello,Camila Cabello,https://www.instagram.com/camila_cabello/,46600000,2309,female,t,singer
49,50,marvel,Marvel Cinematic Universe,https://www.instagram.com/marvel/,45000000,3978,,f,company


## How many profiles where taken into consideration?

In [12]:
most_followed['id'].count()

50

## Which one has the most followers?

In [18]:
most = most_followed.sort_values(by=['number_of_followers'], ascending=False)

In [19]:
most.head()

Unnamed: 0,id,account_username,real_name,link_to_profile,number_of_followers,number_of_posts,gender,real_person,profession
0,1,instagram,instagram,https://www.instagram.com/instagram/,329000000,6198,,f,
1,2,cristiano,Cristiano Ronaldo,https://www.instagram.com/cristiano/,198000000,2746,male,t,football player
2,3,arianagrande,Ariana Grande,https://www.instagram.com/arianagrande/,172000000,4480,female,t,singer
3,4,therock,Dwayne Johnson,https://www.instagram.com/therock/,169000000,4744,male,t,actor
4,5,selenagomez,Selena Gomez,https://www.instagram.com/selenagomez/,166000000,1576,female,t,singer


In [20]:
# for readable numbers

In [21]:
most_followed['followers_in_millions'] = most_followed['number_of_followers'] / 1000000

In [22]:
most_followed.head()

Unnamed: 0,id,account_username,real_name,link_to_profile,number_of_followers,number_of_posts,gender,real_person,profession,followers_in_millions
0,1,instagram,instagram,https://www.instagram.com/instagram/,329000000,6198,,f,,329.0
1,2,cristiano,Cristiano Ronaldo,https://www.instagram.com/cristiano/,198000000,2746,male,t,football player,198.0
2,3,arianagrande,Ariana Grande,https://www.instagram.com/arianagrande/,172000000,4480,female,t,singer,172.0
3,4,therock,Dwayne Johnson,https://www.instagram.com/therock/,169000000,4744,male,t,actor,169.0
4,5,selenagomez,Selena Gomez,https://www.instagram.com/selenagomez/,166000000,1576,female,t,singer,166.0


In [23]:
most_readable = most_followed.sort_values(by=['followers_in_millions'], ascending=False)

In [24]:
most_readable.head()

Unnamed: 0,id,account_username,real_name,link_to_profile,number_of_followers,number_of_posts,gender,real_person,profession,followers_in_millions
0,1,instagram,instagram,https://www.instagram.com/instagram/,329000000,6198,,f,,329.0
1,2,cristiano,Cristiano Ronaldo,https://www.instagram.com/cristiano/,198000000,2746,male,t,football player,198.0
2,3,arianagrande,Ariana Grande,https://www.instagram.com/arianagrande/,172000000,4480,female,t,singer,172.0
3,4,therock,Dwayne Johnson,https://www.instagram.com/therock/,169000000,4744,male,t,actor,169.0
4,5,selenagomez,Selena Gomez,https://www.instagram.com/selenagomez/,166000000,1576,female,t,singer,166.0


In [25]:
most_readable.tail()

Unnamed: 0,id,account_username,real_name,link_to_profile,number_of_followers,number_of_posts,gender,real_person,profession,followers_in_millions
45,46,maluma,Maluma,https://www.instagram.com/maluma/,48700000,7470,male,t,singer,48.7
46,47,priyankachopra,Priyanka Chopra Jonas,https://www.instagram.com/priyankachopra/,48600000,3036,female,t,actor,48.6
47,48,viratkohli,Virat Kohli,https://www.instagram.com/virat.kohli/,48100000,916,male,t,cricketer,48.1
48,49,camila_cabello,Camila Cabello,https://www.instagram.com/camila_cabello/,46600000,2309,female,t,singer,46.6
49,50,marvel,Marvel Cinematic Universe,https://www.instagram.com/marvel/,45000000,3978,,f,company,45.0


## Sum of all followers together

In [28]:
most_followed['number_of_followers'].sum()

4715700000

In [29]:
most_readable['followers_in_millions'].sum()

4715.7

In [30]:
# in billions:

In [60]:
most_readable['followers_in_millions'].sum() / 1000

4.7157

## Which gender has most followers?

In [33]:
most_followed.groupby('gender').count()

Unnamed: 0_level_0,id,account_username,real_name,link_to_profile,number_of_followers,number_of_posts,real_person,profession,followers_in_millions
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
female,24,24,24,24,24,24,24,24,24
male,16,16,16,16,16,16,16,16,16


In [36]:
most_followed.groupby('gender').sum()

Unnamed: 0_level_0,id,number_of_followers,number_of_posts,followers_in_millions
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,553,2325500000,83631,2325.5
male,437,1410600000,51733,1410.6


## Which profession has the most followers?

In [58]:
most_followed.groupby('profession').sum().sort_values('number_of_followers', ascending=False)

Unnamed: 0_level_0,id,number_of_followers,number_of_posts,followers_in_millions
profession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
singer,445,1660800000,45102,1660.8
football player,96,582200000,11474,582.2
media personality,51,504000000,14100,504.0
actor,146,418200000,13355,418.2
rapper,80,230200000,11952,230.2
football team,87,219500000,26102,219.5
model,57,172300000,33102,172.3
company,78,113900000,4534,113.9
TV show,25,82400000,8406,82.4
actress,29,65000000,3422,65.0


# Part II - Analysis of posts from the most followed

In [96]:
post_details = pd.read_csv('post_details.csv', delimiter=';')

In [97]:
post_details.head()

Unnamed: 0,id,url,post_type,likes,time_posted,caption,user_name
0,4,https://www.instagram.com/p/B7q5Qxyg8FZ/,video,2.552.871,2020-01-23 18:18:03+01,Karen Ip (@fruitypoppin) knows how it feels to...,instagram
1,5,https://www.instagram.com/p/B7ocUPsAq1_/,video,7.456.046,2020-01-22 19:26:12+01,Adrian Steckeweh (@omega.c) has a warped sense...,instagram
2,6,https://www.instagram.com/p/B7lwdBHAwIw/,photo,913.456,2020-01-21 18:23:47+01,"As one of Japan’s top competition boulderers, ...",instagram
3,7,https://www.instagram.com/p/B7jMF3dg8RH/,photo,621.529,2020-01-20 18:27:35+01,Parker Kit Hill (@parkerkithill) refuses to be...,instagram
4,8,https://www.instagram.com/p/B7gtmEJA3Oc/,photo,1.563.593,2020-01-19 19:22:37+01,"#HelloFrom Taimu Mountain in Fujian, China, wh...",instagram


## Which type of post receives the most likes?

In [116]:
post_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2504 entries, 0 to 2503
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           2504 non-null   int64 
 1   url          2504 non-null   object
 2   post_type    2504 non-null   object
 3   likes        2504 non-null   int64 
 4   time_posted  2504 non-null   object
 5   caption      2394 non-null   object
 6   user_name    2504 non-null   object
dtypes: int64(2), object(5)
memory usage: 137.1+ KB


In [98]:
photo = post_details['post_type'] == 'photo'

In [99]:
post_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2504 entries, 0 to 2503
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           2504 non-null   int64 
 1   url          2504 non-null   object
 2   post_type    2504 non-null   object
 3   likes        2504 non-null   object
 4   time_posted  2504 non-null   object
 5   caption      2394 non-null   object
 6   user_name    2504 non-null   object
dtypes: int64(1), object(6)
memory usage: 137.1+ KB


In [100]:
post_details['likes'] = post_details.likes.astype(str)

In [101]:
post_details.head()

Unnamed: 0,id,url,post_type,likes,time_posted,caption,user_name
0,4,https://www.instagram.com/p/B7q5Qxyg8FZ/,video,2.552.871,2020-01-23 18:18:03+01,Karen Ip (@fruitypoppin) knows how it feels to...,instagram
1,5,https://www.instagram.com/p/B7ocUPsAq1_/,video,7.456.046,2020-01-22 19:26:12+01,Adrian Steckeweh (@omega.c) has a warped sense...,instagram
2,6,https://www.instagram.com/p/B7lwdBHAwIw/,photo,913.456,2020-01-21 18:23:47+01,"As one of Japan’s top competition boulderers, ...",instagram
3,7,https://www.instagram.com/p/B7jMF3dg8RH/,photo,621.529,2020-01-20 18:27:35+01,Parker Kit Hill (@parkerkithill) refuses to be...,instagram
4,8,https://www.instagram.com/p/B7gtmEJA3Oc/,photo,1.563.593,2020-01-19 19:22:37+01,"#HelloFrom Taimu Mountain in Fujian, China, wh...",instagram


In [107]:
post_details['likes'].apply(lambda x: x.replace(',',''))

0       2.552.871
1       7.456.046
2         913.456
3         621.529
4       1.563.593
          ...    
2499    4.226.752
2500      467.932
2501    1.147.768
2502      714.277
2503      241.925
Name: likes, Length: 2504, dtype: object

In [108]:
post_details['likes'] = (post_details['likes'].replace('\.','', regex=True)
                        .astype(int))

In [109]:
post_details.head()

Unnamed: 0,id,url,post_type,likes,time_posted,caption,user_name
0,4,https://www.instagram.com/p/B7q5Qxyg8FZ/,video,2552871,2020-01-23 18:18:03+01,Karen Ip (@fruitypoppin) knows how it feels to...,instagram
1,5,https://www.instagram.com/p/B7ocUPsAq1_/,video,7456046,2020-01-22 19:26:12+01,Adrian Steckeweh (@omega.c) has a warped sense...,instagram
2,6,https://www.instagram.com/p/B7lwdBHAwIw/,photo,913456,2020-01-21 18:23:47+01,"As one of Japan’s top competition boulderers, ...",instagram
3,7,https://www.instagram.com/p/B7jMF3dg8RH/,photo,621529,2020-01-20 18:27:35+01,Parker Kit Hill (@parkerkithill) refuses to be...,instagram
4,8,https://www.instagram.com/p/B7gtmEJA3Oc/,photo,1563593,2020-01-19 19:22:37+01,"#HelloFrom Taimu Mountain in Fujian, China, wh...",instagram


In [112]:
post_details.groupby('post_type').sum()

Unnamed: 0_level_0,id,likes
post_type,Unnamed: 1_level_1,Unnamed: 2_level_1
photo,2295448,3774093314
video,911933,3547407445


In [113]:
photo = post_details['post_type']=='photo'

In [115]:
post_details.where(photo).count()

id             1833
url            1833
post_type      1833
likes          1833
time_posted    1833
caption        1734
user_name      1833
dtype: int64

In [120]:
average_on_photo = 3774093314 /1833

In [121]:
average_on_photo

2058970.711402073

In [122]:
video = post_details['post_type']=='video'

In [123]:
post_details.where(video).count()

id             671
url            671
post_type      671
likes          671
time_posted    671
caption        660
user_name      671
dtype: int64

In [124]:
average_on_video = 3547407445 /671

In [125]:
average_on_video

5286747.309985097

In [126]:
post_details.likes.min()

62442

In [127]:
post_details.likes.max()

44834774

In [128]:
post_details.likes.mean()

2923922.0283546327

In [129]:
post_details.likes.median()

1817171.0

In [154]:
post_details.where(post_details['post_type']=='photo').sort_values('likes', ascending=False).head()

Unnamed: 0,id,url,post_type,likes,time_posted,caption,user_name
2161,2187.0,https://www.instagram.com/p/B7JpPbgFOet/,photo,12660083.0,2020-01-10 20:22:02+01,been gone,billieeilish
282,312.0,https://www.instagram.com/p/B6rH9Qvn8aV/,photo,12346704.0,2019-12-29 23:53:59+01,just didn’t feel right going into 2020 without...,kyliejenner
270,300.0,https://www.instagram.com/p/B7CsNZvHKEk/,photo,11632637.0,2020-01-08 03:33:17+01,Throwback🤰🏻pregnant with my baby girl. I can’t...,kyliejenner
287,317.0,https://www.instagram.com/p/B6gm1aKnqHZ/,photo,11379957.0,2019-12-25 21:52:09+01,Merry Christmas 🎁,kyliejenner
224,254.0,https://www.instagram.com/p/B5RY86CDHxC/,photo,11294608.0,2019-11-25 03:30:50+01,Feels good to be back. Thank you to my entire ...,selenagomez


## Best performing photo has Billie Eilish - https://www.instagram.com/p/B7JpPbgFOet/ - number 45 on the list of most popular profiles:

In [157]:
post_details.iloc[2161]

id                                                 2187
url            https://www.instagram.com/p/B7JpPbgFOet/
post_type                                         photo
likes                                          12660083
time_posted                      2020-01-10 20:22:02+01
caption                                       been gone
user_name                                  billieeilish
Name: 2161, dtype: object

In [152]:
post_details.where(post_details['post_type']=='video').sort_values('likes', ascending=False).head()

Unnamed: 0,id,url,post_type,likes,time_posted,caption,user_name
46,81.0,https://www.instagram.com/p/B6RNPBsg1wo/,video,44834774.0,2019-12-19 22:20:05+01,✈️,instagram
84,88.0,https://www.instagram.com/p/B55gk8DAL3Z/,video,38030981.0,2019-12-10 18:09:43+01,How can i not fall in love with my sweet princ...,cristiano
71,73.0,https://www.instagram.com/p/B6pv5lbAjfK/,video,33935130.0,2019-12-29 11:05:23+01,It was a pleasure to meet you @ali_amir_happy ...,cristiano
2165,2191.0,https://www.instagram.com/p/B5ssK06lKvw/,video,32896854.0,2019-12-05 18:00:50+01,“xanny” VIDEO OUT NOW! DIRECTED BY MEEE😁 GO WA...,billieeilish
2163,2189.0,https://www.instagram.com/p/B6M0NYiF6rJ/,video,30704514.0,2019-12-18 05:24:38+01,18 tomorrow,billieeilish


## Best performing video belongs to Cristiano - https://www.instagram.com/p/B6RNPBsg1wo/ - He anyways has a profile with the most followers:

In [155]:
post_details.iloc[46]

id                                                   81
url            https://www.instagram.com/p/B6RNPBsg1wo/
post_type                                         video
likes                                          44834774
time_posted                      2019-12-19 22:20:05+01
caption                                              ✈️
user_name                                     instagram
Name: 46, dtype: object

In [172]:
post_details = pd.read_csv('post_details.csv', delimiter=';')

In [178]:
post_details['timezone'] = post_details['time_posted'][17:]

In [179]:
post_details.head()

Unnamed: 0,id,url,post_type,likes,time_posted,caption,user_name,timezone
0,4,https://www.instagram.com/p/B7q5Qxyg8FZ/,video,2.552.871,2020-01-23 18:18:03+01,Karen Ip (@fruitypoppin) knows how it feels to...,instagram,
1,5,https://www.instagram.com/p/B7ocUPsAq1_/,video,7.456.046,2020-01-22 19:26:12+01,Adrian Steckeweh (@omega.c) has a warped sense...,instagram,
2,6,https://www.instagram.com/p/B7lwdBHAwIw/,photo,913.456,2020-01-21 18:23:47+01,"As one of Japan’s top competition boulderers, ...",instagram,
3,7,https://www.instagram.com/p/B7jMF3dg8RH/,photo,621.529,2020-01-20 18:27:35+01,Parker Kit Hill (@parkerkithill) refuses to be...,instagram,
4,8,https://www.instagram.com/p/B7gtmEJA3Oc/,photo,1.563.593,2020-01-19 19:22:37+01,"#HelloFrom Taimu Mountain in Fujian, China, wh...",instagram,
