Here I'll be exploring the dataset based on the users tagged in certain posts. Working with this spin-off of the original dataset, the goal is to see how easy it is to predict whether tagging a certain profile in a post makes that post more or less likely to receive more engagements. 

# In the block below, the remainder of this notebook is condensed into a single function. 

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns
# Plot styling applied to every figure below: seaborn's 'darkgrid' theme with the 'husl' colour palette.
sns.set_style('darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): render matplotlib figures inline in the notebook output.
%matplotlib inline


def make_num(z):
    """
    Helper function for the scraping tool. The result of a follower pull is

    "35m followers", which this function will convert into 35e6.

    Parameters
    ----------

    z : str
        The follower count text, given as "46m followers", "12.5k followers"
        or "1,234 followers". Only the first whitespace-separated token is
        parsed; thousands separators are removed and the magnitude suffix is
        matched case-insensitively.

    Returns
    -------

    num : float or None
        The converted number, e.g. 4.6e7 for "46m followers". A suffix other
        than "k" or "m" applies no scaling.

        ``None`` is returned when ``z`` is missing (``None`` or a float such
        as NaN) or contains no text at all.

    Raises
    ------

    ValueError
        If the first token does not start with a valid number.

    """

    import re

    # Missing values reach this helper as None/NaN rather than text.
    if z is None or isinstance(z, float):
        return None

    tokens = z.split()
    if not tokens:
        return None

    # Keep only the count itself, not the trailing "followers" word.
    token = tokens[0].replace(',', '').lower()

    # Read the leading digits explicitly. Splitting on r'[a-z]*' is not
    # usable here: since Python 3.7 re.split also splits on empty matches,
    # so its first piece is always the empty string.
    leading = re.match(r'[0-9.]+', token)
    if leading is None:
        raise ValueError("could not parse a follower count from %r" % (z,))
    value = float(leading.group())

    # Every letter in the token forms the magnitude suffix ("k", "m" or "").
    order = "".join(re.findall(r'[a-z]+', token))
    scale = {'m': 1e6, 'k': 1e3}.get(order, 1.0)

    return value * scale

def get_followers(z,driver):
    """
    Obtains follower count for profile in profile df.

    Navigates the browser to the profile page, waits a random 1-3 seconds,
    then looks through the anchor elements on the page for the one whose
    ``href`` is the login-redirect link to the profile's followers list; the
    text of that anchor is parsed with ``make_num``.

    Parameters
    ----------

    z : str
        The account name, done via an apply statement. Any '@' characters
        are removed before the URL is built.

    driver : WebDriver
        Selenium webdriver object, used to navigate to the URL of each account, and then grab the corresponding
        followers.

    Returns
    -------

    followers : float or None
        The number of followers of that particular user, or ``None`` when no
        anchor with the expected followers link is present on the page.

    """

    from time import sleep
    from random import randint

    handle = z.replace('@','')
    driver.get("https://www.instagram.com/"+ handle + "/?hl=en")
    # Random pause after navigation (presumably to let the page render and to
    # space out requests -- confirm the intent before shortening it).
    sleep(randint(1,3))

    # This is the link that has the attached follower number. The comparison
    # targets the login-redirect form of the link; the direct form would be
    # 'https://www.instagram.com/' + handle + '/followers/'.
    followers_href = "https://www.instagram.com/accounts/login/?next=%2F"+handle+"%2Ffollowers%2F&source=followed_by_list"

    # "tag name" is the locator string behind By.TAG_NAME. The generic
    # find_elements(by, value) call exists in Selenium 3 and 4, whereas
    # find_elements_by_tag_name was removed in Selenium 4.3.
    for elem in driver.find_elements("tag name", "a"):
        if elem.get_attribute('href') == followers_href:
            return make_num(elem.text)

    # No followers link found on the page.
    return None
              
def load_tagged_profiles(instas, driver_path='/Users/noahkasmanoff/Desktop/chromedriver'):
    """

    Creates a dataset summarizing the instagram accounts tagged in the post.
    The columns used are a temporary list, and this line will be finalized once it is.


    Parameters
    ----------

    instas : df
        Dataframe of the instagram posts, their associated information + output of engagements.
        Must provide the 'Description' and 'Engagements' columns.

        NOTE: 'Description' is lower-cased IN PLACE on the caller's dataframe.
        This side effect is kept deliberately because code that later matches
        the lower-case handles against this frame may rely on it.

    driver_path : str, optional
        Location of the chromedriver executable handed to ``webdriver.Chrome``.


    Returns
    -------


    profile_df : df

        Dataframe of the profiles tagged in any or multiple posts, one row per
        handle with the columns 'profile', 'max_eng', 'tot_eng', 'n_posts',
        'min_eng', 'avg_eng', 'std_eng' and 'followers'. The frame is empty
        (same columns) when no description contains a tag, in which case no
        browser is started.

    """

    import re

    columns = ['profile', 'max_eng', 'tot_eng', 'n_posts', 'min_eng', 'avg_eng', 'std_eng']

    instas['Description'] = instas['Description'].str.lower()
    descriptions = instas['Description']

    # Combine into a corpus and pull out every tag, keeping first-seen order.
    # '+' (not '*') so a lone '@' is not treated as a profile.
    corpus = " ".join(descriptions.dropna().unique())
    tagged_profiles = list(dict.fromkeys(re.findall(r'@[a-z_0-9]+', corpus)))

    rows = []
    for account in tagged_profiles:
        # Match the whole handle only: without the lookahead, '@nba' would
        # also count every post that tags '@nbatv'.
        whole_handle = re.escape(account) + r'(?![a-z_0-9])'
        has_tag = descriptions.str.contains(whole_handle, regex=True, na=False)
        eng = instas.loc[has_tag, 'Engagements']
        rows.append({'profile': account, 'max_eng': eng.max(), 'tot_eng': eng.sum(),
                     'n_posts': eng.count(), 'min_eng': eng.min(), 'avg_eng': eng.mean(),
                     'std_eng': eng.std()})

    profile_df = pd.DataFrame(rows, columns=columns)
    if profile_df.empty:
        # Nothing to scrape; return the expected schema without opening Chrome.
        profile_df['followers'] = pd.Series(dtype='float64')
        return profile_df

    # But we're not done: can also establish how popular these accounts are, based on follower count.
    from selenium import webdriver
    driver = webdriver.Chrome(driver_path)
    try:
        profile_df['followers'] = profile_df['profile'].apply(lambda z: get_followers(z, driver))
    finally:
        # Always release the browser session, even if scraping fails midway.
        driver.quit()

    return profile_df

# Load the training posts, decoding the file with the 'unicode_escape' codec.
instas = pd.read_csv('Business Analytics/training_set.csv',encoding = 'unicode_escape')

# Drop every row that has a missing value in any column.
instas.dropna(inplace=True)


# Build the per-profile summary. This launches a Chrome session to scrape
# follower counts and lower-cases instas['Description'] in place.
profile_df = load_tagged_profiles(instas)

# Create a dataset of the tagged profiles, and use it to infer information to add back to the training set about each profile tagged in a post.

In [None]:
# Count the missing values in each column of the profile table.
profile_df.isna().sum()

In [None]:
# Histogram of log10(followers). The `> 0` filter excludes rows whose follower
# count is NaN or zero, so log10 is only taken of positive values.
np.log10(profile_df.loc[profile_df['followers'] > 0 ]['followers']).plot(kind='hist')

So, ignoring the faulty entries, the number of followers follows some kind of distribution, which may be useful. 

In [None]:
# Hex-bin joint plot of log10(followers) against log10(max_eng), restricted to
# profiles with a positive follower count.
_with_followers = profile_df.loc[profile_df['followers'] > 0]
_log_view = np.log10(_with_followers[['followers', 'max_eng']])
sns.jointplot(x='followers', y='max_eng', data=_log_view, kind='hex')

In [None]:
# Load a saved profile table into `p`. NOTE(review): this reads
# 'profile_df.csv', not the 'tagged_profiles.csv' written further down --
# confirm which file is intended.
p = pd.read_csv('Business Analytics/profile_df.csv')

In [None]:
# Print the handle of the row at position 7 with '@' and spaces removed.
print(p.iloc[[7]].profile.values[0].replace('@','').replace(' ',''))

In [None]:
# Save the scraped profile table (without the index) so the scrape need not be repeated.
profile_df.to_csv('Business Analytics/tagged_profiles.csv',index=False)

From https://www.adweek.com/digital/instagram-nba-players-teams-february-2017/, here's a list of stars and also most popular players. Perhaps this affects how many people view a post? 

# Now clusters? 

Will the elbow method help illuminate which users are stars and which aren't?

In [None]:
# Reload the profile table from the CSV written by the to_csv cell above,
# replacing the in-memory `profile_df`.
profile_df = pd.read_csv('Business Analytics/tagged_profiles.csv')

In [None]:
# Remove the two engagement columns that are not used for clustering, then
# discard profiles whose follower count is missing.
profile_df.drop(columns=['tot_eng', 'std_eng'], inplace=True)
profile_df.dropna(subset=['followers'], inplace=True)

In [None]:
# Pairwise scatter plots of the numeric columns on their raw scale.
sns.pairplot(profile_df)

In [None]:
# Put the follower and post counts on a log10 scale, overwriting the columns.
# NOTE(review): this cell is not idempotent -- re-running it applies log10 a
# second time. A follower count of 0 would become -inf here; confirm whether
# such rows can occur in the saved table.
profile_df['followers'] = np.log10(profile_df['followers'])
profile_df['n_posts'] = np.log10(profile_df['n_posts'])

In [None]:
# Same pair plot after the log transform.
sns.pairplot(profile_df)

In [None]:
# Display the transformed table.
profile_df

In [None]:
from sklearn.cluster import KMeans
from sklearn import preprocessing

# Feature matrix for clustering: every column except the handle itself.
# 'std_eng' has already been dropped by an earlier cell, so errors='ignore' is
# required -- without it this drop raises KeyError. 'tempCluster' is excluded as
# well so that re-running after labels were attached does not feed the labels
# back in as a feature. dropna() removes rows with any remaining missing value
# (e.g. missing followers from broken accounts).
x = profile_df.drop(columns=['profile', 'std_eng', 'tempCluster'], errors='ignore').dropna().values

# Rescale every feature to the [0, 1] range so no single column dominates the
# Euclidean distances used by k-means.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)


In [None]:
# Display the profile table again (this is `profile_df`, not the scaled matrix `df`).
profile_df

In [None]:
# Elbow method: fit k-means for k = 2..9 on the scaled features and record the
# inertia (within-cluster sum of squared distances) of each fit.
K = range(2, 10)
distortions = []
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(df)
    distortions.append(kmeanModel.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters.
plt.plot(K,distortions,'o-')

In [None]:
# Fit the model used from here on, with 3 clusters. No random_state is set, so
# the labels may differ between runs.
kmeanModel = KMeans(n_clusters=3).fit(df)

In [None]:
# Cluster label assigned to each row of the scaled feature matrix `df`.
kmeanModel.labels_

In [None]:
# Drop profiles without a follower count, then attach each row's cluster label.
# NOTE(review): `labels_` has one entry per row of `df`, which was built after
# dropping rows with a missing value in ANY feature column, while this cell
# only drops rows missing 'followers'. The assignment assumes both leave the
# same rows in the same order -- confirm.
profile_df.dropna(subset=['followers'],inplace=True)
profile_df['tempCluster'] = kmeanModel.labels_

In [None]:
# Pair plot of the features, coloured by the assigned cluster.
sns.pairplot(profile_df,hue='tempCluster')