Here I'll be exploring the dataset based on the users tagged in certain posts. Working with this spin-off of the original dataset, see how easy it is to predict whether or not a certain tagged profile and thus inclusion in that post means it is more or less likely to receive more engagements. 

## Need to find how to include t.snell for instance... not @stephencurry30..., just the digits no elipsis?

# In the block below, the remainder of this notebook are condensed into a single function. 

In [6]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('husl')
%matplotlib inline

In [7]:
def make_num(z):
    """
    Helper function for  the scraping tool. The result of a follower pull is 
    
    "35m followers", which this function will convert into 35 
    
    Parameters
    ----------
    
    z : str
        The follower count, given as "46m followers"
        
    Returns
    -------
    
    num : float
        The converted number, or 4.6e6. 
        
        Note this may occasionally return NaN, which means the account tagged was not actually found. 
    
    """
    
    import re
    from numpy import nan
    if type(z) == type(nan):
        return
    
    z = z.split()[0] #not the followers part
    z = z.replace(',','')
    num = re.split(r'[a-z]*',z)[0]
    
    order = "".join(re.findall(r'[a-z]*',z))
    if order == 'm':
        num = float(num) * 1e6
    if order == 'k':
        num = float(num) * 1e3
    else:
        num = float(num)
    
   # print("Followers = ", num)
    return num

def get_followers(z,driver):
    """
    Obtains follower count for profile in profile df. 
    
    Parameters
    ----------
    
    z : str
        The account name, done via an apply statement. 
        
    driver : WebDriver
        Selenium webdriver object, used to naviagate to the URL of each account, and then grab the corresponding
        followers. 
        
    Returns 
    -------
    
    followers : float
        The number of followers of that particular user. 

    """
    
    from time import sleep
    from random import randint
    z = z.replace('@','')
    url =  driver.get("https://www.instagram.com/"+ z + "/?hl=en")
    sleep(randint(1,3))
    hrefs_in_view = driver.find_elements_by_tag_name('a')
    
    #this gives the link that has the attached follower number. 
    for elem in hrefs_in_view:
        if elem.get_attribute('href') ==  "https://www.instagram.com/accounts/login/?next=%2F"+z+"%2Ffollowers%2F&source=followed_by_list": #'https://www.instagram.com/'+z+'/followers/':
            
            followers = make_num(elem.text)
            
            return followers
              
def load_tagged_profiles(instas):
    """
    
    Creates a dataset summarizing the instagram accounts tagged in the post. 
    The columns used are a temporary list, and this line will be finalized once it is. 
    
    
    Parameters
    ----------
    
    instas : df
        Dataframe of the instagram posts, their associated information + output of engagements
        
        
    Returns
    -------
    
    
    profile_df : df
    
        Dataframe of the profiles tagged in any or multiple posts. 
        
    """
    
    #use regular expression to obtain any time a name is used, find all
    import re
    posts = " ".join(instas['Description'].unique()).lower() #combine into a corpus, isolating each tag. 
    tagged_profiles = list( dict.fromkeys(re.findall('@[a-z_0-9]*\.?[a-z_0-9]*?',posts)) ) 
    tagged_profiles = [s.replace('.','') if s[-1] == '.' else s for s in tagged_profiles]
    d = []
    instas['Description'] = instas['Description'].str.lower()
    for account in tagged_profiles:
        if account[-1] == '.':
            account = account[:-1] #remove the period accidentally taken at the end. 
        posts_w_account = instas.loc[instas['Description'].str.contains(account)]
        max_eng = posts_w_account.Engagements.max()
        tot_eng = posts_w_account.Engagements.sum()
        n_posts = posts_w_account.Engagements.count()
        min_eng = posts_w_account.Engagements.min()
        avg_eng = posts_w_account.Engagements.mean()
        d.append({'profile': account, 'max_eng': max_eng,'tot_eng' : tot_eng, 
                 'n_posts' : n_posts, 'min_eng' : min_eng, 'avg_eng' : avg_eng})
   
    profile_df = pd.DataFrame(d)
    profile_df.drop_duplicates(inplace=True)

    #but we're not done, can also establish how popular these accounts are, doing it based on follower count. 
   # from selenium import webdriver
   # driver = webdriver.Chrome('/Users/noahkasmanoff/Desktop/chromedriver') #open up chrome/spotify
   # profile_df['followers']  = profile_df['profile'].apply(lambda z: get_followers(z,driver))
   # driver.close()
    return profile_df

instas = pd.read_csv('Business Analytics/training_set.csv',encoding = 'unicode_escape')

#instas.dropna(inplace=True)

In [9]:
instas.fillna('',inplace=True)

In [10]:
instas

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...
5,322444,36955156,2019-05-21 14:02:32 EDT,Photo,Congrats to the 2018-19 NBA All-Rookie First T...
6,722540,36955156,2019-05-21 12:30:53 EDT,Video,The @warriors locked in for four games to adva...
7,339265,36955156,2019-05-21 08:59:02 EDT,Photo,Will the @bucks take a commanding 3-1 lead... ...
8,443330,36955156,2019-05-21 00:48:16 EDT,Photo,TAP to SHOP the @warriors Western Conference C...
9,652193,36955156,2019-05-21 00:38:18 EDT,Video,Hey @stephencurry30... that was the first time...


In [4]:
profile_df = load_tagged_profiles(instas)

In [5]:
profile_df

Unnamed: 0,avg_eng,max_eng,min_eng,n_posts,profile,tot_eng
0,481730.119266,848228,198217,218,@raptors,105017166
1,541361.461538,848228,127816,13,@sergeibaka,7037699
2,633178.625000,723934,502093,8,@normanpowell4,5065429
3,601731.142857,696478,502093,7,@fredvanvleet,4212118
4,587078.422222,854893,280401,45,@kyle_lowry7,26418529
5,569430.888889,706452,154757,9,@k_mid22,5124878
6,487729.994898,810070,217833,196,@bucks,95595079
7,628804.266667,956368,253363,195,@giannis_an34,122616832
8,603939.379310,941443,304981,87,@lukadoncic,52542726
9,616215.732143,898784,287156,56,@traeyoung,34508081


In [None]:
profile_df[profile_df['profile'].duplicated(keep=False)].sort_values('profile')

# create a dataset of the profile, and use it to infer information to add back to the training set regarding that profile tagged in a post.

In [None]:
profile_df.sort_values('n_posts')

So ignoring the faulty entries, the # of followers obeys some kind of distribution, which may be useful. 

In [None]:
profile_df = pd.read_csv('Business Analytics/tagged_profiles.csv')
profile_df.drop('std_eng',axis=1,inplace=True)

In [None]:
profile_df.shape

In [None]:
profile_df.dropna(inplace=True)

From https://www.adweek.com/digital/instagram-nba-players-teams-february-2017/, here's a list of stars and also most popular players. Perhaps this affects how many people view a post? 

# Now clusters? 

Will elbow method help illuminate which users are stars and which aren't?

In [None]:
profile_df.drop('tot_eng',axis=1,inplace=True)


In [None]:
profile_df

In [None]:
profile_df['followers'] = np.log10(profile_df['followers']+1)
profile_df['n_posts'] = np.log10(profile_df['n_posts']+1)

In [None]:
sns.pairplot(profile_df)

In [None]:
profile_df

In [None]:
from sklearn.cluster import KMeans
from sklearn import preprocessing

x = profile_df.drop('profile',axis=1).values #returns a numpy array #removes missing followers (broken accounts)

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)


In [None]:
distortions = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(df[4].values.reshape(-1,1))
    distortions.append(kmeanModel.inertia_)

In [None]:
plt.plot(K,distortions,'o-')

In [None]:
kmeanModel = KMeans(n_clusters=4).fit(df[4].values.reshape(-1,1))

In [None]:
kmeanModel.labels_

In [None]:
profile_df.dropna(subset=['followers'],inplace=True)
profile_df['tempCluster'] = kmeanModel.labels_

In [None]:
sns.pairplot(profile_df,hue='tempCluster',diag_kind='kde')

In [None]:
profiles = profile_df[['profile','tempCluster']].copy()

In [None]:
profiles = pd.get_dummies(profiles,columns=['tempCluster'])

In [None]:
def apply_clusters_to_training(z,profiles):
    """Apply the clusters to the dataset, now in a form for prediction aligned with the other data. 
   
   """
    
    import re
    from numpy import zeros
    post_clusters = zeros(4)
    tags_in_z =  list( dict.fromkeys(re.findall('@[a-z_0-9]*',z)) ) 
    
    if len(tags_in_z) == 0:
        return post_clusters
    print(z)
    print(tags_in_z)
    for tag_in_z in tags_in_z:
        try:
            post_clusters = post_clusters + profiles[profiles['profile'] == tag_in_z].values[0][1:] 
        except: 
            pass
    return post_clusters

In [None]:
instas = pd.read_csv('Business Analytics/training_set.csv',encoding = 'unicode_escape')

instas.dropna(inplace=True)

instas['tempClusters'] = instas['Description'].str.lower().apply(lambda z: apply_clusters_to_training(z,profiles))


In [None]:
clusters = pd.DataFrame(instas.tempClusters.tolist(), columns=['profiles_nC1', 'profiles_nC2','profiles_nC3','profiles_nC4'])

In [None]:
instas.merge(clusters,left_index=True,right_index=True).drop('tempClusters',axis=1)

In [None]:
instas