In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import nltk
import psycopg2
from nltk.corpus import stopwords
import re
from itertools import chain 
import math
import time
import logging 
import requests
import json
print("Imported all packages.")
print("Loading GoogleNews...")
from gensim import models
w = models.KeyedVectors.load_word2vec_format(r"F:\Pretrained Models\GoogleNews-vectors-negative300.bin.gz", binary=True, limit=2100000)
print("Loaded GoogleNews!")

Imported all packages.
Loading GoogleNews...
Loaded GoogleNews!


In [2]:
categories = ['food', 'fashion', 'makeup', 'beauty', 'lifestyle','luxury', 'travel','photography','fitness','sports','gaming', 'entertainment', 'technology','investment','education', 'animal', 'health', 'inspiration']
categories_list = [['food', 'recipes', 'recipe', 'cooking', 'fried'],
              ['fashion', 'outfit', 'clothes', 'menswear','wear'],
              ['makeup', 'eyeliner', 'bridal', 'shades', 'airbrush'],
              ['beauty', 'pimple', 'skin', 'oil', 'hair'],
              ['lifestyle', 'life', 'class', 'style', 'happy'],
              ['luxury', 'chic', 'handbag', 'stylish', 'brand'],
              ['travel', 'world', 'destination', 'adventure', 'landscapes'],
              ['photography', 'photo', 'editing', 'creative', 'artist'],
              ['fitness', 'nutrition', 'workout', 'healthy', 'exercise'],
              ['sport', 'sports'],
              ['gaming', 'gamer', 'fun', 'games', 'stream'],
              ['entertainment', 'movies', 'series', 'network', 'comedy'],
              ['technology', 'tech', 'geek', 'smartphones', 'mobiles'],
              ['investment', 'financial', 'stocks', 'market', 'trade'],
              ['education', 'lectures', 'competitive', 'exams', 'coaching'],
              ['animal', 'habitat', 'wild', 'documentary', 'nature'],
              ['health', 'medical', 'prevent', 'treat', 'heal'],
              ['psychology', 'motivation', 'inspire', 'happiness', 'mind','spiritual']]
col_name = ['user_id','url','Food','Fashion', 'Makeup', 'Beauty', 'Lifestyle','Luxury', 'Travel','Photography','Fitness','Sports','Gaming', 'Entertainment', 'Gadgets & Tech','Finance','Education', 'Animal/Pet', 'Health', 'Self Improvement','Art', 'Parenting', 'Books', 'genuinity_score','top keywords']
API_categories = ['Food','Fashion', 'Makeup', 'Beauty', 'Lifestyle','Luxury', 'Travel','Photography','Fitness','Sports','Gaming', 'Entertainment', 'Gadgets & Tech','Finance','Education', 'Animal/Pet', 'Health','Art', 'Self Improvement', 'Parenting', 'Books']

In [5]:
#Frame pre-processing function
def process(array,avoidwords):
    hashes = len(re.findall(r'#',str(array))) #counting hashtags
    text = re.sub(r'\[[0-9]*\]',' ',str(array))  #Remove Numbers
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text) # Remove nums
    text = re.sub(r'\s+',' ',text)  #Remove extra space
    text = re.sub(r"[^a-zA-Z0-9]+",' ',text)  #Remove special characters
    text = text.lower()  #Lower case all
    text = nltk.sent_tokenize(text)  #Tokenize to sentences 
    keywords = [nltk.word_tokenize(sentence) for sentence in text]
    raw_cap = len(keywords[0]) # Total number of words in caption
    stop_words = stopwords.words('english')
    stop_words.extend(avoidwords)
    for i in range(len(keywords)):
        keywords[i] = [word for word in keywords[i] if word not in stop_words]
    genuinity_percent = (raw_cap-hashes)*100/raw_cap
    return keywords,genuinity_percent


# normalize() -> given an array, converts to 1/0, top int(pos) will be 1
def normalize(keys, pos =3):  
    ax = [i for i in keys]
    temp = [i for i in keys]
    temp.sort()
    temp = temp[-pos:]
    for x in temp:
        ax[keys.index(x)] = 1
    for x in range(len(ax)):
        if ax[x] != 1:
            ax[x] = 0
    return ax

def normalizeSD(keys, thre =3):    # Given score array return shortlisted cats in given threshold
    ax = deviation(keys)
    ax = dev_shortlist(ax,thre)
    return ax
def deviation(array):
    mu = max(array)
    l = len(array)
    ar = []
    for x in range(l):
        ar.append(math.sqrt((array[x]-mu)**2)/l)
    total = sum(ar)
    for x in range(l):
        if total != 0:
            ar[x] = (ar[x]/total)*100
    return ar

def mean_deviation(array):
    l = len(array)
    mu = sum(array)/l
    ar = []
    for x in range(l):
        ar.append(math.sqrt((array[x]-mu)**2)/l)
    total = sum(ar)
    for x in range(l):
        if total != 0:
            ar[x] = (ar[x]/total)*100
    return ar

def dev_shortlist(dev_array,thre = 2):  # Shortlist using threshold from deviation array | return array in 1/0
    final_cat = [0]*len(dev_array)
    for i in range(len(dev_array)):
        if dev_array[i] <=thre:
            final_cat[i] = 1
    return final_cat

def compute1(caption,category,top =3):
    ar = []
    score = []

    # Code to get frequency distribution and unique keywords array
    keywords = []
    caption_freq = []
    counts = Counter(caption)
    if len(counts) > 0:
        labels, values = zip(*counts.items())
        ## sort your values in descending order
        indSort = np.argsort(values)[::-1]
        ## rearrange your data
        keywords = np.array(labels)[indSort]  # Label
        caption_freq = np.array(values)[indSort]  # Values
    
    # Detect words not in Google Dict | Put freq = 0
    for x in keywords:
        try:
            restConst = w.similarity(x,'something')
        except KeyError:
            caption_freq[np.where(keywords == x)] = 0
        
    #Google similaity function
    for x in category:
        empty = []
        for y in keywords:
            try:
                empty.append(w.similarity(x,y))
            except:
                empty.append(0)
        ar.append(empty)
    
    # Store the similarity values in dataframe
    frame = pd.DataFrame()
    frame = pd.DataFrame(ar, columns = keywords)
  
    #Normalize | top select
    for key in frame.columns:
        frame[key] = normalizeSD(frame[key].tolist(),top)
    
    # Multiply with frequency
    for row in range(len(frame)):
        frame.values[row] = [i*j for i,j in zip(frame.values[row],caption_freq)]
    # Sum the values => Score
    for row in range(len(frame)):
        score.append(sum(frame.values[row]))
    
    frame['category'] = category
    frame['Scores'] = score
    return frame,keywords[:20]

# compute() => category[] to be called outside
def compute(caption,category_list,category,top =3):
    ar = []
    score = []

    # Code to get frequency distribution and unique keywords array
    keywords = []
    caption_freq = []
    counts = Counter(caption)
    if len(counts) > 0:
        labels, values = zip(*counts.items())
        ## sort your values in descending order
        indSort = np.argsort(values)[::-1]
        ## rearrange your data
        keywords = np.array(labels)[indSort]  # Label
        caption_freq = np.array(values)[indSort]  # Values
    
    # Detect words not in Google Dict | Put freq = 0
    for x in keywords:
        try:
            restConst = w.similarity(x,'something')
        except KeyError:
            caption_freq[np.where(keywords == x)] = 0
        
    #Google similaity function
    for c_tag in range(len(category_list)):
        empty1 = []
        for the_word in keywords:
            empty2 = []
            for k_tag in range(len(category_list[c_tag])):
                try:
                    empty2.append(w.similarity(category_list[c_tag][k_tag],the_word))
                except:
                    empty2.append(0)
            empty1.append(sum(empty2)/len(empty2))
        ar.append(empty1)
    # Store the similarity values in dataframe
    frame = pd.DataFrame()
    frame = pd.DataFrame(ar, columns = keywords)
  
    #Normalize | top select
    for key in frame.columns:
        frame[key] = normalizeSD(frame[key].tolist(),top)
    
    # Multiply with frequency
    for row in range(len(frame)):
        frame.values[row] = [i*j for i,j in zip(frame.values[row],caption_freq)]
    # Sum the values => Score
    for row in range(len(frame)):
        score.append(sum(frame.values[row]))
    
    frame['category'] = category
    frame['Scores'] = score
    return frame,keywords[:20]

def get_row_pscore(col_name,f1,i,f2,genuinity_score,top_keywords, scoreType):  # f1-mainframe | f2-frame
    ud = f1.loc[i,'id']
    ul = f1.loc[i,'url']
    row_in_array = [ud,ul]
    score_array = f2[scoreType].tolist()
    empty_score = [0]*(len(col_name)-4-len(score_array))
    score_array.extend(empty_score)
    row_in_array.extend(score_array)
    row_in_array.append(genuinity_score)
    row_in_array.append(str([top_keywords]))
    zip_it = zip(col_name,row_in_array)
    convert_to_dict = dict(zip_it)
    return convert_to_dict

# To make data in 
# DB format | API post 
def to_dict_api(percentages,categories,top_keywords,frame,i): #frame and i to get id
    mydict = {}
    cat_array =[]
    empty_percent = [0]*(len(categories)-len(percentages))
    percent_array = [y for y in percentages]
    percent_array.extend(empty_percent)
    mydict['user_id'] = frame.loc[i,'id']
    mydict['keywords'] = json.dumps(top_keywords.tolist())
    for j in range(len(categories)):
        cat_array.append({'tag':categories[j],'percentage':percent_array[j]})
    mydict['categories'] = json.dumps(cat_array)
    return mydict

In [15]:
tic = time.perf_counter()
x = requests.get('http://44.229.68.155/insta_users/get_uncategorized_accounts?limit=10&current_id=15000', headers={'Authorization': 'Token ruor7REQi9KJz6wIQKDXvwtt'})
status = x.status_code
data = x.json()
df = pd.DataFrame(data['users'])
pages = 0
idsdone = 0
txt = "Done {} pages, the last_id is {} and time taken {} seconds"

while(len(data['users']) !=0 and pages<1):
    try:
        new_tic = time.perf_counter()
        if(status != 200):
            raise Exception("GET request error: {}".format(status))
        dfnew = pd.DataFrame(columns=['id','handle','name','url','gender','country','captions','bio'], data = df[['id','handle','name','url','gender','country','captions','bio']].values)
        last_id = dfnew['id'].iloc[-1]
        # Fresh dataframe
        profile_percentages =  pd.DataFrame(columns = col_name)
        
        # Main Categorization # 
        for i in range(len(dfnew)):

            try:
                #Store userid | caption | total posts
                userid = dfnew['id'].iloc[i]
                captions = dfnew['captions'].iloc[i]
                bio = dfnew['bio'].iloc[i]
                total_posts = len(captions)
                
                # Words which mostly occurs in insta post and we want to avoid considering them for the sake of accuracy of results
                avoidwords = ['verified','none','follow','like']

                #Converting to keywords
                captions,genuinity_score = process(captions,avoidwords)
                caption_array = captions[0]
                bio,ignore_this_var = process(bio,avoidwords)
                bio_array = bio[0]
                # Punishing accounts which has less than 3 words in caption
                if len(caption_array) < 3*(total_posts):
                    raise Exception("Too less words for categorization")

                #Temporary array i-> interim
                icaption_array = [z for z in caption_array]
                ibio_array = [z for z in bio_array]
                # Removing words not in dictionary also single characters
                discarded_words = []
                discarded_bio_words = []
                for x in caption_array:
                    try:
                        checkword = w.similarity(x,'something') #Check word if exist in googlenews
                        if len(x) <2: #Removing single character
                            icaption_array.pop(icaption_array.index(x))
                    except KeyError:
                        discarded_words.append(icaption_array.pop(icaption_array.index(x)))
                for x in bio_array:
                    try:
                        checkword = w.similarity(x,'something') #Check word if exist in googlenews
                        if len(x) <2: #Removing single character
                            ibio_array.pop(ibio_array.index(x))
                    except KeyError:
                        discarded_bio_words.append(ibio_array.pop(ibio_array.index(x)))
                #Restore Array
                caption_array = [z for z in icaption_array]
                bio_array = [z for z in ibio_array]
                # Check similarity in discarded words
                discard_word_scores = [0]*len(categories)
                discard_bio_word_scores = [0]*len(categories)
                for x_word in discarded_words:
                    for the_category in categories:
                        if the_category in x_word:
                            discard_word_scores[categories.index(the_category)] = 1 + discard_word_scores[categories.index(the_category)]
                for x_word in discarded_bio_words:
                    for the_category in categories:
                        if the_category in x_word:
                            discard_bio_word_scores[categories.index(the_category)] = 1 + discard_bio_word_scores[categories.index(the_category)]           

                if len(caption_array) ==0:
                    raise Exception("No Words in profile for categorization or Different language")

                # Word2vec computation
                frame = pd.DataFrame()
                frame, top_keywords = compute(caption_array,categories,categories_list,3)
                # Word2vec for bio
                frame_bio = pd.DataFrame()
                frame_bio,ignore_keywords = compute(bio_array,categories,categories_list,2)
                # Storing bio scores
                bio_score = frame_bio['Scores'].tolist()
                for sc in range(len(bio_score)):
                    bio_score[sc] = bio_score[sc] + discard_bio_word_scores[sc] 

                # Add up computed score with discarded score
                score_column = frame['Scores'].tolist()
                for ind in range(len(score_column)):
                    score_column[ind] = score_column[ind] + discard_word_scores[ind]

                # Add weighted score of bio in main score
                normalize_bio_score = normalizeSD(bio_score,3)
                for sc in range(len(normalize_bio_score)):
                    if normalize_bio_score[sc] == 1:
                        score_column[sc] = score_column[sc]*2
                frame['Scores'] = score_column

                #Convert to Percentage
                per = frame['Scores'].tolist()
                per_sum = sum(per)
                for x in range(len(per)):
                    temp_number = (float)(per[x])
                    per[x] = round((temp_number/per_sum)*100)
                frame['Percentage'] = per


                #Store profile percentage
                row_df_5 = get_row_pscore(col_name,dfnew,i,frame,genuinity_score,top_keywords,'Percentage')
                profile_percentages = profile_percentages.append(row_df_5,ignore_index=True)

                # POST API Request
#                 file = to_dict_api(frame['Percentage'].tolist(),API_categories,top_keywords,dfnew,i)
    #             url = 'http://44.229.68.155/insta_user/add_category_to_insta_user'
    #             y = requests.post(url, data = file,headers={'Authorization': 'Token ruor7REQi9KJz6wIQKDXvwtt'})

    #             if y.status_code !=200:
    #                 raise Exception("Post request error {}".format(y.status_code))

                print("ID no. {} Done! Total {} ids done".format(userid,idsdone))
                idsdone = idsdone +1

            except Exception as Argument:
                # creating/opening a file 
                f = open(r"E:\Winkl Mains\Task_4_Recategorisation\DATA\errorfile.txt", "a") 
                # writing in the file 
                f.write("Userid\t"+str(userid)+"\t: "+str(Argument)+str("\n")) 
                # closing the file 
                f.close()  

        # END of Main Categorization #

        pages = pages +1
        profile_percentages.to_csv(r'E:\Winkl Mains\Task_4_Recategorisation\DATA\Testing_book.csv',mode='a',header=False,index =False)
#         display(profile_percentages)
        toc = time.perf_counter()
        print(txt.format(pages,last_id,toc-new_tic))
        # Request new page
        x = requests.get('http://44.229.68.155/insta_users/get_uncategorized_accounts?limit=10&current_id='+str(last_id), headers={'Authorization': 'Token ruor7REQi9KJz6wIQKDXvwtt'})
        data = x.json()
        df = pd.DataFrame(data['users'])
        status = x.status_code
    
    except Exception as Argument:
        # creating/opening a file 
        f = open(r"E:\Winkl Mains\Task_4_Recategorisation\DATA\errorfile.txt", "a") 
        # writing in the file 
        f.write("Currently in "+str(pages)+"\t"+str(Argument)+str("\n")) 
        # closing the file 
        f.close()  
    
    

toc = time.perf_counter()
f = open(r"E:\Winkl Mains\Task_4_Recategorisation\DATA\errorfile.txt", "a") 
# writing in the file 
f.write("The model ran in "+str(toc - tic)+" seconds"+str("\n")) 
f.write("Total ids done: "+str(idsdone))
# closing the file 
f.close() 
display(profile_percentages)


ID no. 15004 Done! Total 0 ids done
ID no. 15011 Done! Total 1 ids done
ID no. 15017 Done! Total 2 ids done
ID no. 15022 Done! Total 3 ids done
ID no. 15025 Done! Total 4 ids done
ID no. 15026 Done! Total 5 ids done
ID no. 15030 Done! Total 6 ids done
ID no. 15033 Done! Total 7 ids done
ID no. 15041 Done! Total 8 ids done
ID no. 15063 Done! Total 9 ids done
Done 1 pages, the last_id is 15063 and time taken 40.29780820000042 seconds


Unnamed: 0,user_id,url,Food,Fashion,Makeup,Beauty,Lifestyle,Luxury,Travel,Photography,...,Finance,Education,Animal/Pet,Health,Self Improvement,Art,Parenting,Books,genuinity_score,top keywords
0,15004,https://www.instagram.com/miss__vashisht_/,9,2,6,5,3,24,1,4,...,11,3,0,3,5,0,0,0,73.521916,"[array(['skin', 'beauty', 'insta', 'makeup', '..."
1,15011,https://www.instagram.com/oklastbite/,24,1,2,4,4,22,1,3,...,5,3,1,2,4,0,0,0,81.665531,"[array(['food', 'pune', 'one', 'blogpost', 'bl..."
2,15017,https://www.instagram.com/sanjanasar/,10,2,5,7,5,27,2,2,...,6,4,1,4,6,0,0,0,74.766141,"[array(['love', 'life', 'photography', 'dance'..."
3,15022,https://www.instagram.com/foodstalkker/,16,1,3,5,5,27,2,3,...,5,4,1,2,4,0,0,0,51.22509,"[array(['food', 'blogger', 'mumbai', 'bahrain'..."
4,15025,https://www.instagram.com/heyitsdelhi/,37,2,1,3,4,18,1,2,...,5,3,0,1,4,0,0,0,47.799072,"[array(['food', 'delhi', 'foodie', 'updates', ..."
5,15026,https://www.instagram.com/sarangrai/,9,1,3,5,5,32,2,2,...,6,4,2,3,6,0,0,0,79.816788,"[array(['dance', 'love', 'video', 'dancers', '..."
6,15030,https://www.instagram.com/imsahilbrown/,18,2,4,5,5,25,2,3,...,5,3,1,3,4,0,0,0,63.091483,"[array(['inspirational', 'love', 'positivity',..."
7,15033,https://www.instagram.com/jaipur__blogger/,19,1,2,4,4,19,1,3,...,12,4,0,2,4,0,0,0,87.909411,"[array(['jaipur', 'blogger', 'food', 'daily', ..."
8,15041,https://www.instagram.com/officialgauravkothari/,11,2,2,4,5,27,2,2,...,9,3,0,1,6,0,0,0,52.425019,"[array(['mumbai', 'india', 'photography', 'sho..."
9,15063,https://www.instagram.com/ritika_shyam/,16,3,10,6,5,26,1,1,...,5,3,0,2,5,0,0,0,66.323907,"[array(['chandigarh', 'happy', 'love', 'lifest..."


In [16]:
profile_percentages.to_csv(r'E:\Winkl Mains\Task_4_Recategorisation\DATA\testing_df.csv')

## Testing function from random accounts

In [7]:
dfnew = pd.read_csv(r'E:\Winkl Mains\Task_4_Recategorisation\DATA\RandomInfluencers.csv')
profile_percentages =  pd.DataFrame(columns = col_name)
for i in range(len(dfnew)):

    try:
        #Store userid | caption | total posts
        userid = dfnew['id'].iloc[i]
        captions = dfnew['captions'].iloc[i]
        bio = dfnew['bio'].iloc[i]
        total_posts = len(captions)

        # Words which mostly occurs in insta post and we want to avoid considering them for the sake of accuracy of results
        avoidwords = ['verified','none','follow','like']

        #Converting to keywords
        captions,genuinity_score = process(captions,avoidwords)
        caption_array = captions[0]
        bio,ignore_this_var = process(bio,avoidwords)
        bio_array = bio[0]
        # Punishing accounts which has less than 3 words in caption
#         print(len(caption_array),2*(total_posts))
#         if len(caption_array) < 2*(total_posts):
#             raise Exception("Too less words for categorization")

        #Temporary array i-> interim
        icaption_array = [z for z in caption_array]
        ibio_array = [z for z in bio_array]
        # Removing words not in dictionary also single characters
        discarded_words = []
        discarded_bio_words = []
        for x in caption_array:
            try:
                checkword = w.similarity(x,'something') #Check word if exist in googlenews
                if len(x) <2: #Removing single character
                    icaption_array.pop(icaption_array.index(x))
            except KeyError:
                discarded_words.append(icaption_array.pop(icaption_array.index(x)))
        for x in bio_array:
            try:
                checkword = w.similarity(x,'something') #Check word if exist in googlenews
                if len(x) <2: #Removing single character
                    ibio_array.pop(ibio_array.index(x))
            except KeyError:
                discarded_bio_words.append(ibio_array.pop(ibio_array.index(x)))
        #Restore Array
        caption_array = [z for z in icaption_array]
        bio_array = [z for z in ibio_array]
        # Check similarity in discarded words
        discard_word_scores = [0]*len(categories)
        discard_bio_word_scores = [0]*len(categories)
        for x_word in discarded_words:
            for the_category in categories:
                if the_category in x_word:
                    discard_word_scores[categories.index(the_category)] = 1 + discard_word_scores[categories.index(the_category)]
        for x_word in discarded_bio_words:
            for the_category in categories:
                if the_category in x_word:
                    discard_bio_word_scores[categories.index(the_category)] = 1 + discard_bio_word_scores[categories.index(the_category)]           

        if len(caption_array) ==0:
            raise Exception("No Words in profile for categorization or Different language")

        # Word2vec computation
        frame = pd.DataFrame()
        frame, top_keywords = compute1(caption_array,categories,3) #compute(caption_array,categories,categories_list,3)
        # Word2vec for bio
        frame_bio = pd.DataFrame()
        frame_bio,ignore_keywords = compute1(bio_array,categories,2)
        # Storing bio scores
        bio_score = frame_bio['Scores'].tolist()
        for sc in range(len(bio_score)):
            bio_score[sc] = bio_score[sc] + discard_bio_word_scores[sc] 

        # Add up computed score with discarded score
        score_column = frame['Scores'].tolist()
        for ind in range(len(score_column)):
            score_column[ind] = score_column[ind] + discard_word_scores[ind]

        # Add weighted score of bio in main score
        normalize_bio_score = normalizeSD(bio_score,3)
        for sc in range(len(normalize_bio_score)):
            if normalize_bio_score[sc] == 1:
                score_column[sc] = score_column[sc]*2
        frame['Scores'] = score_column

        #Convert to Percentage
        per = frame['Scores'].tolist()
        per_sum = sum(per)
        for x in range(len(per)):
            temp_number = (float)(per[x])
            per[x] = round((temp_number/per_sum)*100)
        frame['Percentage'] = per


        #Store profile percentage
        row_df_5 = get_row_pscore(col_name,dfnew,i,frame,genuinity_score,top_keywords,'Percentage')
        profile_percentages = profile_percentages.append(row_df_5,ignore_index=True)

        # POST API Request
#                 file = to_dict_api(frame['Percentage'].tolist(),API_categories,top_keywords,dfnew,i)
#             url = 'http://44.229.68.155/insta_user/add_category_to_insta_user'
#             y = requests.post(url, data = file,headers={'Authorization': 'Token ruor7REQi9KJz6wIQKDXvwtt'})

#             if y.status_code !=200:
#                 raise Exception("Post request error {}".format(y.status_code))

        print("ID no. {} Done! Total {} ids done".format(userid,idsdone))
        idsdone = idsdone +1

    except Exception as Argument:
        # creating/opening a file 
        f = open(r"E:\Winkl Mains\Task_4_Recategorisation\DATA\errorfile.txt", "a") 
        # writing in the file 
        f.write("Userid\t"+str(userid)+"\t: "+str(Argument)+str("\n")) 
        # closing the file 
        f.close()
        
        
profile_percentages.to_csv(r"E:\Winkl Mains\Task_4_Recategorisation\DATA\Random_account_results_old.csv")