In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import nltk
import psycopg2
from nltk.corpus import stopwords
import re
from itertools import chain 
import math
import time
import logging 
import requests
import json
print("Imported all packages.")
print("Loading GoogleNews...")
from gensim import models
w = models.KeyedVectors.load_word2vec_format(r"F:\Pretrained Models\GoogleNews-vectors-negative300.bin.gz", binary=True, limit=2100000)
print("Loaded GoogleNews!")

Imported all packages.
Loading GoogleNews...
Loaded GoogleNews!


food_tags = ['food', 'recipes', 'recipe', 'cooking', 'fried']
fashion_tags = ['fashion', 'outfit', 'clothes', 'menswear','wear']
makeup_tags = ['makeup', 'eyeliner', 'bridal', 'shades', 'airbrush']
beauty_tags = ['beauty', 'pimple', 'skin', 'oil', 'hair']
lifestyle_tags = ['lifestyle', 'life', 'class', 'style', 'happy']
luxury_tags = ['luxury', 'chic', 'handbag', 'stylish', 'brand']
travel_tags = ['travel', 'world', 'destination', 'adventure', 'landscapes']
photography_tags = ['photography', 'photo', 'editing', 'creative', 'artist']
fitness_tags = ['fitness', 'nutrition', 'workout', 'healthy', 'exercise']
sports_tags = ['sport', 'sports']
gaming_tags = ['gaming', 'gamer', 'fun', 'games', 'stream']
entertainment_tags = ['entertainment', 'movies', 'series', 'network', 'comedy']
technology_tags = ['technology', 'tech', 'geek', 'smartphones', 'mobiles']
investment_tags = ['investment', 'financial', 'stocks', 'market', 'trade']
education_tags = ['education', 'lectures', 'competitive', 'exams', 'coaching']
animal_tags = ['animal', 'habitat', 'wild', 'documentary', 'nature']
health_tags = ['health', 'medical', 'prevent', 'treat', 'heal']
self_improvement_tags = ['psychology', 'motivation', 'inspire', 'happiness', 'mind']

In [2]:
#Frame pre-processing function
def process(array,avoidwords):
    hashes = len(re.findall(r'[#,@]',str(array))) #counting hashtags
    text = re.sub(r'\[[0-9]*\]',' ',str(array))  #Remove Numbers
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text) # Remove nums
    text = re.sub(r'\s+',' ',text)  #Remove extra space
    text = re.sub(r"[^a-zA-Z0-9]+",' ',text)  #Remove special characters
    text = text.lower()  #Lower case all
    text = nltk.sent_tokenize(text)  #Tokenize to sentences 
    keywords = [nltk.word_tokenize(sentence) for sentence in text]
    raw_cap = len(keywords[0]) # Total number of words in caption
    stop_words = stopwords.words('english')
    stop_words.extend(avoidwords)
    for i in range(len(keywords)):
        keywords[i] = [word for word in keywords[i] if word not in stop_words]
    genuinity_percent = (raw_cap-hashes)*100/raw_cap
    return keywords,genuinity_percent


# normalize() -> given an array, converts to 1/0, top int(pos) will be 1
def normalize(keys, pos =3):  
    ax = [i for i in keys]
    temp = [i for i in keys]
    temp.sort()
    temp = temp[-pos:]
    for x in temp:
        ax[keys.index(x)] = 1
    for x in range(len(ax)):
        if ax[x] != 1:
            ax[x] = 0
    return ax

def normalizeSD(keys, thre =3):    # Given score array return shortlisted cats in given threshold
    if sum(keys) == 0:
        return keys
    ax = deviation(keys)
    ax = dev_shortlist(ax,thre)
    return ax
def deviation(array):
    mu = max(array)
    l = len(array)
    ar = []
    for x in range(l):
        ar.append(math.sqrt((array[x]-mu)**2)/l)
    total = sum(ar)
    for x in range(l):
        if total != 0:
            ar[x] = (ar[x]/total)*100
    return ar

def mean_deviation(array):
    l = len(array)
    mu = sum(array)/l
    ar = []
    for x in range(l):
        ar.append(math.sqrt((array[x]-mu)**2)/l)
    total = sum(ar)
    for x in range(l):
        if total != 0:
            ar[x] = (ar[x]/total)*100
    return ar

def dev_shortlist(dev_array,thre = 2):  # Shortlist using threshold from deviation array | return array in 1/0
    final_cat = [0]*len(dev_array)
    for i in range(len(dev_array)):
        if dev_array[i] <=thre:
            final_cat[i] = 1
    return final_cat
def compute1(caption,category,top =3):
    ar = []
    score = []

    # Code to get frequency distribution and unique keywords array
    keywords = []
    caption_freq = []
    counts = Counter(caption)
    if len(counts) > 0:
        labels, values = zip(*counts.items())
        ## sort your values in descending order
        indSort = np.argsort(values)[::-1]
        ## rearrange your data
        keywords = np.array(labels)[indSort]  # Label
        caption_freq = np.array(values)[indSort]  # Values
    
    # Detect words not in Google Dict | Put freq = 0
    for x in keywords:
        try:
            restConst = w.similarity(x,'something')
        except KeyError:
            caption_freq[np.where(keywords == x)] = 0
        
    #Google similaity function
    for x in category:
        empty = []
        for y in keywords:
            try:
                empty.append(w.similarity(x,y))
            except:
                empty.append(0)
        ar.append(empty)
    
    # Store the similarity values in dataframe
    frame = pd.DataFrame()
    frame = pd.DataFrame(ar, columns = keywords)
  
    #Normalize | top select
    for key in frame.columns:
        frame[key] = normalizeSD(frame[key].tolist(),top)
    
    # Multiply with frequency
    for row in range(len(frame)):
        frame.values[row] = [i*j for i,j in zip(frame.values[row],caption_freq)]
    # Sum the values => Score
    for row in range(len(frame)):
        score.append(sum(frame.values[row]))
    
    frame['category'] = category
    frame['Scores'] = score
    return frame,keywords[:20]

# compute() => category[] to be called outside
def compute2(caption,category_list,category,top =3):
    ar = []
    score = []

    # Code to get frequency distribution and unique keywords array
    keywords = []
    caption_freq = []
    counts = Counter(caption)
    if len(counts) > 0:
        labels, values = zip(*counts.items())
        ## sort your values in descending order
        indSort = np.argsort(values)[::-1]
        ## rearrange your data
        keywords = np.array(labels)[indSort]  # Label
        caption_freq = np.array(values)[indSort]  # Values
    
    # Detect words not in Google Dict | Put freq = 0
    for x in keywords:
        try:
            restConst = w.similarity(x,'something')
        except KeyError:
            caption_freq[np.where(keywords == x)] = 0
        
    #Google similaity function
    for c_tag in range(len(category_list)):
        empty1 = []
        for the_word in keywords:
            empty2 = []
            for k_tag in range(len(category_list[c_tag])):
                try:
                    empty2.append(w.similarity(category_list[c_tag][k_tag],the_word))
                except:
                    empty2.append(0)
            empty1.append(max(empty2))
        ar.append(empty1)
    # Store the similarity values in dataframe
    frame = pd.DataFrame()
    frame = pd.DataFrame(ar, columns = keywords)
  
    #Normalize | top select
    for key in frame.columns:
        frame[key] = normalizeSD(frame[key].tolist(),top)
    
    # Multiply with frequency
    for row in range(len(frame)):
        frame.values[row] = [i*j for i,j in zip(frame.values[row],caption_freq)]
    # Sum the values => Score
    for row in range(len(frame)):
        score.append(sum(frame.values[row]))
    
    frame['category'] = category
    frame['Scores'] = score
    return frame,keywords[:20]

def get_row_pscore(col_name,f1,i,f2,genuinity_score,top_keywords, scoreType):  # f1-mainframe | f2-frame
    ud = f1.loc[i,'id']
    ul = f1.loc[i,'url']
    row_in_array = [ud,ul]
    score_array = f2[scoreType].tolist()
    row_in_array.extend(score_array)
    row_in_array.append(genuinity_score)
    row_in_array.append(','.join(map(str,top_keywords)))
    zip_it = zip(col_name,row_in_array)
    convert_to_dict = dict(zip_it)
    return convert_to_dict

In [11]:
dfnew = pd.read_csv(r'E:\Winkl Mains\Task_4_Recategorisation\DATA\test_10_profiles.csv')
display(dfnew)

Unnamed: 0.1,Unnamed: 0,id,handle,name,url,gender,country,captions
0,0,122728,maa_bhukh_lagi_hai,AT,https://www.instagram.com/maa_bhukh_lagi_hai/,Other,India,['भरवा बैंगन\nमसाला भरवा बैंगन और गरमा गरम ghe...
1,1,122737,shuklaneerja,Neerja,https://www.instagram.com/shuklaneerja/,Female,India,"[None, None, 'If you are not able to make padm..."
2,2,122784,thetriballux,Naina Doi,https://www.instagram.com/thetriballux/,Female,India,"['Like mom like daughter!!!', 'Kuch apni juban..."
3,3,122788,tellicherrykitchen,Tellicherry Kitchen,https://www.instagram.com/tellicherrykitchen/,Other,India,['Thank you @iamjeevaa for visiting Tellicherr...
4,4,122798,gourmet_globe_trotter,Gourmet Globe Trotter,https://www.instagram.com/gourmet_globe_trotter/,Male,India,['In Frame - Fried Chane with Poori and Chai(T...
5,5,122800,bindass_jaipur,जयपुर वाले,https://www.instagram.com/bindass_jaipur/,Other,India,['.\nPc:@bhandsena\n📷\n📷\nUse tag |@bindass_ja...
6,6,122809,sugarberryz_cakery,sugarberryz cakery,https://www.instagram.com/sugarberryz_cakery/,Other,India,['Anniversary photo roll cake with gold drips\...
7,7,122837,padharo_mhare_desh,🔵केसरिया बालम पधारो म्हारे देश,https://www.instagram.com/padharo_mhare_desh/,Other,India,['पाली (राजस्थान)।. यह है पाली के जाडन गांव स्...
8,8,122838,cheffed.in,Quick & easy food recipes,https://www.instagram.com/cheffed.in/,Female,India,['Masala foxnuts\n.\n.\n.\nFor 15 minute recip...
9,9,122859,food_stories_by_kj,Komal & Suresh Jaisingh,https://www.instagram.com/food_stories_by_kj/,Other,India,['💟\n. \n. \n#mumbaifoodtrendster #foodiefromb...


In [59]:
avoidwords = ['verified','none','follow','like']
corpus = dfnew.captions[2]
corpus,g_score = process(corpus,avoidwords)
corpus_array = corpus[0]
cp = ['self','spiritual','dress','car','speed','grow','hello']
aframe = pd.DataFrame()
aframe,keyss = compute2(cp,categories_list,categories)
per = aframe['Scores'].tolist()
per_sum = sum(per)
for x in range(len(per)):
    temp_number = (float)(per[x])
    per[x] = round((temp_number/per_sum)*100)
aframe['Percentage'] = per
display(aframe)
print(len(aframe))

Unnamed: 0,hello,grow,speed,car,dress,spiritual,self,category,Scores,Percentage
0,0,0,0,0,0,0,0,food,0,0
1,1,0,0,1,1,0,0,fashion,3,17
2,0,0,0,0,1,0,0,makeup,1,6
3,0,0,0,0,0,0,0,beauty,0,0
4,1,0,0,0,0,0,1,lifestyle,2,11
5,0,0,0,1,0,0,0,luxury,1,6
6,0,0,0,0,0,0,0,travel,0,0
7,0,0,0,0,0,0,0,photography,0,0
8,0,0,1,0,0,0,1,fitness,2,11
9,0,0,1,1,0,0,0,sports,2,11


18


In [51]:
aframe = pd.DataFrame()
aframe,keyss = compute2(cp,categories)
per = aframe['Scores'].tolist()
per_sum = sum(per)
for x in range(len(per)):
    temp_number = (float)(per[x])
    per[x] = round((temp_number/per_sum)*100)
aframe['Percentage'] = per
display(aframe)

Unnamed: 0,spiritual,self,category,Scores,Percentage
0,0,0,food,0,0
1,0,1,fashion,1,11
2,0,0,makeup,0,0
3,1,0,beauty,1,11
4,1,1,lifestyle,2,22
5,0,0,luxury,0,0
6,0,0,travel,0,0
7,0,0,photography,0,0
8,1,1,fitness,2,22
9,0,0,sports,0,0


In [32]:
profile_percentages =  pd.DataFrame(columns = col_name)
f1 = pd.DataFrame(columns = ['id','url'])
f1['id'] = [65783]
f1['url'] = ['http']
def get_row_pscore(col_name,f1,i,f2,genuinity_score,top_keywords, scoreType):  # f1-mainframe | f2-frame
    ud = f1.loc[i,'id']
    ul = f1.loc[i,'url']
    row_in_array = [ud,ul]
    score_array = f2[scoreType].tolist()
#     empty_score = [0]*(len(col_name)-len(score_array))
#     score_array.extend(empty_score)
    row_in_array.extend(score_array)
    row_in_array.append(genuinity_score)
    row_in_array.append(str([top_keywords]))
    zip_it = zip(col_name,row_in_array)
    convert_to_dict = dict(zip_it)
    return convert_to_dict
display(f1)
# display(aframe)
yy = get_row_pscore(col_name,f1,0,frame,genuinity_score,top_keywords.tolist(), 'Percentage')
profile_percentages = profile_percentages.append(yy,ignore_index=True)
print(yy)
display(profile_percentages)

Unnamed: 0,id,url
0,65783,http


{'user_id': 65783, 'url': 'http', 'Food': 42, 'Fashion': 1, 'Makeup': 1, 'Beauty': 3, 'Lifestyle': 5, 'Luxury': 14, 'Travel': 0, 'Photography': 2, 'Fitness': 3, 'Sports': 2, 'Gaming': 2, 'Entertainment': 3, 'Gadgets & Tech': 6, 'Finance': 3, 'Education': 2, 'Animal/Pet': 0, 'Health': 1, 'Self Improvement': 2, 'Art': 1, 'Parenting': 1, 'Books': 6, 'genuinity_score': 33.66054775694513, 'top keywords': "[['homemade', 'foodies', 'rice', 'salad', 'roti', 'taste', 'made', 'food', 'one', 'puri', 'good', 'chutney', 'kadi', 'masala', 'day', 'best', 'try', 'happy', 'fav', 'home']]"}


Unnamed: 0,user_id,url,Food,Fashion,Makeup,Beauty,Lifestyle,Luxury,Travel,Photography,...,Finance,Education,Animal/Pet,Health,Self Improvement,Art,Parenting,Books,genuinity_score,top keywords
0,65783,http,42,1,1,3,5,14,0,2,...,3,2,0,1,2,1,1,6,33.660548,"[['homemade', 'foodies', 'rice', 'salad', 'rot..."


## Testing new compute()

In [20]:
dfnew = pd.read_csv(r'E:\Winkl Mains\Task_4_Recategorisation\DATA\RandomInfluencers.csv')
# dfnew = dfnew[15:20]
display(dfnew)

Unnamed: 0,id,handle,name,url,gender,country,captions,bio
0,210026,bhavinkaklotar_26,Bhavin kaklotar,https://www.instagram.com//bhavinkaklotar_26/,Male,India,['😍😍😍\n.\n#vitmedia\xa0#portrait\xa0#portraito...,Mom💓
1,210028,dailykritisanon,Kriti Sanon 🇬🇧,https://www.instagram.com//dailykritisanon/,Other,India,"[""know your worth never settle for less 😎 // #...","slaying since 1990 ∞ her middle name is ""prett..."
2,210042,rameshwari__dassi,Rameshwari Sahu,https://www.instagram.com//rameshwari__dassi/,Female,India,['This festival bring home the most valuable b...,🙏🙏YoU aRe AmAzing😍 \n💞😘बन्दीछोड़ 😍💞\nBindass gi...
3,210043,darshan_.57,Darshan Tarsariya,https://www.instagram.com//darshan_.57/,Male,India,['#jikadarabhavin .#fashionindia #portraitoffi...,🖤son of prajapati 🤴\n🖤 single boy 💗\n🖤I believ...
4,210049,deepveerkeralafp,👑DEEPVEER👑 | FAN ACCOUNT,https://www.instagram.com//deepveerkeralafp/,Other,India,"['💜✨', 'His heart eyes for her😍💓✨', 'Happy the...",Welcome to the world of Deepveer💑👑\nDP mention...
5,250001,akhil_emoboy,AkHil eMoboy,https://www.instagram.com/akhil_emoboy,Male,India,['Angine palathum paraym emolyf💝💝#emo #keralag...,👉Paid promotion only 👈Modeling 💓ex tiktoker🎸🎸m...
6,250022,arushipatkeyy,Arushi patkey|Content Creator,https://www.instagram.com/arushipatkeyy,Female,India,"[""🎅🏻MEGA Xmas GIVEAWAY!🎅🏻\nMerry spirits flow ...",Stop making sense .Logic is predictable.\n THI...
7,250029,thatbloggernextdoor,Esha Wasey,https://www.instagram.com/thatbloggernextdoor,Female,India,['Winters are here so is my new skincare routi...,"Hyd, India\n✉️ esha.tbnd@gmail.com\nFounder: @..."
8,250040,himadri.official,HIMADRI,https://www.instagram.com/himadri.official,Female,India,"['🍁', '❄️', 'Winging it since forever♾✨ \n#sel...",Life’s too short to maintain an aesthetic feed...
9,250043,yasqueer_,Avantika⚡,https://www.instagram.com/yasqueer_,Female,India,"['🐶', '👉👈', 'Real Thug With Real Pug.', 'Miff ...",🏳️‍🌈\nFollow my bairn @sonofabitch55\n📍MFP | BBSR


In [17]:
x = requests.get('http://44.229.68.155/insta_users/get_uncategorized_accounts?limit=20&current_id=136587', headers={'Authorization': 'Token ruor7REQi9KJz6wIQKDXvwtt'})
status = x.status_code
data = x.json()
df = pd.DataFrame(data['users'])
dfnew = pd.DataFrame()
dfnew = pd.DataFrame(columns=['id','handle','name','url','gender','country','captions','bio'], data = df[['id','handle','name','url','gender','country','captions','bio']].values)
display(dfnew)
caps = dfnew.captions.iloc[0]
print(type(caps))
print(len(caps))
# print(caps)

Unnamed: 0,id,handle,name,url,gender,country,captions,bio
0,136599,divyankatripathidahiya,Divyanka Tripathi Dahiya,https://www.instagram.com/divyankatripathidahiya/,Female,India,"[#Welcome2021, Thanks for dragging me out of t...",An Artist
1,136621,varunverrma,Varun Verma,https://www.instagram.com/varunverrma/,Male,India,[I'm staying #Harpalfashionable with @amazonfa...,Fashion | lifestyle | travel\nMr India suprana...
2,136625,mekahairandmakeup,MEKA HAIR AND MAKEUP 🇦🇺,https://www.instagram.com/mekahairandmakeup/,Female,India,"[, Hair by #mekahairandmakeup \n#curleyhair #c...",BRIDAL | NON BRIDAL | HAIR/MAKEUP | SAREE DRAP...
3,136629,happywedding.lifeen,happywedding.life en,https://www.instagram.com/happywedding.lifeen/,Other,India,"[""We keep this love in a photograph""❤️❤️😍😍📷 .....",Happy Wedding.life is the Wedding Portal to fi...
4,136665,jasminhope_,JASMIN,https://www.instagram.com/jasminhope_/,Female,India,[#ad Rocking around the Christmas tree with my...,#Beauty. #Fashion. Home #blogger 💫🎄\nMy shop -...
5,136684,bikewithgirl,Priyanka Kochhar,https://www.instagram.com/bikewithgirl/,Female,India,[Spa Day with the brother @yomody be like...🧼🛁...,Just a girl that loves motorcycles and fast ca...
6,136689,nashi_cappuccino,Nashi,https://www.instagram.com/nashi_cappuccino/,Male,India,[ഒറ്റക്കൊമ്പൻ 🐘 😁#komban #shootonphone #elepha...,Founder @cappuccino_creatives \n 📽 FILMER...
7,136692,oursindia,Oursindia,https://www.instagram.com/oursindia/,Other,India,[The Tale of the Mountains !..________________...,Use - #oursindia Or #b_ind
8,136697,navaneeth_unnikrishnan,Navaneeth Unnikrishnan,https://www.instagram.com/navaneeth_unnikrishnan/,Male,India,"[Filtered Sun, Munnar, Kerala - The game of li...",Fragments of my Imagination\nAmbassador @sonya...
9,136701,shafilegacy,shafi legacy,https://www.instagram.com/shafilegacy/,Male,India,[#throwback #ride #life #calm #feel #mood #min...,𝙸 𝚆𝙰𝚂 𝙱𝙾𝚁𝙽 𝚃𝙾 𝙱𝙴 𝚁𝙴𝙰𝙻 𝙽𝙾𝚃 𝚃𝙾 𝙱𝙴 𝙿𝙴𝚁𝙵𝙴𝙲𝚃 🤘


<class 'list'>
289


In [18]:
categories = ['food', 'fashion', 'makeup', 'beauty', 'lifestyle','luxury', 'travel','photography','fitness','sports','gaming', 'entertainment', 'technology','investment','education', 'animal', 'health', 'inspiration','art','parenting','book']
categories_list = [['food', 'recipe', 'cooking'],
              ['fashion', 'outfit', 'clothes'],
              ['makeup','shades','haircare','face'],
              ['beauty','skin', 'oil', 'hair'],
              ['lifestyle','style'],
              ['luxury','rich','billionaire','car'],
              ['travel', 'world', 'destination', 'adventures', 'landscapes','bucket'],
              ['photography', 'photo', 'editing'],
              ['fitness', 'nutrition', 'workout', 'healthy', 'exercise','run'],
              ['sport', 'sports','win','loss'],
              ['gaming','stream','freefire','pubg'],
              ['entertainment', 'movies', 'series', 'film', 'comedy','actor','actress'],
              ['technology', 'tech', 'geek', 'smartphones', 'mobiles'],
              ['investment', 'financial', 'stocks', 'market', 'trade'],
              ['education', 'lectures', 'competitive', 'exams', 'coaching'],
              ['animal', 'wild', 'wildlife', 'nature','pet'],
              ['health', 'medical', 'prevention','cure', 'treatment', 'heal'],
              ['psychology', 'motivation', 'inspiration', 'mind','spiritual'],
              ['sketch','DIY','painting','art','drawing'],
              ['children','kids','infant','family','parenting','toys'],
              ['books','review','journaling','stationery','study']]
col_name = ['user_id','url','Food','Fashion', 'Makeup', 'Beauty', 'Lifestyle','Luxury', 'Travel','Photography','Fitness','Sports','Gaming', 'Entertainment', 'Gadgets & Tech','Finance','Education', 'Animal/Pet', 'Health', 'Self Improvement','Art', 'Parenting', 'Books', 'genuinity_score','top keywords']
API_categories = ['Food','Fashion', 'Makeup', 'Beauty', 'Lifestyle','Luxury', 'Travel','Photography','Fitness','Sports','Gaming', 'Entertainment', 'Gadgets & Tech','Finance','Education', 'Animal/Pet', 'Health','Self Improvement','Art', 'Parenting', 'Books']

In [19]:
profile_percentages =  pd.DataFrame(columns = col_name)
idsdone =0
for i in range(len(dfnew)):

    try:
        #Store userid | caption | total posts
        userid = dfnew['id'].iloc[i]
        captions = dfnew['captions'].iloc[i]
        bio = dfnew['bio'].iloc[i]
        total_posts = len(captions)

        # Words which mostly occurs in insta post and we want to avoid considering them for the sake of accuracy of results
        avoidwords = ['verified','none','follow','like','reposted','influencer','gmail','com','collabs','collaboration']

        #Converting to keywords
        captions,genuinity_score = process(captions,avoidwords)
        caption_array = captions[0]
        bio,ignore_this_var = process(bio,avoidwords)
        bio_array = bio[0]
        # Punishing accounts which has less than 3 words in caption
        print(len(caption_array),5*(total_posts))
        if len(caption_array) < 5*(total_posts):
            raise Exception("Too less words for categorization")

        #Temporary array i-> interim
        icaption_array = [z for z in caption_array]
        ibio_array = [z for z in bio_array]
        # Removing words not in dictionary also single characters
        discarded_words = []
        discarded_bio_words = []
        for x in caption_array:
            try:
                checkword = w.similarity(x,'something') #Check word if exist in googlenews
                if len(x) <=2: #Removing single character
                    icaption_array.pop(icaption_array.index(x))
            except KeyError:
                discarded_words.append(icaption_array.pop(icaption_array.index(x)))
        for x in bio_array:
            try:
                checkword = w.similarity(x,'something') #Check word if exist in googlenews
                if len(x) <=2: #Removing single character
                    ibio_array.pop(ibio_array.index(x))
            except KeyError:
                discarded_bio_words.append(ibio_array.pop(ibio_array.index(x)))
        #Restore Array
        caption_array = [z for z in icaption_array]
        bio_array = [z for z in ibio_array]
        # Check similarity in discarded words
        discard_word_scores = [0]*len(categories)
        discard_bio_word_scores = [0]*len(categories)
        for x_word in discarded_words:
            for category_index in range(len(categories_list)):
                for the_category in categories_list[category_index]:
                    if the_category in x_word:
                        discard_word_scores[category_index] = 1 + discard_word_scores[category_index]
        for x_word in discarded_bio_words:
            for category_index in range(len(categories_list)):
                for the_category in categories_list[category_index]:
                    if the_category in x_word:
                        discard_bio_word_scores[category_index] = 1 + discard_bio_word_scores[category_index]           
        if len(caption_array) ==0:
            raise Exception("No Words in profile for categorization or Different language")

        # Word2vec computation
        frame = pd.DataFrame()
        frame, top_keywords = compute2(caption_array,categories_list,categories,3) ##############
        # Word2vec for bio
        frame_bio = pd.DataFrame()
#         frame_bio,ignore_keywords = compute2(bio_array,categories_list,categories,2) ###################
#         # Storing bio scores
#         bio_score = frame_bio['Scores'].tolist()
        # Compute() for bio corpus
        bio_score = [0]*len(categories)
        for x_word in bio_array:
            for category_index in range(len(categories_list)):
                for the_category in categories_list[category_index]:
                    if the_category in x_word:
                        bio_score[category_index] = 1 + bio_score[category_index]
    
        for sc in range(len(bio_score)):
            bio_score[sc] = bio_score[sc] + discard_bio_word_scores[sc] 

        # Add up computed score with discarded score
        score_column = frame['Scores'].tolist()
        for ind in range(len(score_column)):
            score_column[ind] = score_column[ind] + discard_word_scores[ind]

        # Add weighted score of bio in main score
        normalize_bio_score = normalizeSD(bio_score,3)
#         print("bio score: ",bio_score)
#         print("bio norm: ",normalize_bio_score)
        for sc in range(len(normalize_bio_score)):
            if normalize_bio_score[sc] == 1:
                score_column[sc] = score_column[sc]*2
        frame['Scores'] = score_column

        #Convert to Percentage
        per = frame['Scores'].tolist()
        per_sum = sum(per)
        for x in range(len(per)):
            temp_number = (float)(per[x])
            per[x] = round((temp_number/per_sum)*100)
        frame['Percentage'] = per


        #Store profile percentage
        row_df_5 = get_row_pscore(col_name,dfnew,i,frame,genuinity_score,top_keywords.tolist(),'Percentage')
        profile_percentages = profile_percentages.append(row_df_5,ignore_index=True)

        # POST API Request
#                 file = to_dict_api(frame['Percentage'].tolist(),API_categories,top_keywords,dfnew,i)
#             url = 'http://44.229.68.155/insta_user/add_category_to_insta_user'
#             y = requests.post(url, data = file,headers={'Authorization': 'Token ruor7REQi9KJz6wIQKDXvwtt'})

#             if y.status_code !=200:
#                 raise Exception("Post request error {}".format(y.status_code))

        print("ID no. {} Done! Total {} ids done".format(userid,idsdone))
        idsdone = idsdone +1
        
    except Exception as Argument:
        # creating/opening a file 
        f = open(r"E:\Winkl Mains\Task_4_Recategorisation\DATA\errorfile.txt", "a") 
        # writing in the file 
        f.write("Userid\t"+str(userid)+"\t: "+str(Argument)+str("\n")) 
        # closing the file 
        f.close()
        

display(profile_percentages)
profile_percentages.to_csv(r"E:\Winkl Mains\Task_4_Recategorisation\DATA\Random_account_improve4.csv")

3757 1445
ID no. 136599 Done! Total 0 ids done
4949 1695
ID no. 136621 Done! Total 1 ids done
3574 755
ID no. 136625 Done! Total 2 ids done
5006 540
ID no. 136629 Done! Total 3 ids done
5459 1050
ID no. 136665 Done! Total 4 ids done
12782 1580
ID no. 136684 Done! Total 5 ids done
1433 740
ID no. 136689 Done! Total 6 ids done
6192 540
ID no. 136692 Done! Total 7 ids done
7794 1200
ID no. 136697 Done! Total 8 ids done
8512 1460
ID no. 136716 Done! Total 9 ids done
9801 980
ID no. 136722 Done! Total 10 ids done
5998 1380
ID no. 136733 Done! Total 11 ids done
1538 545
ID no. 136737 Done! Total 12 ids done
10317 1560
ID no. 136743 Done! Total 13 ids done
7486 1325
ID no. 136746 Done! Total 14 ids done
6104 1260
ID no. 136752 Done! Total 15 ids done
9116 985
ID no. 136770 Done! Total 16 ids done
12155 1310
ID no. 136774 Done! Total 17 ids done
2812 810
ID no. 136786 Done! Total 18 ids done


Unnamed: 0,user_id,url,Food,Fashion,Makeup,Beauty,Lifestyle,Luxury,Travel,Photography,...,Finance,Education,Animal/Pet,Health,Self Improvement,Art,Parenting,Books,genuinity_score,top keywords
0,136599,https://www.instagram.com/divyankatripathidahiya/,2,3,6,5,3,4,7,2,...,2,4,4,5,10,7,7,4,81.241851,"love,life,one,beautiful,best,hair,women,outfit..."
1,136621,https://www.instagram.com/varunverrma/,2,8,5,3,6,3,12,3,...,2,4,3,3,7,4,6,4,77.982795,"photography,fashion,link,get,day,love,bio,indi..."
2,136625,https://www.instagram.com/mekahairandmakeup/,1,6,28,24,3,1,6,3,...,0,1,1,1,3,6,3,3,16.900676,"makeup,hair,beautiful,mua,saree,melbourne,phot..."
3,136629,https://www.instagram.com/happywedding.lifeen/,3,5,4,6,5,2,21,2,...,4,2,3,2,6,5,9,6,43.44815,"wedding,life,image,https,credit,get,inspired,w..."
4,136665,https://www.instagram.com/jasminhope_/,3,8,6,11,3,4,6,1,...,2,3,4,4,7,4,7,5,80.485175,"new,love,get,perfect,christmas,dress,gifted,al..."
5,136684,https://www.instagram.com/bikewithgirl/,1,3,4,3,3,15,6,2,...,1,3,4,3,7,4,7,3,62.017341,"bikes,motorcycle,india,superbike,bike,racing,b..."
6,136689,https://www.instagram.com/nashi_cappuccino/,2,4,4,4,3,5,7,3,...,1,4,4,2,7,4,6,4,40.608543,"cappuccino,polo,part,day,creatives,kerala,irfa..."
7,136692,https://www.instagram.com/oursindia/,1,2,3,3,2,3,12,14,...,2,3,4,2,6,7,5,5,64.945557,"artist,india,pictures,videos,gallery,get,shari..."
8,136697,https://www.instagram.com/navaneeth_unnikrishnan/,1,2,6,5,3,5,13,5,...,2,3,7,3,6,6,4,4,72.018266,"shot,landscape,travel,photography,image,icelan..."
9,136716,https://www.instagram.com/rehanponcha/,2,2,4,3,3,3,14,1,...,2,4,3,4,16,4,6,3,86.8949,"love,fitness,one,time,sport,travel,watch,golf,..."


In [16]:
wrds = ['psychology', 'motivation', 'inspire', 'mind','spiritual']
for i in categories_list[17]:
    print(i,w.similarity(i,'happy'))

psychology 0.017473541
motivation 0.23326564
inspiration 0.14630651
mind 0.198841
spiritual 0.095140815
