In [97]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from datetime import date

def getGeneralDataframe(jsonPath, csvPath):
  """Build the general posts DataFrame from the recognition JSON and the influencer CSV.

  Parameters:
    jsonPath: path to a JSON file whose top-level "output" key holds a list of
      post records with the keys gender, user, followers, likes, item0, item1
      and date.
    csvPath: path to a CSV file with at least the columns "username" and
      "score"; the score of every post's user is looked up in it.

  Returns:
    DataFrame with the columns gender, user, followers, likes, score, item0,
    item1 and date, indexed 0..n-1. Posts whose likes are missing or an empty
    string are dropped, followers and likes are cast to int and date is parsed
    to datetime.

  Raises:
    KeyError: if a post's user does not appear in the CSV "username" column.
  """
  columns = ["gender", "user", "followers", "likes", "score", "item0", "item1", "date"]

  scores = pd.read_csv(csvPath).set_index("username")["score"]

  # JSON is UTF-8 by specification; do not depend on the platform locale.
  with open(jsonPath, encoding="utf-8") as f:
    posts = json.load(f)["output"]

  # Collect all rows first and build the frame once: enlarging a DataFrame
  # with df.loc[i] = ... copies the data on every insertion.
  records = [
    {
      "gender": post["gender"],
      "user": post["user"],
      "followers": post["followers"],
      "likes": post["likes"],
      "score": scores.loc[post["user"]],
      "item0": post["item0"],
      "item1": post["item1"],
      "date": post["date"],
    }
    for post in posts
  ]
  df = pd.DataFrame(records, columns=columns)

  # Keep only posts that report a like count. Filtering with a mask avoids the
  # chained `df[col].replace(..., inplace=True)` pattern, which does not update
  # the parent frame when pandas Copy-on-Write is active.
  likes = df["likes"]
  has_likes = likes.notna() & (likes != "")
  df = df.loc[has_likes].reset_index(drop=True)

  # The JSON stores the counters as strings; convert them to numbers/dates.
  df["followers"] = df["followers"].astype(int)
  df["likes"] = df["likes"].astype(int)
  df["date"] = pd.to_datetime(df["date"])

  return df


In [98]:
def getFilteredDataframe(df,gender):
  """Return the rows of the general DataFrame that belong to one gender.

  Parameters:
    df: DataFrame with a "gender" column.
    gender: value to keep, written exactly as it appears in the JSON
      "gender" field.

  Returns:
    New DataFrame holding only the matching rows, re-indexed from 0.
  """
  matches_gender = df["gender"] == gender
  return df[matches_gender].reset_index(drop=True)

In [99]:
def getPossibleTrend(df):
  """Score every post against the average engagement of its follower interval.

  The posts are grouped by the follower count of their author into the
  intervals [0, 5000), [5000, 20000), [20000, 50000), [50000, 200000),
  [200000, 500000) and [500000, inf).

  Parameters:
    df: DataFrame with numeric 'likes' and 'followers' columns. It is
      modified in place; any index is accepted.

  Returns:
    The same DataFrame object with three added columns:
      post_engagement -- likes / followers * 100
      avg_interval    -- mean post_engagement of the post's follower interval
      value           -- (post_engagement - avg_interval) * 0.5
    Rows whose follower count is negative or NaN belong to no interval and
    get avg_interval = 0.0 and value = 0.0.

  Note:
    A follower count of 0 is not special-cased, so its post_engagement is
    non-finite and propagates into the average of the first interval.
  """
  # Inclusive lower bound of each follower interval; the last one is open-ended,
  # so an account with exactly 500000 followers lands in the last interval.
  lower_bounds = np.array([0, 5000, 20000, 50000, 200000, 500000], dtype=float)
  # Weight applied to the distance from the interval average (same for all intervals).
  interval_weight = 0.5

  df['post_engagement'] = (df['likes'] / df['followers']) * 100

  # Work on positional arrays so the result does not depend on the index labels.
  followers = df['followers'].to_numpy(dtype=float)
  engagement = df['post_engagement'].to_numpy(dtype=float)

  # side='right' makes every lower bound inclusive; values below the first
  # bound come out as -1. NaN would sort past the last bound, so reset it.
  interval = np.searchsorted(lower_bounds, followers, side='right') - 1
  interval[np.isnan(followers)] = -1
  in_interval = interval >= 0

  # Average engagement per interval, written back onto the rows of that interval.
  avg_interval = np.zeros(len(df))
  for k in range(len(lower_bounds)):
    members = interval == k
    if members.any():
      avg_interval[members] = engagement[members].sum() / members.sum()

  df['avg_interval'] = avg_interval
  df['value'] = np.where(in_interval, (engagement - avg_interval) * interval_weight, 0.0)

  return df

In [100]:
def labelsDF_finalscore(df, item):
  """Score every label found in one item column of a posts DataFrame.

  Parameters:
    df: posts DataFrame holding the columns 'item0', 'item1' and 'value',
      plus the column named by `item`.
    item: name of the column whose labels are ranked, for example 'item0'.

  Returns:
    DataFrame with one row per distinct label of `item` (rows equal to
    "N/A N/A" are ignored), ordered from most to least frequent, with columns:
      item        -- the label
      counts      -- number of posts carrying the label in `item`
      valid       -- 0 for the most frequent 30% of the labels, 1 for the rest
      final_score -- sum of 'value' over all posts whose 'item0' or 'item1'
                     equals the label; 0.0 when valid is 0
  """
  # Count each label once, skipping the "N/A N/A" placeholder.
  label_counts = df.loc[df[item] != "N/A N/A", item].value_counts()
  labels = label_counts.index.tolist()
  df_item = pd.DataFrame({"item": labels, "counts": label_counts.tolist()},
                         columns=["item", "counts"])

  # Discard the top 30% of the histogram: value_counts() is sorted from most
  # to least frequent, so the first `first_valid` rows are the excluded ones.
  first_valid = round(len(labels) * 0.3)
  valid = [0] * first_valid + [1] * (len(labels) - first_valid)
  df_item['valid'] = valid

  # Sum the 'value' of every post that shows the label in either item column.
  # skipna=False keeps a NaN value visible in the score instead of hiding it.
  item0, item1, values = df['item0'], df['item1'], df['value']
  scores = [
    values[(item0 == label) | (item1 == label)].sum(skipna=False)
    for label in labels
  ]

  # Excluded labels get a plain 0.0 (multiplying by the flag would leave -0.0
  # for negative sums and NaN for non-finite ones).
  df_item['final_score'] = pd.Series(
    [score if keep else 0.0 for score, keep in zip(scores, valid)],
    dtype=float,
  )

  return df_item

In [101]:
def getwinner(df_labels):
    """Return the label with the highest final score.

    Parameters:
        df_labels: DataFrame with the columns 'item' and 'final_score'. When it
            also has a 'valid' column, only rows with valid == 1 can win, so a
            label whose score was forced to 0 cannot beat a valid label with a
            negative score.

    Returns:
        The 'item' of the best-scoring candidate; on ties the last tied row
        wins. Returns None when there is no candidate with a non-NaN score
        (for example an empty frame).
    """
    candidates = df_labels
    if 'valid' in candidates.columns:
        candidates = candidates[candidates['valid'] == 1]

    # Positional index so the lookup below does not depend on the row labels.
    items = candidates['item'].reset_index(drop=True)
    scores = candidates['final_score'].astype(float).reset_index(drop=True).dropna()
    if scores.empty:
        return None

    # index[-1] picks the last of the tied maxima.
    best_position = scores[scores == scores.max()].index[-1]
    return items[best_position]

In [102]:
if __name__ == "__main__":
  # Every post of the recognition output together with the influencer score
  postsDF = getGeneralDataframe("recognition-output.json", "influencers.csv")

  # Independent per-gender copies of the posts
  df_w = getFilteredDataframe(postsDF, "WOMAN").copy()
  df_m = getFilteredDataframe(postsDF, "MAN").copy()

  # Add the 'value' column: (post engagement - interval average) * interval weight
  possible_w, possible_m = getPossibleTrend(df_w), getPossibleTrend(df_m)

  # Per-label counts, valid flag (1/0) and final_score (sum of the values of a label);
  # 'item0' feeds the "Top" frames and 'item1' the "Bottom" frames
  finalTopLabelsDF_w, finalBottomLabelsDF_w = (
    labelsDF_finalscore(possible_w, column) for column in ('item0', 'item1')
  )
  finalTopLabelsDF_m, finalBottomLabelsDF_m = (
    labelsDF_finalscore(possible_m, column) for column in ('item0', 'item1')
  )

  # Best-scoring label of every frame
  winnerTop_w, winnerBottom_w, winnerTop_m, winnerBottom_m = map(
    getwinner,
    (finalTopLabelsDF_w, finalBottomLabelsDF_w, finalTopLabelsDF_m, finalBottomLabelsDF_m),
  )

  print(winnerTop_w, winnerBottom_w, winnerTop_m, winnerBottom_m)

jersey verde oscuro pantalon_corto verde oscuro abrigo_manga_larga naranja pantalones azul marino


In [106]:
# Show the scored item1 ("bottom") labels computed for the WOMAN posts.
# NOTE(review): this cell's execution count is higher than that of the cells
# below it, so it was run out of order - re-run the notebook top to bottom to
# confirm the displayed table is current.
finalBottomLabelsDF_w

Unnamed: 0,item,counts,valid,final_score
0,pantalones negro,25,0,-0.0
1,pantalones gris,23,0,0.0
2,pantalones verde oscuro,7,0,-0.0
3,pantalones marron,5,0,-0.0
4,falda verde oscuro,3,0,0.0
5,pantalones caqui,3,0,-0.0
6,pantalon_corto verde oscuro,2,1,1.598459
7,pantalon_corto negro,2,1,0.051621
8,falda negro,2,1,1.032923
9,pantalones beige,2,1,-0.721096


In [104]:
import csv

# Column titles of the exported file
header = ['prendas', 'genero']

# One row per winning garment: men first (top, bottom), then women (top, bottom)
data = [
    [garment, gender]
    for gender, garments in (
        ('hombre', (winnerTop_m, winnerBottom_m)),
        ('mujer', (winnerTop_w, winnerBottom_w)),
    )
    for garment in garments
]

with open('trend-items.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # header line followed by the data rows, written in a single call
    writer.writerows([header, *data])

In [105]:

# Scratch inspection cell: the commented-out names below are other
# intermediate results that were displayed while developing the notebook.
#postsDF
#topLabelsDF
#bottomLabelsDF
#df_w
#df_m
#print(possible_w)
#pd.set_option('display.max_rows', None)
#possible_m
# Show the scored item0 ("top") labels computed for the MAN posts.
finalTopLabelsDF_m


Unnamed: 0,item,counts,valid,final_score
0,abrigo_manga_larga negro,20,0,-0.0
1,jersey gris,14,0,0.0
2,jersey negro,13,0,0.0
3,abrigo_manga_larga verde oscuro,8,0,-0.0
4,abrigo_manga_larga gris,8,0,-0.0
5,abrigo_manga_larga marron,7,0,-0.0
6,jersey marron,6,0,0.0
7,jersey beige,4,0,-0.0
8,jersey blanco,4,1,0.825186
9,camiseta_manga_corta negro,4,1,-0.454657
