Required files in the working folder: influencers.csv, recognition-output.json, trend-items.csv, items-record.csv


In [29]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from datetime import date 

In [1]:
def getGeneralDataframe(jsonPath, csvPath):
  """Build the general DataFrame from the recognition JSON and the influencer CSV.

  Parameters:
    jsonPath: path to a JSON file whose top-level "output" entry is a list of
      records; each record is read for the keys "gender", "user", "followers",
      "likes", "item0", "item1" and "date".
    csvPath: path to a CSV file with a "username" column (used as lookup key)
      and a "score" column.

  Returns:
    A DataFrame with the columns gender, user, followers, likes, score, item0,
    item1 and date, one row per JSON record. Blank strings are replaced by 0
    and the followers and likes columns are cast to int.

  Raises:
    KeyError: if a record lacks one of the keys above or its user is not a
      username of the CSV file.
  """
  columns = ["gender", "user", "followers", "likes", "score", "item0", "item1", "date"]

  # Score of every influencer, indexed by username
  scores = pd.read_csv(csvPath).set_index("username")["score"]

  # JSON is UTF-8 by specification, so do not depend on the platform default
  with open(jsonPath, encoding="utf-8") as f:
    records = json.load(f)["output"]

  # Collect all rows first and build the frame once: assigning df.loc[i] row by
  # row re-enlarges the frame on every iteration
  rows = [
    [
      record["gender"],
      record["user"],
      record["followers"],
      record["likes"],
      scores.loc[record["user"]],
      record["item0"],
      record["item1"],
      record["date"],
    ]
    for record in records
  ]

  # dtype=object keeps the raw values untouched until the explicit casts below
  df = pd.DataFrame(rows, columns=columns, dtype=object)

  # Replace empty (or whitespace-only) strings with 0
  df = df.replace(r'^\s*$', 0, regex=True)

  # Change the type of the followers and likes variables (string to int)
  df['followers'] = df['followers'].astype(int)
  df['likes'] = df['likes'].astype(int)

  return df

In [2]:
def getFilteredDataframe(df,gender):
  """ Return the rows of the general dataframe that belong to one gender.
      Parameters: Dataframe, gender (written exactly as it appears in the
      "gender" column, e.g. "WOMAN" or "MAN") """
  # Boolean mask: True for the rows whose gender matches
  sameGender = df["gender"] == gender
  return df.loc[sameGender]

In [3]:
def getHistogram(df,itemCol,title):
  """ Show a bar chart with the number of appearances of every item.
      Parameters: Dataframe, name of the column that contains the item
      (example: 'item0'), title of the chart """

  # Rows tagged "N/A N/A" carry no item, so they are left out of the count
  hasItem = df[itemCol] != "N/A N/A"
  appearances = df[hasItem][itemCol].value_counts()

  # One bar per item, in the order returned by value_counts
  appearances.plot(kind='bar')
  plt.title(title)
  plt.xlabel('Item')
  plt.ylabel('Quantity')
  plt.show()

In [4]:
def getPossibleTrend(df,item,topFraction=0.4):
  """
  Rank the items of one column by number of appearances and flag the ones that
  may become trends, i.e. the ones outside the top share of the histogram.

  Parameters:
    df: Dataframe holding the item column.
    item: name of the column that contains the item. Example 'item0'.
    topFraction: share of the distinct items, counted from the most frequent
      one, that is discarded. Defaults to 0.4 (the top 40%).

  Returns:
    A dataframe with one row per distinct item, ordered as value_counts orders
    them, with the columns:
      item   - the item name
      counts - how many rows of df hold that item
      valid  - 0 if the item is inside the discarded top share, 1 otherwise
  """

  # Discard the NA rows and count every item once; the original recomputed
  # value_counts() on every loop iteration
  counts = df.loc[df[item] != "N/A N/A", item].value_counts()
  total = len(counts)

  # Number of leading rows inside the discarded top share, clamped so that an
  # out-of-range topFraction cannot produce a column of the wrong length
  start = min(total, max(0, round(total * topFraction)))

  # Build the frame from the index and the values directly: counts[i] would be
  # a label lookup (not a position) whenever the items are integers
  df_item = pd.DataFrame(
    {"item": counts.index.tolist(), "counts": counts.tolist()},
    columns=["item", "counts"],
  )

  # valid = 0 for the discarded top share, 1 for the remaining candidates
  df_item["valid"] = [0] * start + [1] * (total - start)

  return df_item

In [9]:
if __name__ == "__main__":
  # Load every post and split the posts by gender
  df = getGeneralDataframe("recognition-output.json", "influencers.csv")
  df_w, df_m = (getFilteredDataframe(df, gender) for gender in ("WOMAN", "MAN"))

  # Histograms, currently disabled:
  # getHistogram(df_w,'item0',"Woman top items histogram")
  # getHistogram(df_w,'item1',"Woman bottom items histogram")
  # getHistogram(df_m,'item0',"Man top items histogram")
  # getHistogram(df_m,'item1',"Man bottom items histogram")

  # Possible trends per gender: item0 first, then item1
  df_item0_w, df_item1_w = (getPossibleTrend(df_w, column) for column in ('item0', 'item1'))
  df_item0_m, df_item1_m = (getPossibleTrend(df_m, column) for column in ('item0', 'item1'))
  
 

In [10]:
df_w


Unnamed: 0,gender,user,followers,likes,score,item0,item1,date
0,WOMAN,raquelreitx,514246,24643,0.5,vestido_de_chaleco negro,N/A N/A,07/11/2021
1,WOMAN,raquelreitx,514246,23158,0.5,N/A N/A,N/A N/A,02/11/2021
2,WOMAN,mariapombo,2236175,56418,0.5,N/A N/A,N/A N/A,03/11/2021
3,WOMAN,mariapombo,2236175,39770,0.5,N/A N/A,N/A N/A,31/10/2021
4,WOMAN,mariapombo,2236175,50204,0.5,jersey gris,pantalones gris,30/10/2021
...,...,...,...,...,...,...,...,...
165,WOMAN,nadia.correiia,8226,140,0.5,N/A N/A,N/A N/A,31/10/2021
166,WOMAN,nadia.correiia,8226,314,0.5,N/A N/A,N/A N/A,27/10/2021
167,WOMAN,nadia.correiia,8226,113,0.5,N/A N/A,N/A N/A,27/10/2021
168,WOMAN,heytammi,10489,161,0.5,jersey marron,N/A N/A,07/11/2021


In [11]:
df_item0_w

Unnamed: 0,item,counts,valid
0,jersey negro,17,0
1,abrigo_manga_larga negro,15,0
2,abrigo_manga_larga gris,15,0
3,jersey gris,14,0
4,abrigo_manga_larga marron,12,0
5,jersey verde oscuro,9,0
6,jersey marron,8,0
7,abrigo_manga_larga verde oscuro,4,1
8,jersey beige,3,1
9,jersey rosa,2,1


In [49]:

# Record, for the four tracked trend items, which percentage of its own
# histogram each one takes today, and save the result as a new dated column.

# Item counts indexed by item name, one frame per histogram
df1 = df_item0_m.set_index("item")
df2 = df_item1_m.set_index("item")
df3 = df_item0_w.set_index("item")
df4 = df_item1_w.set_index("item")

dfTrends = pd.read_csv("trend-items.csv")
dfRecord = pd.read_csv("items-record.csv")
length = len(dfRecord)

# Work on a copy so that adding today's column does not also modify dfTrends
dfAux = dfTrends.copy()
# NOTE(review): the column label is month/day/year while the dates in the
# recognition data look like day/month/year - confirm the intended format
today = date.today().strftime("%m/%d/%y")

# Row i of the "prendas" column is looked up in histogram i. Every percentage
# is relative to the total of its OWN histogram (the original divided all four
# by the total of df1) and all four are floats rounded to two decimals (the
# original formatted only the last one, as a string).
percentatge = [
  round(float(frame.loc[prenda].counts) / float(frame["counts"].sum()) * 100, 2)
  for frame, prenda in zip((df1, df2, df3, df4), dfTrends["prendas"])
]
# Individual names kept for any later cell that still reads them
percentatge1, percentatge2, percentatge3, percentatge4 = percentatge

dfAux[today] = percentatge

# NOTE(review): this writes trend-items.csv plus today's column over the record
# file; dfRecord is loaded above but never merged, so earlier dated columns are
# not carried over - confirm whether the history should be preserved
dfAux.to_csv (r'/Users/JoanVargas/Documents/GitHub/PAE_Accenture/DataProcessing/extra/items-record.csv', index = False, header=True)









#dfRecord
  #  [data['output'][i]["gender"],data['output'][i]["user"], data['output'][i]["followers"],data['output'][i]["likes"], csvdata.loc[data["output"][i]["user"]].score, data['output'][i]["item0"], data['output'][i]["item1"],data['output'][i]["date"]]

#TODO: fill the dfRecord dataframe with one more week each time the data is read.
#TODO: fill in the previous week with its value.

  
  #df.to_csv('file_name.csv')

  

In [41]:
# Scratch check: format a float with two decimals
a = 3.12314
format(a, ".2f")

'3.12'