In [1]:
def clean_text(df, col_name, new_col_name):
    """Add a normalised copy of a text column to ``df``.

    The text is lower-cased and stripped of surrounding whitespace, then
    ``@handle`` tokens (``@`` followed by ASCII letters/digits) and every
    character other than ASCII letters, digits, ``.``, ``%``, space and tab
    are removed.

    Missing values in ``col_name`` are propagated as missing values instead
    of raising ``TypeError``.

    Inputs:
    - df: dataframe holding the column to clean (modified in place)
    - col_name: name of the existing text column
    - new_col_name: name of the column that receives the cleaned text

    Output: the same dataframe object, with ``new_col_name`` added/overwritten
    """
    # column values to lower case, without leading/trailing whitespace
    normalised = df[col_name].str.lower().str.strip()
    # removes @handles and special characters; the vectorised string method
    # skips NaN entries, which a per-element re.sub call cannot handle
    df[new_col_name] = normalised.str.replace(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z.% \t])", "", regex=True
    )
    return df

In [1]:
def group_selection(df, product):
    """
    For a single product, collect the rows that link it to its candidates,
    keeping one row per (product_name, candidate) pair.

    All other columns of the first occurrence of each pair are kept, so a
    similarity-ratio column (if the dataframe has one) stays available.

    Inputs:
    - df: dataframe of matches with at least the columns 'product_name' and
      'candidate' (described by the original author as the threshold-80
      match table covering all products)
    - product: the name of the product to work with

    Output: single product similarity group dataframe, re-indexed from 0
    """
    # rows that belong to the requested product
    is_product = df['product_name'] == product
    pair_columns = ['product_name', 'candidate']
    # one row per distinct candidate of that product
    return (
        df.loc[is_product]
        .drop_duplicates(subset=pair_columns)
        .reset_index(drop=True)
    )

In [2]:
def create_product_df(groups_df, product, product_group_list):
    """
    Build the rows describing one new group: one row per member, all sharing
    the same leader and the same freshly allocated group id.

    Inputs:
    - groups_df: dataframe of the groups created so far; when it has rows it
      must contain a 'group_id' column
    - product: value stored in the 'leader' column of every row
    - product_group_list: members of the group, one per output row

    Output: dataframe with the columns 'group_id', 'leader' and 'member'
    """
    # first group gets id 0, afterwards one past the highest id in use
    next_group_id = 0
    if len(groups_df.index) > 0:
        next_group_id = groups_df['group_id'].max() + 1

    columns = {
        'group_id': next_group_id,
        'leader': product,
        'member': product_group_list,
    }
    return pd.DataFrame(columns)

In [3]:
def create_track_temp_df(track_df, applicants_list):
    """
    Build the tracking rows for one new group: one row per applicant, all
    carrying the same freshly allocated group id.

    Inputs:
    - track_df: tracking dataframe built so far; when it has rows it must
      contain a 'group_id' column
    - applicants_list: members to register, one per output row

    Output: dataframe with the columns 'group_id' and 'member'
    """
    has_rows = len(track_df.index) > 0
    # first group gets id 0, afterwards one past the highest id in use
    next_group_id = track_df['group_id'].max() + 1 if has_rows else 0
    return pd.DataFrame({'group_id': next_group_id, 'member': applicants_list})

In [4]:
def group_concat_procedure(groups_df, track_df, product_, applicants_list):
    """
    Register the similarity group of one product, merging it with every
    existing group that already contains one of its applicants.

    If none of the applicants is tracked yet, a new group led by ``product_``
    is appended. Otherwise all groups that contain at least one applicant are
    removed and replaced by a single group, led by ``product_``, whose
    members are the union of the removed groups' leaders and members plus the
    applicants (first-seen order, no duplicates).

    Inputs:
    - groups_df: global groups dataframe ('group_id', 'leader', 'member')
    - track_df: tracking dataframe ('group_id', 'member') mirroring which
      member is assigned to which group
    - product_: leader of the group being added
    - applicants_list: products that belong to the leader's group

    Output: tuple (groups_df, track_df) with the updated dataframes; the
    inputs themselves are not modified
    """
    applicants = list(applicants_list)

    # group ids where any of the applicants is already assigned; an empty
    # tracker (possibly without columns yet) cannot contain any of them
    if len(track_df.index) > 0:
        is_tracked = track_df['member'].isin(applicants)
        group_ids_list = list(track_df.loc[is_tracked, 'group_id'].unique())
    else:
        group_ids_list = []

    members = applicants
    if group_ids_list:
        # locate the groups that have to be merged
        select_df = groups_df[groups_df['group_id'].isin(group_ids_list)]
        # everybody who already belongs to them, leaders included
        already_members = list(pd.unique(select_df[['leader', 'member']].values.ravel('K')))
        # union of already members + applicants; dict.fromkeys keeps the
        # first-seen order so the result is deterministic (a set is not)
        members = list(dict.fromkeys(already_members + applicants))
        # remove the merged groups from both dataframes
        groups_df = groups_df[~groups_df['group_id'].isin(group_ids_list)].copy()
        track_df = track_df[~track_df['group_id'].isin(group_ids_list)].copy()

    # add the (new or merged) group to the global groups df
    concat_df = create_product_df(groups_df, product_, members)
    groups_df = pd.concat([groups_df, concat_df], axis=0).reset_index(drop=True)
    # track every member of the group, not only the new applicants: members
    # inherited from merged groups must stay discoverable by later calls,
    # otherwise they could end up assigned to a second group
    df_track_temp = create_track_temp_df(track_df, members)
    track_df = pd.concat([track_df, df_track_temp], axis=0).reset_index(drop=True)
    return groups_df, track_df