## STRUCTURE
- Process initially found solutions
    - Get initial solutions processing from SQL
    - Extract the data in a structured way
    - write results to sql
- Cluster solutions
    - Pivot solutions data (melt)
    - Get embeddings for each solution
    - Cluster the solutions based on embedings
- Generate product improvements
    - get product data
    - pass product data and clustered solutions to ai, get answers
    - process resulting dictionaries
    - save results

In [3]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import json
import os

import tiktoken
from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import openai
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if os.getenv("OPENAI_API_KEY") is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

# Create an SQLAlchemy engine to connect to the database
engine = create_engine('postgresql://postgres:mysecretpassword@localhost/postgres')

# Read the ASIN values from the CSV file
asin_list_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv'
#asin_list_path = './data/external/asin_list.csv'
asin_list = pd.read_csv(asin_list_path)['asin'].tolist()

OPENAI_API_KEY is ready


#### Process initially found solutions
    - Get initial solutions processing from SQL
    - Extract the data in a structured way

In [4]:
query = f"""
    SELECT DISTINCT asin, cluster_label, type, solutions
    FROM weighted_trait_graph 
    WHERE asin IN ({','.join(['%s']*len(asin_list))});
"""

# convert the list to a tuple and put it inside a dictionary
params = {'asin_list': tuple(asin_list)}

weighted_trait_df_graph =  pd.read_sql_query(query, engine, params=asin_list)

In [5]:
for i in weighted_trait_df_graph.index:
    try:
        json_string = weighted_trait_df_graph.solutions.iloc[i]
        if json_string:
            data = json.loads(json_string)
            if isinstance(data, dict):
                # Access 'Problem Statement' key
                problem_statement = data.get('Problem Statement')
                if problem_statement:
                    weighted_trait_df_graph.at[i, 'cluster_problem_statement'] = problem_statement
                
                # Access 'Solution 1' key
                solution_1 = data.get('Solution 1')
                if solution_1 and isinstance(solution_1, dict):
                    solution_1_title = solution_1.get('Title')
                    solution_1_description = solution_1.get('Description')
                    weighted_trait_df_graph.at[i, 'cluster_solution_1_title'] = solution_1_title
                    weighted_trait_df_graph.at[i, 'cluster_solution_1_description'] = solution_1_description
                
                # Access 'Solution 2' key
                solution_2 = data.get('Solution 2')
                if solution_2 and isinstance(solution_2, dict):
                    solution_2_title = solution_2.get('Title')
                    solution_2_description = solution_2.get('Description')
                    weighted_trait_df_graph.at[i, 'cluster_solution_2_title'] = solution_2_title
                    weighted_trait_df_graph.at[i, 'cluster_solution_2_description'] = solution_2_description
                
                # Access 'Solution 3' key
                solution_3 = data.get('Solution 3')
                if solution_3 and isinstance(solution_3, dict):
                    solution_3_title = solution_3.get('Title')
                    solution_3_description = solution_3.get('Description')
                    weighted_trait_df_graph.at[i, 'cluster_solution_3_title'] = solution_3_title
                    weighted_trait_df_graph.at[i, 'cluster_solution_3_description'] = solution_3_description

    except (json.JSONDecodeError, ValueError) as e:
        # Handle JSONDecodeError and ValueError
        # Perform necessary actions or set default values
        print(f"Error processing row {i}: {e}")


Error processing row 2: Expecting ',' delimiter: line 3 column 1 (char 512)
Error processing row 20: Expecting ',' delimiter: line 3 column 1 (char 512)
Error processing row 40: Expecting ',' delimiter: line 3 column 1 (char 512)
Error processing row 51: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error processing row 64: Expecting ',' delimiter: line 3 column 1 (char 512)
Error processing row 81: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error processing row 103: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error processing row 108: Expecting ',' delimiter: line 3 column 1 (char 512)
Error processing row 115: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error processing row 119: Expecting ',' delimiter: line 3 column 1 (char 512)
Error processing row 124: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error processing row 133: Exp

#### Process initially found solutions
    - write results to sql

In [5]:
#for column in ['cluster_problem_statement', 'cluster_solution_1_title', 'cluster_solution_1_description', 'cluster_solution_2_title', 'cluster_solution_2_description', 'cluster_solution_3_title', 'cluster_solution_3_description']:
for column in ['cluster_problem_statement']:
    with engine.connect() as con:
        # Add cluster_label column if it doesn't exist
        con.execute(f"ALTER TABLE weighted_trait_graph ADD COLUMN IF NOT EXISTS {column} VARCHAR;")

        for index, row in weighted_trait_df_graph.iterrows():
            asin_val = row['asin']
            cluster_label_val = row['cluster_label']
            type_val = row['type']
            column_val = row[column]

            query = f"""
                UPDATE weighted_trait_graph
                SET {column} = '{str(column_val).replace("'", "''")}'
                WHERE type = '{type_val}'
                    AND cluster_label = '{cluster_label_val}'
                    AND asin = '{asin_val}';
                """
            
            try:
                con.execute(query)
            except Exception as e:
                print(f"An error occurred during the execution of the query:\n{query}\nError message: {str(e)}")


#### Cluster solutions
    - Pivot solutions data (melt)

In [6]:
## Generates a expanded dataframe dedicated for the solutions
cluster_solution_df = weighted_trait_df_graph.copy()

# Perform first melt
melted_df_title = pd.melt(cluster_solution_df,
                          id_vars=['asin', 'cluster_label', 'type', 'cluster_problem_statement'],
                          value_vars=['cluster_solution_1_title', 'cluster_solution_2_title', 'cluster_solution_3_title'],
                          var_name='solution_number', 
                          value_name='cluster_solution_title')

melted_df_title.drop(columns = 'solution_number', inplace = True)
melted_df_title.drop_duplicates(inplace = True)

# Perform second melt
melted_df_description = pd.melt(cluster_solution_df,
                                id_vars=['asin', 'cluster_label', 'type', 'cluster_problem_statement'],
                                value_vars=['cluster_solution_1_description', 'cluster_solution_2_description', 'cluster_solution_3_description'],
                                var_name='solution_number', 
                                value_name='cluster_solution_description')

melted_df_description.drop(columns = 'solution_number', inplace = True)
melted_df_description.drop_duplicates(inplace = True)
melted_df_description

# Merge the two dataframes
cluster_solutions_df = pd.merge(melted_df_title, melted_df_description, 
                     on=['asin', 'cluster_label', 'type', 'cluster_problem_statement'], how='inner')

del melted_df_title, melted_df_description

cluster_solutions_df.drop_duplicates(inplace = True)

#### Cluster solutions
    - Get embeddings for each solution
    - Cluster the solutions based on embedings

In [7]:

embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)
    

def get_text_from_embedding(embedding):
    return openai.Embedding.retrieve(embedding, model="text-embedding-ada-002")["data"][0]["text"]

In [8]:
n_clusters = 10

df = cluster_solutions_df.copy()
df.drop(columns = ['asin'], inplace = True)

df['cluster_solution_title']= df['cluster_solution_title'].map(str)
df['cluster_solution_description']= df['cluster_solution_description'].map(str)
# omit observations that are too long to embed
df["n_tokens"] = df['cluster_solution_description'].apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

In [9]:
df.drop_duplicates(subset='cluster_solution_description', inplace=True, keep='first')

In [10]:
# Get embeddings
df["embedding"] = df['cluster_solution_description'].apply(lambda x: get_embedding(x, engine=embedding_model))
df["embedding"] = df["embedding"].apply(np.array)  # convert string to numpy array
matrix = np.vstack(df.embedding.values)

In [11]:
# Fit clusters
n_clusters = n_clusters  # Adjust as needed
clustering = AgglomerativeClustering(n_clusters=n_clusters)
labels = clustering.fit_predict(matrix)

# Add cluster labels to dataframe and create clusters dictionary
df["cluster"] = labels
clusters_dict = {}
for i in range(n_clusters):
    clusters_dict[i] = df[df.cluster == i]['cluster_solution_title'].values.tolist()

df.sort_values(by=['cluster'], inplace=True)

#### Generate product improvements
    - get product data
    - pass product data and clustered solutions to ai, get answers

In [13]:
# Read data about the product
json_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/processed/summarised_simplified_product_information.json'
with open(json_path) as file:
    json_string = file.read()
    general_product_data = json.loads(json_string)

In [31]:
User_Prompt_1 = f"""\
You are a highly experienced industrial product design engineer.\
You are asked to review a series of solutions proposed by a team of junior engineers.\
The solutions are for improving an existing product.\
Expected Output means two changes that can be made to the product and is technical. \
It will be implemented by an engineer so it has to be as precise as posible. No bla bla.\
Simple improvements are better than complex ones. Consider the cost of implementing the improvement and opt for cheaper solutions.\
Only the best 2 improvements, not more!\
\
{general_product_data}
\
Observations Type: 'Fact',\
Cluster Label: 'Drawing Board with Pen',\
\
Output will be a JSON file with the following structure:
```{{\
"Product Improvement 1":{{\
"Title": ...\
"Implementation Details for the engineer": ...[500 words]\
}},\
"Product Improvement 2":{{\
"Title": ...\
"Implementation Details for the engineer": ...[500 words]\
}}}}\
```PROBLEMS OBSERVED BY THE JUNIOR ENGINEERS: ```['The product, a car toy for kids, has been reported to be noisy, \
which has led to user dissatisfaction, negatively impacting the overall perceived value and marketability of the product.', \
"The product, a magnetic drawing board set, has received mixed reviews from customers. While some appreciate the sensory features, \
others have reported issues with the product's loudness, magnetic functionality, and hole pattern limitations. \
These problems have led to user dissatisfaction, negatively impacting the overall perceived value and marketability of the product."]\
\
```PROPOSED SOLUTIONS: ```["Adding a volume control feature could let users adjust noise levels to their preference. \
This feature needs to be user-friendly and easily accessible. The product's packaging and marketing materials should \
highlight this feature, but a feasibility analysis considering costs is necessary.",\
"The product's noise level could be lessened by reviewing and altering its internal components. \
Noise-dampening materials like foam or rubber could be used. Highlighting this improvement in \
packaging and marketing materials could increase perceived value and marketability.",\
"To address noise and ball mechanism efficiency, a quieter ball like foam, stronger magnets, \
and a warning label about noise could be implemented. Including a noise reduction rating on the \
packaging could also inform users of the noise level, enhancing product value."]
"""

In [33]:
AI_Prompt_1 = """\
{\
  "Product Improvement 1": {\
    "Title": "Volume Control Mechanism for Noise Reduction",\
    "Implementation Details for the engineer": {\
      "Objective": "Reduce the noise produced by the movement of the magnetic beads.",\
      "Solution": [\
        "Incorporate a volume control feature that allows users to adjust the noise levels.",\
        "Use noise-dampening materials like foam or rubber around the inside surface of the board."\
      ],\
      "Steps": [\
        "Volume Control: Add a control knob on the side of the board that modifies the interaction between the stylus and the magnetized beads, thus controlling the noise level.",\
        "Noise-Dampening Materials: Include foam or rubber materials inside the board to lessen the impact noise made by the beads."\
      ],\
      "Considerations": [\
        "Conduct a feasibility analysis for cost-effective implementation.",\
        "Ensure the feature is user-friendly and highlighted in packaging and marketing materials."\
      ]\
    }\
  },\
  "Product Improvement 2": {\
    "Title": "Enhanced Magnetic Functionality and Hole Pattern Variety",\
    "Implementation Details for the engineer": {\
      "Objective": "Improve the drawing capabilities of the board by enhancing magnetic functionality and diversifying the hole pattern.",\
      "Solution": [\
        "Redesign the hole pattern on the board to allow for more creative drawings.",\
        "Use stronger magnets in the stylus pen and the board for a smoother drawing experience."\
      ],\
      "Steps": [\
        "Hole Pattern: Employ CAD software to create a variety of hole patterns. Use CNC machining or injection molding for manufacturing.",\
        "Stronger Magnets: Source and test different types of magnets to determine the most effective and affordable options."\
      ],\
      "Considerations": [\
        "Add a warning label to inform users of the enhanced magnetic strength.",\
        "Ensure the product meets safety standards and regulations regarding the use of magnets in children's toys.",\
        "Balance cost implications with product functionality and safety."\
      ]\
    }\
  }\
}\
"""

In [35]:
import time

for cluster in df.cluster.unique():
    print(cluster)

    User_Prompt_2 = f"""\
        ```PROBLEMS OBSERVED BY THE JUNIOR ENGINEERS: ```{df[df.cluster == cluster].cluster_problem_statement.to_list()}\
        ```PROPOSED SOLUTIONS: ```{df[df.cluster == cluster].cluster_solution_description.to_list()}\
        """

    for i in range(5):  # Retry up to 3 times
        try:
            response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "user", "content": User_Prompt_1},
                            {"role": "assistant", "content": AI_Prompt_1},
                            {"role": "user", "content": User_Prompt_2} ],
                        temperature=0.2,
                        api_key=OPENAI_API_KEY
            )
            chatbot_response = response["choices"][0]["message"]["content"]
            df.loc[df.cluster == cluster, "senior_engineer_solution"] = chatbot_response
            print(chatbot_response)
            break  # If the API call is successful, break the loop
        except Exception as e:
            print(f"An error occurred during the OpenAI ChatCompletion API call: {e}")
            if i < 2:  # If this was the first or second attempt, wait for a minute before retrying
                time.sleep(60)
            else:  # If this was the third attempt, don't retry
                print("Failed after 3 attempts. Moving on to the next cluster.")


0
1
{"Product Improvement 1": {"Title": "Expanded Hole Pattern Options", "Implementation Details for the engineer": {"Objective": "To address the issue of limited hole pattern options and enhance the product's appeal to users with specific sensory needs.", "Solution": ["Review the product's design to identify the feasibility of expanding the hole pattern options.", "Create multiple hole pattern options or a customizable hole pattern design.", "Update the product's packaging and marketing materials to highlight the expanded hole pattern options as a key feature."], "Steps": ["Conduct a thorough analysis to determine the feasibility of this solution based on user needs, manufacturing feasibility, and cost implications.", "Employ CAD software to create multiple hole pattern options or a customizable hole pattern design.", "Use CNC machining or injection molding for manufacturing."], "Considerations": ["Ensure that the expanded hole pattern options align with the target market's needs and 

In [36]:
interim_save_output_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/output.csv'
df.to_csv(interim_save_output_path, index=False)

#### Generate product improvements
    - process resulting dictionaries
    - save results

In [18]:
interim_save_output_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/output.csv'
df = pd.read_csv(interim_save_output_path)

In [37]:
df.head(3)

Unnamed: 0,cluster_label,type,cluster_problem_statement,cluster_solution_title,cluster_solution_description,n_tokens,embedding,cluster,senior_engineer_solution,improvement_1_title,...,improvement_5_title,improvement_5_objective,improvement_5_solution,improvement_5_steps,improvement_5_considerations,improvement_6_title,improvement_6_objective,improvement_6_solution,improvement_6_steps,improvement_6_considerations
0,Noisy Operation,Issue,"The product, a car toy for kids, has been repo...",Implementing Noise Reduction Technology,To provide users with more control over the no...,125,[ 0.0125937 0.00107601 -0.00046086 ... 0.01...,0,"{""Product Improvement 1"": {""Title"": ""Noise Red...",Noise Reduction Solutions for Car Toy,...,,,,,,,,,,
1,Noisy Operation,Issue,"The product, a car toy for kids, has been repo...",Implementing Noise Reduction Technology,"To address the issue of noise, the product des...",109,[ 0.00760074 0.01624805 0.00692047 ... 0.00...,0,"{""Product Improvement 1"": {""Title"": ""Noise Red...",Noise Reduction Solutions for Car Toy,...,,,,,,,,,,
2,Magnetic Drawing Tools and Toys,Fact,"The product, a magnetic drawing board set, has...",Improving the Magnet Design for Better Stability,"To address the issue of noise levels, the prod...",103,[ 0.01147363 0.00799398 -0.00682091 ... -0.00...,0,"{""Product Improvement 1"": {""Title"": ""Noise Red...",Noise Reduction Solutions for Car Toy,...,,,,,,,,,,


In [54]:
import json

for index, row in df.iterrows():
    json_string = row["senior_engineer_solution"]
    if json_string:
        try:
            data = json.loads(json_string)
            for improvement, details in data.items():
                print(f"Product Improvement: {improvement}")
                print(f"Title: {details['Title']}")
                implementation_details = details["Implementation Details for the engineer"]
                print(f"Objective: {implementation_details['Objective']}")
                print("Solution:")
                for solution in implementation_details["Solution"]:
                    print(f"- {solution}")
                print("Steps:")
                for step in implementation_details["Steps"]:
                    print(f"- {step}")
                print("Considerations:")
                for consideration in implementation_details["Considerations"]:
                    print(f"- {consideration}")
                print("-------------------")
        except json.JSONDecodeError as e:
            print(f"Invalid JSON string: {json_string}")
            continue


Product Improvement: Product Improvement 1
Title: Noise Reduction and Sound Dampening
Objective: Reduce the noise level of the product to enhance customer satisfaction and perceived value
Solution:
- Add a volume control feature to allow users to adjust the noise level to their preference
- Incorporate noise-dampening materials like foam or rubber into the product design
Steps:
- Volume Control: Add a control knob on the side of the product that modifies the interaction between the components, thus controlling the noise level
- Noise-Dampening Materials: Include foam or rubber materials inside the product to lessen the impact noise made by the components
Considerations:
- Conduct a feasibility analysis for cost-effective implementation
- Ensure the feature is user-friendly and highlighted in packaging and marketing materials
- Test the modifications to ensure they do not negatively impact the product's performance or durability
-------------------
Product Improvement: Product Improveme

In [55]:
import pandas as pd
import json

# Assuming df is your DataFrame and 'senior_engineer_solution' is the column with the JSON data
for index, row in df.iterrows():
    try:
        json_string = row['senior_engineer_solution']
        if json_string:
            try:
                data = json.loads(json_string)
            except json.JSONDecodeError:
                # Attempt to complete the JSON data by closing all possible opened structures
                json_string += '}' * (json_string.count('{') - json_string.count('}'))
                json_string += ']' * (json_string.count('[') - json_string.count(']'))
                json_string += '"' * (json_string.count('"') % 2)
                try:
                    data = json.loads(json_string)
                except json.JSONDecodeError as e:
                    print(f"Error processing row {index} for improvement with JSON data error after attempting fix: {e}")
                    #print(f"JSON data after attempted fix: {json_string}")
                    # print(f"Row data: {row}")
                    # Skip this row and continue with the next one
                    continue

            for improvement, details in data.items():
                try:
                    implementation_details = details['Implementation Details for the engineer']
                    if implementation_details:
                        if improvement == 'Product Improvement 1':
                            df.at[index, 'improvement_1_title'] = details['Title']
                            df.at[index, 'improvement_1_objective'] = implementation_details['Objective']
                            df.at[index, 'improvement_1_solution'] = ', '.join(implementation_details['Solution'])
                            df.at[index, 'improvement_1_steps'] = ', '.join(implementation_details['Steps'])
                            df.at[index, 'improvement_1_considerations'] = ', '.join(implementation_details['Considerations'])
                        elif improvement == 'Product Improvement 2':
                            df.at[index, 'improvement_2_title'] = details['Title']
                            df.at[index, 'improvement_2_objective'] = implementation_details['Objective']
                            df.at[index, 'improvement_2_solution'] = ', '.join(implementation_details['Solution'])
                            df.at[index, 'improvement_2_steps'] = ', '.join(implementation_details['Steps'])
                            df.at[index, 'improvement_2_considerations'] = ', '.join(implementation_details['Considerations'])
                        elif improvement == 'Product Improvement 3':
                            df.at[index, 'improvement_3_title'] = details['Title']
                            df.at[index, 'improvement_3_objective'] = implementation_details['Objective']
                            df.at[index, 'improvement_3_solution'] = ', '.join(implementation_details['Solution'])
                            df.at[index, 'improvement_3_steps'] = ', '.join(implementation_details['Steps'])
                            df.at[index, 'improvement_3_considerations'] = ', '.join(implementation_details['Considerations'])
                        elif improvement == 'Product Improvement 4':
                            df.at[index, 'improvement_4_title'] = details['Title']
                            df.at[index, 'improvement_4_objective'] = implementation_details['Objective']
                            df.at[index, 'improvement_4_solution'] = ', '.join(implementation_details['Solution'])
                            df.at[index, 'improvement_4_steps'] = ', '.join(implementation_details['Steps'])
                            df.at[index, 'improvement_4_considerations'] = ', '.join(implementation_details['Considerations'])
                except KeyError:
                    print(f"KeyError in row {index}, improvement {improvement}")
                    # print(f"Row data: {row}")

    except Exception as e:
        print(f"Unexpected error processing row {index}: {e}")
        # print(f"Row data: {row}")


Error processing row 41 for improvement with JSON data error after attempting fix: Expecting value: line 1 column 1 (char 0)


In [57]:
df.columns

Index(['cluster_label', 'type', 'cluster_problem_statement',
       'cluster_solution_title', 'cluster_solution_description', 'n_tokens',
       'embedding', 'cluster', 'senior_engineer_solution',
       'improvement_1_title', 'improvement_1_objective',
       'improvement_1_solution', 'improvement_1_steps',
       'improvement_1_considerations', 'improvement_2_title',
       'improvement_2_objective', 'improvement_2_solution',
       'improvement_2_steps', 'improvement_2_considerations',
       'improvement_3_title', 'improvement_3_objective',
       'improvement_3_solution', 'improvement_3_steps',
       'improvement_3_considerations', 'improvement_4_title',
       'improvement_4_objective', 'improvement_4_solution',
       'improvement_4_steps', 'improvement_4_considerations'],
      dtype='object')

In [61]:
df[['cluster_label', 'type', 'cluster_problem_statement',
       'cluster_solution_title', 'cluster_solution_description', 'n_tokens',
       'embedding', 'cluster', 'senior_engineer_solution',
       'improvement_1_title', 'improvement_1_objective',
       'improvement_1_solution', 'improvement_1_steps',
       'improvement_1_considerations']].head(1)

Unnamed: 0,cluster_label,type,cluster_problem_statement,cluster_solution_title,cluster_solution_description,n_tokens,embedding,cluster,senior_engineer_solution,improvement_1_title,improvement_1_objective,improvement_1_solution,improvement_1_steps,improvement_1_considerations
0,Noisy Operation,Issue,"The product, a car toy for kids, has been repo...",Implementing Noise Reduction Technology,To provide users with more control over the no...,125,[ 0.0125937 0.00107601 -0.00046086 ... 0.01...,0,"{""Product Improvement 1"": {""Title"": ""Noise Red...",Noise Reduction and Sound Dampening,Reduce the noise level of the product to enhan...,Add a volume control feature to allow users to...,Volume Control: Add a control knob on the side...,Conduct a feasibility analysis for cost-effect...


In [90]:
df.columns

Index(['cluster_label', 'type', 'cluster_problem_statement',
       'cluster_solution_title', 'cluster_solution_description', 'n_tokens',
       'embedding', 'cluster', 'senior_engineer_solution',
       'improvement_1_title', 'improvement_1_objective',
       'improvement_1_solution', 'improvement_1_steps',
       'improvement_1_considerations', 'improvement_2_title',
       'improvement_2_objective', 'improvement_2_solution',
       'improvement_2_steps', 'improvement_2_considerations',
       'improvement_3_title', 'improvement_3_objective',
       'improvement_3_solution', 'improvement_3_steps',
       'improvement_3_considerations', 'improvement_4_title',
       'improvement_4_objective', 'improvement_4_solution',
       'improvement_4_steps', 'improvement_4_considerations'],
      dtype='object')

In [140]:
# Generates an expanded dataframe dedicated for the solutions
improvement_solution_df = df.copy()

def split_and_remove_suffix(value):
    split_value = value.split('_')[1]
    return split_value

# Perform first melt
melted_df_title = pd.melt(improvement_solution_df,
                          id_vars=['cluster_label', 'type', 'cluster_problem_statement', 'cluster_solution_title', 'cluster'],
                          value_vars=['improvement_1_title', 'improvement_2_title', 'improvement_3_title', 'improvement_4_title'],
                          var_name='improvement_number',
                          value_name='improvement_title')
melted_df_title['improvement_number'] = melted_df_title['improvement_number'].map(split_and_remove_suffix)
melted_df_title.dropna(inplace=True)

# Perform melt for _objective
melted_df_objective = pd.melt(improvement_solution_df,
                              id_vars=['cluster_label', 'type', 'cluster_problem_statement', 'cluster_solution_title', 'cluster'],
                              value_vars=['improvement_1_objective', 'improvement_2_objective', 'improvement_3_objective', 'improvement_4_objective'],
                              var_name='improvement_number',
                              value_name='improvement_objective')
melted_df_objective['improvement_number'] = melted_df_objective['improvement_number'].map(split_and_remove_suffix)
melted_df_objective.dropna(inplace=True)

# Perform melt for _solution
melted_df_solution = pd.melt(improvement_solution_df,
                             id_vars=['cluster_label', 'type', 'cluster_problem_statement', 'cluster_solution_title', 'cluster'],
                             value_vars=['improvement_1_solution', 'improvement_2_solution', 'improvement_3_solution', 'improvement_4_solution'],
                             var_name='improvement_number',
                             value_name='improvement_solution')
melted_df_solution['improvement_number'] = melted_df_solution['improvement_number'].map(split_and_remove_suffix)
melted_df_solution.dropna(inplace=True)

# Perform melt for _steps
melted_df_steps = pd.melt(improvement_solution_df,
                          id_vars=['cluster_label', 'type', 'cluster_problem_statement', 'cluster_solution_title', 'cluster'],
                          value_vars=['improvement_1_steps', 'improvement_2_steps', 'improvement_3_steps', 'improvement_4_steps'],
                          var_name='improvement_number',
                          value_name='improvement_steps')
melted_df_steps['improvement_number'] = melted_df_steps['improvement_number'].map(split_and_remove_suffix)
melted_df_steps.dropna(inplace=True)

# Perform melt for _considerations
melted_df_considerations = pd.melt(improvement_solution_df,
                                   id_vars=['cluster_label', 'type', 'cluster_problem_statement', 'cluster_solution_title', 'cluster'],
                                   value_vars=['improvement_1_considerations', 'improvement_2_considerations', 'improvement_3_considerations', 'improvement_4_considerations'],
                                   var_name='improvement_number',
                                   value_name='improvement_considerations')
melted_df_considerations['improvement_number'] = melted_df_considerations['improvement_number'].map(split_and_remove_suffix)
melted_df_considerations.dropna(inplace=True)

# Perform the joins
melted_df = melted_df_title[['improvement_title']].join(melted_df_objective['improvement_objective'], how='inner')
melted_df = melted_df.join(melted_df_solution['improvement_solution'], how='inner')
melted_df = melted_df.join(melted_df_steps['improvement_steps'], how='inner')
melted_df = melted_df.join(melted_df_considerations['improvement_considerations'], how='inner')

improvements_df = melted_df_title.drop(columns =['improvement_title','improvement_number'])
improvements_df = improvements_df.join(melted_df, how='inner')
improvements_df.drop_duplicates(inplace=True)

del melted_df_title, melted_df_considerations, melted_df_objective, melted_df_solution, melted_df_steps, melted_df, improvement_solution_df

In [141]:
# I want to identify the top 3 solutions for each cluster
improvements_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/processed/cluster_solutions.csv'
improvements_df.to_csv(improvements_path, index=False)

### DE DECIS ULTERIOR IN CE FORMA INCARCAM IN SQL: SOLUTUIONS SAU ALTCEVA?