|
12 | 12 |
|
13 | 13 |
|
14 | 14 | SYSTEM_PROMPT = '''You are a data scientist who helps the user infer data types based on the table provided by the user.
|
15 |
| -Given a dataset provided by the user, identify their type and semantic type, and provide a very short summary of the dataset. |
| 15 | +Given a dataset provided by the user, |
| 16 | +1. identify the type and semantic type of each field |
| 17 | +2. provide a very short summary of the dataset. |
| 18 | +3. provide a list of (5-10) explorative questions that can help users get started with data visualizations. |
16 | 19 |
|
17 | 20 | Types to consider include: string, number, date
|
18 | 21 | Semantic types to consider include: Location, Year, Month, Day, Date, Time, DateTime, Range, Duration, Name, Percentage, String, Number
|
|
34 | 37 | "field2": {"type": ..., "semantic_type": ..., "sort_order": null},
|
35 | 38 | ...
|
36 | 39 | },
|
37 |
| - "data summary": ... // a short summary of the data |
| 40 | + "data summary": ..., // a short summary of the data |
| 41 | + "explorative_questions": [...], // a list of explorative questions that can help users get started with data visualizations |
38 | 42 | }
|
39 | 43 | ```
|
40 | 44 | '''
|
|
76 | 80 | "total": {"type": "number", "semantic_type": "Number", "sort_order": null},
|
77 | 81 | "group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]}
|
78 | 82 | },
|
79 |
| - "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups." |
| 83 | + "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups.", |
| 84 | + "explorative_questions": [ |
| 85 | + "What is the average income across different states?", |
| 86 | + "What is the distribution of income across different regions?", |
| 87 | + "What is the relationship between income and state ID?", |
| 88 | + "What is the relationship between income and region?" |
| 89 | + ] |
80 | 90 | }
|
81 | 91 | ```
|
82 | 92 |
|
|
121 | 131 | "sort_order": null
|
122 | 132 | }
|
123 | 133 | },
|
124 |
| - "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format." |
| 134 | + "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format.", |
| 135 | + "explorative_questions": [ |
| 136 | + "What is the average temperature across different cities?", |
| 137 | + "What is the distribution of temperature across different dates?", |
| 138 | + "What is the relationship between temperature and city?", |
| 139 | + "What is the relationship between temperature and date?" |
| 140 | + ] |
125 | 141 | }```'''
|
126 | 142 |
|
127 | 143 | class DataLoadAgent(object):
|
|
0 commit comments