# Introduction to Large Language Models with GPT & LangChain

## 0. Setup

In [1]:
# 1+1# Install the langchain package
# !pip install langchain==0.0.300

# # Update the typing_extensions package
# !pip install typing_extensions==4.8.0

In [2]:
import os
import openai 

key = 'OPENAI_KEY'

# New version (1.0)
# client = openai.OpenAI(api_key=os.getenv(key))

openai.api_key = os.getenv(key)

In [3]:
import langchain as lc
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage

In [4]:
import pandas as pd
import plotly.express as px
from IPython.display import display, Markdown

## 1. Import dataset

In [5]:
file_name = r'Datasets/Electric_Vehicle_Population_Data.csv'
df = pd.read_csv(file_name)

In [6]:
df.head(1)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5YJYGDEE1L,King,Seattle,WA,98122.0,2020,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,291,0,37.0,125701579,POINT (-122.30839 47.610365),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033010000.0


In [7]:
new_df = df.groupby(["City", "County", "Model Year", "Make", "Model", 'Electric Vehicle Type']).size().reset_index(name='n_cars')
new_df = new_df.sort_values('n_cars', ascending = False).reset_index(drop=True)

In [8]:
new_df.describe()

Unnamed: 0,Model Year,n_cars
count,30070.0,30070.0
mean,2019.8284,5.914899
std,3.34474,31.361957
min,1997.0,1.0
25%,2018.0,1.0
50%,2021.0,2.0
75%,2023.0,4.0
max,2024.0,2343.0


In [9]:
new_df.describe(include='O')

Unnamed: 0,City,County,Make,Model,Electric Vehicle Type
count,30070,30070,30070,30070,30070
unique,723,196,40,139,2
top,Bothell,King,TESLA,LEAF,Battery Electric Vehicle (BEV)
freq,530,7841,5389,1932,17873


## 2. Asking GPT a question

In [10]:
system_msg_test = """You are a helpful assistant who understands data science.
 You write in a clear language that a ten year old can understand.
 You keep your answers brief."""

user_msg_test = "Tell me some uses of GPT for data analysis."

msgs_test = [
    {"role" : "system" , "content" : system_msg_test},
    {"role" : "user" , "content" : user_msg_test}
]

response = openai.ChatCompletion.create(
  model = "gpt-3.5-turbo",
  messages = msgs_test
)


In [11]:
print(response.choices[0].message.content)

GPT can help in summarizing large amounts of text data, generating text for reports or articles, and even in data cleaning by detecting patterns or anomalies in data.


## 3. Asking about the dataset with LangChain

### LangChain message types

The LangChain message types are names slightly differently from the OpenAI message types.

- `SystemMessage` is the equivalent of OpenAI's `system` message.
- `HumanMessage` is the equivalent of OpenAI's `user` message.

In [12]:
# A description of the dataset
dataset_description = """
You have a dataset about electric cars registered in Washington state, USA in 2020. It is available as a pandas DataFrame named `electric_cars`.

Each row in the dataset represents the count of the number of cars registered within a city, for a particular model.

The dataset contains the following columns.

- `city` (character): The city in which the registered owner resides.
- `county` (character): The county in which the registered owner resides.
- `model_year` (integer): The [model year](https://en.wikipedia.org/wiki/Model_year#United_States_and_Canada) of the car.
- `make` (character): The manufacturer of the car.
- `model` (character): The model of the car.
- `electric_vehicle_type` (character): Either "Plug-in Hybrid Electric Vehicle (PHEV)" or "Battery Electric Vehicle (BEV)".
- `n_cars` (integer): The count of the number of vehicles registered.
"""

In [13]:

# Create a task for the AI. Assign to suggest_questions.
suggest_questions = "Suggest some data analysis questions that could be answered with this dataset."

# Concatenate the dataset description and the request. Assign to msgs_suggest_questions.
msgs_suggest_questions = [
    SystemMessage(content = "You are a data analysis expert."),
    HumanMessage(content = f"{dataset_description}\n\n{suggest_questions}")
]

### Instructions

- Create a `ChatOpenAI` object. Assign to `chat`.
- Pass your message to GPT. Assign to `rsps_suggest_questions`.
- Print the response object and the contents of the response.
- Print the type of the response

In [14]:
# Create a ChatOpenAI object. Assign to chat.
chat = ChatOpenAI(openai_api_key=os.getenv(key))

# Pass your message to GPT. Assign to rsps_suggest_questions.
rsps_suggest_questions = chat(msgs_suggest_questions)

# Print the response
print("The whole response\n")
print(rsps_suggest_questions)


print("\n----\n")

# Print just the response's content
print("Just the response's content\n")
print(rsps_suggest_questions.content)


print("\n----\n")

# Print the type of the response
print("The type of the response. \n")
print(type(rsps_suggest_questions))

The whole response

content='1. What is the distribution of electric vehicle types (PHEV vs BEV) in Washington state in 2020?\n2. Which city in Washington state had the highest number of electric cars registered in 2020?\n3. What are the most popular electric car models registered in Washington state in 2020?\n4. How does the number of electric cars registered vary by county in Washington state in 2020?\n5. Is there a correlation between the model year of the electric cars and the number of cars registered?\n6. What are the top electric car manufacturers represented in the dataset for Washington state in 2020?\n7. Can we identify any trends or patterns in the registration of electric cars over the course of the year 2020?\n8. Are there any cities or counties where a specific type of electric vehicle (PHEV or BEV) is more popular than the other?\n9. How does the distribution of electric cars vary between urban and rural areas in Washington state?\n10. Is there a relationship between the

## Task 4: Hold a conversation with GPT

In [15]:
msgs_suggest_questions

[SystemMessage(content='You are a data analysis expert.', additional_kwargs={}),
 HumanMessage(content='\nYou have a dataset about electric cars registered in Washington state, USA in 2020. It is available as a pandas DataFrame named `electric_cars`.\n\nEach row in the dataset represents the count of the number of cars registered within a city, for a particular model.\n\nThe dataset contains the following columns.\n\n- `city` (character): The city in which the registered owner resides.\n- `county` (character): The county in which the registered owner resides.\n- `model_year` (integer): The [model year](https://en.wikipedia.org/wiki/Model_year#United_States_and_Canada) of the car.\n- `make` (character): The manufacturer of the car.\n- `model` (character): The model of the car.\n- `electric_vehicle_type` (character): Either "Plug-in Hybrid Electric Vehicle (PHEV)" or "Battery Electric Vehicle (BEV)".\n- `n_cars` (integer): The count of the number of vehicles registered.\n\n\nSuggest some

In [16]:
msgs_python_top_models = msgs_suggest_questions + [
    rsps_suggest_questions
]

In [17]:
# Append the response and a new message to the previous messages. 
# Assign to msgs_python_top_models.
msgs_python_top_models = msgs_suggest_questions + [
    rsps_suggest_questions,
    HumanMessage(content= "Write python code to find the top make/model combinations of electric cars in Washington state")
]

# Pass your message to GPT. Assign to rsps_python_top_models.
rsps_python_top_models = chat(msgs_python_top_models)

# Display the response's Markdown content

In [18]:
msgs_python_top_models

[SystemMessage(content='You are a data analysis expert.', additional_kwargs={}),
 HumanMessage(content='\nYou have a dataset about electric cars registered in Washington state, USA in 2020. It is available as a pandas DataFrame named `electric_cars`.\n\nEach row in the dataset represents the count of the number of cars registered within a city, for a particular model.\n\nThe dataset contains the following columns.\n\n- `city` (character): The city in which the registered owner resides.\n- `county` (character): The county in which the registered owner resides.\n- `model_year` (integer): The [model year](https://en.wikipedia.org/wiki/Model_year#United_States_and_Canada) of the car.\n- `make` (character): The manufacturer of the car.\n- `model` (character): The model of the car.\n- `electric_vehicle_type` (character): Either "Plug-in Hybrid Electric Vehicle (PHEV)" or "Battery Electric Vehicle (BEV)".\n- `n_cars` (integer): The count of the number of vehicles registered.\n\n\nSuggest some

In [19]:
display(Markdown(rsps_python_top_models.content))

Certainly! Here's the Python code to find the top make/model combinations of electric cars in Washington state from the `electric_cars` DataFrame:

```python
# Group the DataFrame by 'make' and 'model' columns and sum the 'n_cars' for each group
top_make_model_combinations = electric_cars.groupby(['make', 'model'])['n_cars'].sum().reset_index()

# Sort the DataFrame in descending order based on the sum of 'n_cars'
top_make_model_combinations = top_make_model_combinations.sort_values(by='n_cars', ascending=False)

# Display the top make/model combinations
print(top_make_model_combinations.head())
```

This code will group the data by the 'make' and 'model' columns, summing the 'n_cars' for each unique make/model combination. It then sorts the results in descending order based on the total number of cars registered and displays the top make/model combinations.

## Task 5: Execute the Code Provided by GPT

In [20]:
new_df.head(1)

Unnamed: 0,City,County,Model Year,Make,Model,Electric Vehicle Type,n_cars
0,Seattle,King,2023,TESLA,MODEL Y,Battery Electric Vehicle (BEV),2343


In [21]:
top_make_model_combinations = new_df.groupby(['Make', 'Model']).size().reset_index(name='Count').sort_values(by='Count', ascending=False)

In [22]:
top_make_model_combinations

Unnamed: 0,Make,Model,Count
102,NISSAN,LEAF,1932
117,TESLA,MODEL 3,1586
118,TESLA,MODEL S,1550
120,TESLA,MODEL Y,1206
39,CHEVROLET,VOLT,1193
...,...,...,...
20,BMW,745LE,2
14,BENTLEY,BENTAYGA,1
36,CHEVROLET,S-10 PICKUP,1
104,PORSCHE,918,1


## Task 6: Continue the Conversation to Create a Plot

Doing more analysis with GPT assistance is simply a case of continuing the conversation by appending new `HumanMessage` prompts to the message list, and calling the `chat()` function.

The output from GPT is random, but when doing data analysis this isn't always desirable since you'd like your results to be reproducible. With large language models, the amount of randomness can be controled with a parameter known as "temperature".

- `temperature` controls the randomness of the response. It ranges from `0` to `2` with zero meaning minimal randomness (to make it easier to reproduce results) and two meaning maximum randomness (often gives weird responses). If you use the OpenAI API directly, the default is `1`, but using LangChain reduces the default value to `0.7`. 

### Instructions

- Work through the previous conversation flow again, appending the previous response and a new request for Python code to draw a bar plot of the total count of electric cars by model year, with bars colored by electric vehicle type.
    - The solution asks for Plotly Express code, but you can pick any Python data viz package you prefer.

In [23]:
chat0 = ChatOpenAI(openai_api_key=os.getenv(key), temperature=0)

In [24]:
msgs_python_plot = msgs_python_top_models + [
    rsps_python_top_models,
    HumanMessage(content="Write some python code to draw a bar plot of the total count of electric cars by model year, with bars colored by electric vehicle type. Use the plotly express package")
]

rsps_python_plot = chat0(msgs_python_plot)

In [25]:
Markdown(rsps_python_plot.content)

Here's the Python code to draw a bar plot of the total count of electric cars by model year, with bars colored by electric vehicle type using the Plotly Express package:

```python
import plotly.express as px

# Group the DataFrame by 'model_year' and 'electric_vehicle_type' columns and sum the 'n_cars' for each group
total_count_by_year_type = electric_cars.groupby(['model_year', 'electric_vehicle_type'])['n_cars'].sum().reset_index()

# Create the bar plot using Plotly Express
fig = px.bar(total_count_by_year_type, x='model_year', y='n_cars', color='electric_vehicle_type',
             labels={'model_year': 'Model Year', 'n_cars': 'Total Count of Electric Cars'},
             title='Total Count of Electric Cars by Model Year, Colored by Electric Vehicle Type')

# Show the plot
fig.show()
```

This code will first group the data by 'model_year' and 'electric_vehicle_type', summing the 'n_cars' for each unique combination. It then uses Plotly Express to create a bar plot where the bars are colored based on the electric vehicle type.

## Task 7: Execute the Code Provided by GPT to See Your Plot

### Instructions

Run the code and check that the plot is correct.

- Run the bar plot code generated by GPT.
- _Check that the output is suitable. If not, try changing your prompt in the previous task to improve the output (this is prompt engineering, which you'll see more of in the next code-along project in the series)._

In [26]:
# Paste the code generated by GPT and run it
import plotly.express as px

# Assuming electric_cars is the pandas DataFrame containing the dataset
count_by_year_type = new_df.groupby(['Model Year', 'Electric Vehicle Type']).sum().reset_index()

In [27]:
count_by_year_type.head(1)

Unnamed: 0,Model Year,Electric Vehicle Type,City,County,Make,Model,n_cars
0,1997,Battery Electric Vehicle (BEV),Marysville,Snohomish,CHEVROLET,S-10 PICKUP,1


In [28]:
# Creating the bar plot using Plotly Express
fig = px.bar(count_by_year_type, x='Model Year', y='n_cars', color='Electric Vehicle Type',
             labels={'n_cars': 'Total Count of Electric Cars', 'Model Year': 'Model Year'},
             title='Total Count of Electric Cars by Model Year')

# Displaying the plot
fig.show()