In [1]:
from google.colab import drive
import os
import glob
import pandas as pd

drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/data-science-veiculos-poluicao'
os.chdir(folder_path)

print(os.listdir())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['senatran', 'iema', 'senastran_tipo.ipynb', 'MSysObjects.csv', '2025-07-20 18-33-52.mkv', 'iema-agg', 'senatran-agg', 'cidades_lookup.csv', 'cidades.ipynb', 'iema.ipynb', 'analise_1.ipynb', 'TCC ESALQ USP DATA SCIENCE.gdoc', 'analise_2.ipynb']


In [2]:
file_path = 'senatran-agg/marca_modelo_interpolado/marca_modelo_senatran_interpolated.csv'
df_senatran = pd.read_csv(file_path)
display(df_senatran.head())

Unnamed: 0,cidade,id,UF,category,ano,mes,qtd
0,AMERICANA,SP01,SAO PAULO,BUS,2015,1,51.0
1,AMERICANA,SP01,SAO PAULO,BUS,2015,2,52.0
2,AMERICANA,SP01,SAO PAULO,BUS,2015,3,51.0
3,AMERICANA,SP01,SAO PAULO,BUS,2015,4,52.5
4,AMERICANA,SP01,SAO PAULO,BUS,2015,5,54.0


In [7]:
# Create a new category 'OUTROS' for all non-SUV categories
df_senatran_suv = df_senatran.copy()
df_senatran_suv['category'] = df_senatran_suv['category'].apply(lambda x: 'SUV' if x == 'SUV' else 'OUTROS')

# Group by the new category and sum the 'qtd'
df_senatran_suv = df_senatran_suv.groupby(['cidade', 'id',	'UF',	'category',	'ano',	'mes'])['qtd'].sum().reset_index()
display(df_senatran_suv.head())

Unnamed: 0,cidade,id,UF,category,ano,mes,qtd
0,AMERICANA,SP01,SAO PAULO,OUTROS,2015,1,145055.0
1,AMERICANA,SP01,SAO PAULO,OUTROS,2015,2,145325.0
2,AMERICANA,SP01,SAO PAULO,OUTROS,2015,3,145679.0
3,AMERICANA,SP01,SAO PAULO,OUTROS,2015,4,145905.5
4,AMERICANA,SP01,SAO PAULO,OUTROS,2015,5,146132.0


In [6]:
file_path = 'iema-agg/iema-mensal.csv'
df_iema = pd.read_csv(file_path)
display(df_iema.head())

df_co = df_iema[df_iema['Poluente'] == 'CO']
df_co = df_co[["Month", "Codigo", "Valor"]]
display(df_co.head())

Unnamed: 0,Month,Estacao,Codigo,Poluente,Unidade,Tipo,Valor
0,2015-01,Americana - Vila Santa Maria,SP01,MP10,ug/m3,automatica,24.38965
1,2015-01,Americana - Vila Santa Maria,SP01,O3,ug/m3,automatica,61.180028
2,2015-01,Anchieta Centro,ES12,MP10,ug/m3,automatica,40.129945
3,2015-01,Anchieta Centro,ES12,MP2.5,ug/m3,automatica,6.615899
4,2015-01,Anchieta Centro,ES12,PTS,ug/m3,automatica,51.927136


Unnamed: 0,Month,Codigo,Valor
15,2015-01,BA17,0.16083
20,2015-01,BA20,0.515109
25,2015-01,MG01,0.187132
42,2015-01,RJ94,0.403143
45,2015-01,MG05,0.490738


In [9]:
df_co['Month'] = pd.to_datetime(df_co['Month'])
df_co['ano'] = df_co['Month'].dt.year
df_co['mes'] = df_co['Month'].dt.month
display(df_co.head())

Unnamed: 0,Month,Codigo,Valor,ano,mes
15,2015-01-01,BA17,0.16083,2015,1
20,2015-01-01,BA20,0.515109,2015,1
25,2015-01-01,MG01,0.187132,2015,1
42,2015-01-01,RJ94,0.403143,2015,1
45,2015-01-01,MG05,0.490738,2015,1


In [16]:
df_merged = pd.merge(df_senatran_suv, df_co, left_on=['ano', 'mes', 'id'], right_on=['ano', 'mes', 'Codigo'])

df = df_merged[['ano', 'mes', 'cidade', 'category', 'UF', 'id', 'Valor', 'qtd']]

display(df.head())

Unnamed: 0,ano,mes,cidade,category,UF,id,Valor,qtd
0,2015,5,ANCHIETA,OUTROS,ESPIRITO SANTO,ES12,387.034621,9321.0
1,2015,6,ANCHIETA,OUTROS,ESPIRITO SANTO,ES12,373.87924,9343.333333
2,2015,7,ANCHIETA,OUTROS,ESPIRITO SANTO,ES12,482.061063,9365.666667
3,2015,8,ANCHIETA,OUTROS,ESPIRITO SANTO,ES12,347.147524,9388.0
4,2015,9,ANCHIETA,OUTROS,ESPIRITO SANTO,ES12,416.226671,9419.5


# Task
Analyze the provided data to determine if increases in SUV count are associated with increases in pollution levels.

## Data preparation

### Subtask:
Aggregate the data by city and month, then group by city and category.


**Reasoning**:
Create a 'date' column and aggregate the DataFrame by 'date' and 'cidade', summing 'Valor' and 'qtd'.



In [31]:
df['date'] = pd.to_datetime(df['ano'].astype(str) + '-' + df['mes'].astype(str))

df_valor_agg_city = df.groupby(['date', 'cidade'])[['Valor', 'qtd']].sum().reset_index()
df_valor_agg_city = df_valor_agg_city.rename(columns={'Valor': 'Valor_agregado_cidade'})
display(df_valor_agg_city.head())

ValueError: 'date' is both an index level and a column label, which is ambiguous.

**Reasoning**:
The error indicates that 'date' is both an index and a column. Reset the index of the dataframe before creating the 'date' column to avoid ambiguity.



In [32]:
df = df.reset_index()
df['date'] = pd.to_datetime(df['ano'].astype(str) + '-' + df['mes'].astype(str))

df_valor_agg_city = df.groupby(['date', 'cidade'])[['Valor', 'qtd']].sum().reset_index()
df_valor_agg_city = df_valor_agg_city.rename(columns={'Valor': 'Valor_agregado_cidade'})
display(df_valor_agg_city.head())

ValueError: cannot insert date, already exists

**Reasoning**:
The error indicates that the 'date' column already exists after resetting the index. Drop the existing 'date' column before creating a new one.



In [41]:
if 'date' in df.columns:
    df = df.drop(columns=['date'])

df = df.reset_index()
df['date'] = pd.to_datetime(df['ano'].astype(str) + '-' + df['mes'].astype(str))

df_valor_agg_city = df.groupby(['date', 'cidade'])[['Valor', 'qtd']].mean().reset_index()
df_valor_agg_city = df_valor_agg_city.rename(columns={'Valor': 'Valor_agregado_cidade'})
display(df_valor_agg_city.head())

Unnamed: 0,date,cidade,Valor_agregado_cidade,qtd
0,2015-01-01,ANCHIETA,216.541727,3139.75
1,2015-01-01,ARAUCARIA,261.30172,32136.0
2,2015-01-01,BELO HORIZONTE,0.427601,744088.0
3,2015-01-01,BETIM,0.485515,74410.5
4,2015-01-01,CAMACARI,0.455696,32498.0


**Reasoning**:
Aggregate the DataFrame by 'date', 'cidade', and 'category', summing the 'Valor' and 'qtd' columns, then reset the index.



In [42]:
df_agg = df.groupby(['date', 'cidade', 'category'])[['Valor', 'qtd']].sum().reset_index()
display(df_agg.head())

Unnamed: 0,date,cidade,category,Valor,qtd
0,2015-01-01,ANCHIETA,OUTROS,866.166909,24900.0
1,2015-01-01,ANCHIETA,SUV,866.166909,218.0
2,2015-01-01,ARAUCARIA,OUTROS,522.603439,127188.0
3,2015-01-01,ARAUCARIA,SUV,522.603439,1356.0
4,2015-01-01,BELO HORIZONTE,OUTROS,0.855202,2927728.0


## Feature engineering

### Subtask:
Create a time series feature for the `Valor` (average pollutant concentration) and `qtd` (quantity of vehicles) columns.


**Reasoning**:
Set 'date' and 'cidade' as the index of `df_agg` and then select the 'Valor' and 'qtd' columns for `independent` and the 'Valor' column for `dependent`.



In [43]:
df_agg = df_agg.set_index(['date', 'cidade'])
independent = df_agg[['qtd', 'category']]
dependent = df_agg['Valor']

## Model selection

### Subtask:
Choose a suitable regression model to analyze the relationship between the time series data.


**Reasoning**:
I need to choose a regression model suitable for time series data with both numerical and categorical independent variables. Given the nature of the data, a linear regression model with dummy variables for the categorical feature and potentially incorporating time-based features or using a time series specific model would be appropriate. A standard linear regression model can handle this by treating the categorical variable as dummy variables. I will prepare the data for this model.



In [44]:
# Convert categorical variable 'category' into dummy variables
independent_dummies = pd.get_dummies(independent, columns=['category'], drop_first=True)

# Combine the dependent and independent variables for modeling
df_model = pd.concat([dependent, independent_dummies], axis=1)

display(df_model.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Valor,qtd,category_SUV
date,cidade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,ANCHIETA,866.166909,24900.0,False
2015-01-01,ANCHIETA,866.166909,218.0,True
2015-01-01,ARAUCARIA,522.603439,127188.0,False
2015-01-01,ARAUCARIA,522.603439,1356.0,True
2015-01-01,BELO HORIZONTE,0.855202,2927728.0,False


**Reasoning**:
The data has been prepared for a regression model by creating dummy variables for the categorical feature. A Linear Regression model is suitable for analyzing the relationship between the time series of vehicle quantities (including the categorical distinction) and the time series of pollutant concentrations. I will now fit a Linear Regression model to the data.



In [45]:
from sklearn.linear_model import LinearRegression

# Define the independent variables (features) and the dependent variable (target)
X = df_model[['qtd', 'category_SUV']]
y = df_model['Valor']

# Initialize and fit the Linear Regression model
model = LinearRegression()
model.fit(X, y)

print("Linear Regression model fitted successfully.")

Linear Regression model fitted successfully.


**Reasoning**:
The Linear Regression model has been fitted. To evaluate the model and understand the relationship between the variables, I will display the model's coefficients and intercept. This will show how 'qtd' and 'category_SUV' influence the 'Valor' (pollutant concentration).



In [46]:
# Display the model coefficients and intercept
print("Model Intercept:", model.intercept_)
print("Model Coefficients:", model.coef_)


Model Intercept: 361.82408491078775
Model Coefficients: [9.04244219e-05 2.33827486e+02]


## Model evaluation

### Subtask:
Evaluate the model's performance to determine the significance of the relationship between SUV count and pollution.


**Reasoning**:
Calculate and print the Mean Squared Error and R-squared score to evaluate the model's performance.



In [47]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the 'Valor' using the fitted model
y_pred = model.predict(X)

# Calculate the Mean Squared Error
mse = mean_squared_error(y, y_pred)

# Calculate the R-squared score
r2 = r2_score(y, y_pred)

# Print the results
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2) score:", r2)


Mean Squared Error (MSE): 880821.5135109796
R-squared (R2) score: 0.32834779537113634


**Reasoning**:
Interpret the calculated MSE and R-squared values in the context of the problem.



In [48]:
print("\nInterpretation of Model Performance:")
print(f"The Mean Squared Error (MSE) of {mse:.2f} indicates the average squared difference between the actual and predicted CO pollution values. A lower MSE indicates a better fit of the model to the data.")
print(f"The R-squared score of {r2:.4f} means that approximately {r2*100:.2f}% of the variance in the CO pollution levels can be explained by the linear model with 'qtd' and 'category_SUV' as predictors.")
print("An R-squared of 0.3283 suggests that while the model captures some of the variability in CO levels, a significant portion remains unexplained by these features alone. This implies that other factors not included in this model also influence CO pollution.")


Interpretation of Model Performance:
The Mean Squared Error (MSE) of 880821.51 indicates the average squared difference between the actual and predicted CO pollution values. A lower MSE indicates a better fit of the model to the data.
The R-squared score of 0.3283 means that approximately 32.83% of the variance in the CO pollution levels can be explained by the linear model with 'qtd' and 'category_SUV' as predictors.
An R-squared of 0.3283 suggests that while the model captures some of the variability in CO levels, a significant portion remains unexplained by these features alone. This implies that other factors not included in this model also influence CO pollution.


## Summary:

### Data Analysis Key Findings

*   The fitted linear regression model has an intercept of approximately 361.82.
*   The coefficient for 'qtd' (quantity of vehicles) is approximately 9.04e-05.
*   The coefficient for 'category\_SUV' is approximately 233.83.
*   The Mean Squared Error (MSE) of the model is approximately 880,821.51.
*   The R-squared score of the model is approximately 0.3283. This indicates that about 32.83% of the variance in CO pollution levels can be explained by the model using 'qtd' and 'category\_SUV' as predictors.

### Insights or Next Steps

*   The positive coefficient for 'category\_SUV' suggests that, holding other factors constant, being an SUV is associated with an increase in the pollutant 'Valor' (likely CO concentration). The coefficient value of approximately 233.83 indicates the estimated average increase in 'Valor' attributed to the SUV category in this model.
*   Since the R-squared value is relatively low (0.3283), a significant portion of the variability in CO pollution levels remains unexplained by vehicle quantity and the SUV category alone. Future analysis should consider incorporating additional factors that might influence pollution, such as other vehicle types, environmental conditions, traffic density, or regulatory changes.
