In [56]:
from google.colab import drive

# Mount Google Drive so files stored there can be read through the local filesystem.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importação de bibliotecas

In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [58]:
# Load the ';'-separated CSV export from Drive. Malformed rows only trigger a warning;
# switch on_bad_lines to 'error' to halt execution instead.
csv_path = '/content/drive/MyDrive/projetos/portoCase/resultado_select_exe1.csv'
df = pd.read_csv(csv_path, sep=';', on_bad_lines='warn')

In [59]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,matricula,AnoMes_Formatted,leads,trabalhado,abordado,negociado,rec_orcamento,cotado,transm_bru,transm_liq,...,%Cotado,%Conversão_Bruta,%Conversão_Líquida,%Eficiência,AnoMes_Formatted:1,monitoria,sac,tempo_logado,Status_Meta,Operador_Sequencial
0,4400000,202101,4,4,4,4,2,2,0,0,...,5,0,0,0,202101,,,,abaixo da meta,1
1,4400000,202107,14,14,14,8,4,4,0,0,...,285714286,0,0,0,202107,,,,abaixo da meta,1
2,4400001,202102,12,12,12,12,2,10,2,2,...,833333333,2,2,166666667,202102,,,,abaixo da meta,1
3,4400001,202103,12,12,12,4,0,4,0,0,...,333333333,0,0,0,202103,,,,abaixo da meta,1
4,4400001,202109,14,14,14,12,4,8,0,0,...,571428571,0,0,0,202109,,,,abaixo da meta,1


In [71]:
# Print each column's name, non-null count and dtype to spot missing values and
# numeric columns that were read as text (dtype object).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827 entries, 0 to 1826
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   matricula            1827 non-null   int64  
 1   AnoMes_Formatted     1827 non-null   int64  
 2   leads                1827 non-null   int64  
 3   trabalhado           1827 non-null   int64  
 4   abordado             1827 non-null   int64  
 5   negociado            1827 non-null   int64  
 6   rec_orcamento        1827 non-null   int64  
 7   cotado               1827 non-null   int64  
 8   transm_bru           1827 non-null   int64  
 9   transm_liq           1827 non-null   int64  
 10  rec_canc_transm      1827 non-null   int64  
 11  premio_transm        1827 non-null   int64  
 12  premio_rec_canc      1827 non-null   int64  
 13  premio_emi           1827 non-null   int64  
 14  ticket_med_bru       1827 non-null   object 
 15  ticket_med_lid       1827 non-null   o

# Pré-processamento dos dados


In [60]:
# Drop the columns that are not relevant for the model.
irrelevant_columns = [
    'matricula',
    'AnoMes_Formatted',
    'monitoria',
    'sac',
    'tempo_logado',
    'Operador_Sequencial',
]
data = df.drop(columns=irrelevant_columns)


In [61]:
# Replace missing values, if any, with 0.
data = data.fillna(value=0)

In [62]:
# Encode the categorical Status_Meta labels as integer codes. The fitted encoder stays
# available as `label_encoder`.
label_encoder = LabelEncoder().fit(data['Status_Meta'])
data['Status_Meta'] = label_encoder.transform(data['Status_Meta'])

In [63]:
# Columns that hold numbers stored as text with a decimal comma.
num_cols = ['%Trabalhado', '%Abordado', '%Cotado', '%Conversão_Bruta', '%Conversão_Líquida', '%Eficiência']

# Convert every listed column that is not already numeric to float.
# The check uses is_numeric_dtype rather than `dtype == 'object'` so that text columns
# stored with pandas' dedicated string dtype are converted as well.
for col in num_cols:
    if not pd.api.types.is_numeric_dtype(data[col]):
        # Cast to str first: the earlier fillna(0) can leave integer 0 placeholders in a
        # text column, and the .str accessor returns NaN for non-string cells, which
        # would silently undo the fill. With the cast, those cells become 0.0.
        data[col] = (
            data[col]
            .astype(str)
            .str.replace(',', '.', regex=False)  # literal decimal comma -> dot
            .astype(float)
        )

In [64]:
# Independent variables (X) and dependent variable (y).
# NOTE(review): in the five rows shown by df.head(), negociado equals
# rec_orcamento + cotado. If that identity holds for the whole dataset, these two
# features reveal the target directly -- confirm before trusting the scores.
feature_cols = [
    'leads', 'trabalhado', 'abordado',
    'rec_orcamento', 'cotado',
    'transm_bru', 'transm_liq', 'premio_transm',
    '%Trabalhado', '%Abordado', '%Cotado',
]
target_col = 'negociado'

X = data[feature_cols]
y = data[target_col]

In [65]:
# Hold out 20% of the rows for testing and train on the remaining 80%; the fixed
# random_state keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# Modelos de Análise Preditiva

In [66]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are tuned below.
rf = RandomForestRegressor(random_state=42)

# Search space: 3 x 4 x 3 = 36 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated grid search on the training split, ranking candidates by
# negative mean squared error and using every available core.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print(f"Melhores parâmetros: {grid_search.best_params_}")

# Score the refitted best estimator on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest - MSE: {mse_rf}, R²: {r2_rf}")


Melhores parâmetros: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest - MSE: 0.4289688524590162, R²: 0.9902190654087254


In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only and then
# applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): LogisticRegression is a classifier, so every distinct value in y_train
# is treated as a separate class label, and the MSE / R² below are computed on those
# predicted labels. Confirm that a classifier (rather than a linear regressor) is what
# is intended for this target.
logreg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Predict on the scaled test split and score with the same metrics as the other models.
y_pred_logreg = logreg.predict(X_test_scaled)
mse_logreg = mean_squared_error(y_test, y_pred_logreg)
r2_logreg = r2_score(y_test, y_pred_logreg)
print(f"Logistic Regression - MSE: {mse_logreg}, R²: {r2_logreg}")


Logistic Regression - MSE: 2.120218579234973, R²: 0.951656818149323


In [68]:
import xgboost as xgb

# Gradient-boosted regressor with a squared-error objective and 100 boosting rounds.
# random_state is the seed argument documented for the scikit-learn wrapper, and it
# matches the random_state=42 convention used by the train/test split and the
# Random Forest.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xg_reg.fit(X_train, y_train)

# Predict on the held-out test split and score with the same metrics as the other models.
y_pred_xgb = xg_reg.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost - MSE: {mse_xgb}, R²: {r2_xgb}")

XGBoost - MSE: 0.22881534521070476, R²: 0.9947827458381653


In [69]:
# No install step is needed here: ace_tools is never imported by this notebook (the
# comparison table is shown with plain pandas), so the unpinned `pip install ace_tools`
# was removed to avoid pulling in an unused third-party package.



In [70]:
import pandas as pd

# Collect the test-set metrics computed for each model into one comparison table.
model_names = ['Random Forest (GridSearchCV)', 'Logistic Regression', 'XGBoost']
model_results = {
    'Model': model_names,
    'MSE': [mse_rf, mse_logreg, mse_xgb],
    'R²': [r2_rf, r2_logreg, r2_xgb],
}
results_df = pd.DataFrame(model_results)

# Show the table with plain pandas output (no ace_tools dependency).
print("Model Comparison Results:")
print(results_df)

Model Comparison Results:
                          Model       MSE        R²
0  Random Forest (GridSearchCV)  0.428969  0.990219
1           Logistic Regression  2.120219  0.951657
2                       XGBoost  0.228815  0.994783
