# PRE-REQUISITES:
- R package: open an R session and run `install.packages("VGAM")`.
- Python bridge: install rpy2 into the environment that runs this notebook, e.g. `conda install rpy2`.

In [2]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Value passed as `rank=` to the VGAM fit in a later cell.
desired_rank = 2

# Seed the legacy global NumPy RNG so the synthetic data set is reproducible.
np.random.seed(42)

# The draw order (x1, then x2, then y) determines the generated values,
# so it must not be rearranged.
x1_values = np.random.rand(100)
x2_values = np.random.rand(100)
y_values = np.random.choice([0, 1], size=100)

# Two uniform predictors and a binary response, 100 rows each.
data = {'x1': x1_values, 'x2': x2_values, 'y': y_values}
df = pd.DataFrame(data)

In [4]:
# Convert the pandas DataFrame to an R data.frame.
# The conversion rules are scoped with `localconverter` instead of being
# switched on process-wide through `pandas2ri.activate()`, which rpy2
# deprecates; this also matches how the prediction cell converts results back.
with localconverter(ro.default_converter + pandas2ri.converter):
    r_df = ro.conversion.py2rpy(df)

# Load the VGAM package
vgam = importr('VGAM')

# Model formula in R syntax: response `y`, predictors `x1` and `x2`
formula = ro.Formula('y ~ x1 + x2')

# Fit the categorical (multinomial) regression.
# NOTE(review): `rank` does not appear to be a formal argument of `vglm`, so
# it is presumably absorbed by `...` and has no effect on the fit. VGAM's
# reduced-rank fitter is `rrvglm(..., Rank=)`; confirm what was intended
# before relying on `desired_rank`.
vglm_model = vgam.vglm(formula, family=vgam.multinomial(), data=r_df, rank=desired_rank)

In [5]:
# Predict on new data (using the same DataFrame for simplicity).
# With type="response" the recorded output shows one probability column per class.
predictions_r = ro.r['predict'](vglm_model, newdata=r_df, type="response")

# Convert the R object to a numpy array or pandas DataFrame
with localconverter(ro.default_converter + pandas2ri.converter):
    predictions_np = ro.conversion.rpy2py(predictions_r)

# Always bind `predictions_df`. Previously it was only assigned when the
# converted object was an ndarray, so any other result made the print below
# raise NameError (or silently reuse a stale frame from an earlier run).
if isinstance(predictions_np, pd.DataFrame):
    predictions_df = predictions_np
else:
    predictions_df = pd.DataFrame(np.asarray(predictions_np))

print(predictions_df)


           0         1
0   0.353739  0.646261
1   0.432570  0.567430
2   0.381396  0.618604
3   0.438463  0.561537
4   0.576089  0.423911
..       ...       ...
95  0.412460  0.587540
96  0.496772  0.503228
97  0.546406  0.453594
98  0.584398  0.415602
99  0.551525  0.448475

[100 rows x 2 columns]


In [6]:
# Ground truth: the 'y' column the model was fitted on.
true_labels = df.loc[:, 'y']

# Pick the most probable class per row. The column label returned by idxmax
# is used directly as the class label, which relies on the probability
# columns being labelled 0 and 1, matching the 0/1 values stored in 'y'.
predicted_labels = predictions_df.idxmax(axis='columns').astype(int)

# Compute the three metrics up front, then report them in the original order.
accuracy = accuracy_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)
class_report = classification_report(true_labels, predicted_labels)

print(f'Accuracy: {accuracy}')
for heading, body in (
    ('Confusion Matrix:', conf_matrix),
    ('Classification Report:', class_report),
):
    print(heading)
    print(body)


Accuracy: 0.57
Confusion Matrix:
[[16 29]
 [14 41]]
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.36      0.43        45
           1       0.59      0.75      0.66        55

    accuracy                           0.57       100
   macro avg       0.56      0.55      0.54       100
weighted avg       0.56      0.57      0.55       100

