In [1]:
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



In [6]:
# Load the full dataset, then carve out a reproducible 80/20 train/test split.
data = pd.read_csv('dataset.csv')
train_data, test_data = train_test_split(data, random_state=0, test_size=0.2)

# Persist both subsets (without the index column) so later cells can stream
# them back from disk.
for subset, path in ((train_data, "train_subset.csv"), (test_data, "test_subset.csv")):
    subset.to_csv(path, index=False)

In [7]:
def preprocess(df, n_variations=10000):
    """Widen ``df`` by appending scaled copies of itself column-wise.

    The result holds the original columns followed by ``df * i`` for every
    ``i`` in ``range(n_variations)``, concatenated along ``axis=1``. Column
    labels are repeated once per block, so the output has duplicate names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to expand; it is not modified.
    n_variations : int, default 10000
        Number of scaled blocks appended after the original columns.

    Returns
    -------
    pandas.DataFrame
        Frame with the same rows as ``df`` and
        ``(n_variations + 1) * df.shape[1]`` columns.
    """
    # ``df * i`` already allocates a new frame, so an explicit ``copy()``
    # beforehand would only double the temporary memory per block.
    variations = [df]
    variations.extend(df * i for i in range(n_variations))

    return pd.concat(variations, axis=1)

In [10]:
model = GaussianNB()

# partial_fit must receive the same full label set on every call. Deriving it
# from each chunk (y.unique()) breaks as soon as one chunk lacks a label, so
# read only the target column once up front and reuse that set for all chunks.
all_classes = pd.read_csv("train_subset.csv", usecols=['target'])['target'].unique()

# Stream the training file in 1000-row chunks so the widened feature matrix
# produced by preprocess() is only ever built for one chunk at a time.
iteraciones = 0  # stays 0 if the training file has no rows
for iteraciones, batch in enumerate(pd.read_csv("train_subset.csv", chunksize=1000), start=1):
    X = batch.drop('target', axis=1)
    y = batch['target']
    X_processed = preprocess(X)
    model.partial_fit(X_processed, y, classes=all_classes)

    print(f"Training batch {iteraciones} done")

Training batch 1 done
Training batch 2 done
Training batch 3 done
Training batch 4 done
Training batch 5 done
Training batch 6 done
Training batch 7 done
Training batch 8 done


In [11]:
# Reload the held-out split from disk and separate labels from features.
test_data = pd.read_csv("test_subset.csv")
y_test = test_data['target']
X_test = test_data.drop(columns='target')

# Apply the same feature expansion used during training, then predict.
X_test_processed = preprocess(X_test)
y_predicted = model.predict(X_test_processed)

# Per-class precision/recall/F1 on the held-out rows.
report = classification_report(y_test, y_predicted)
print("Classification report:", report, sep="\n")

Classification report:
              precision    recall  f1-score   support

           0       0.70      0.85      0.77      1003
           1       0.81      0.63      0.71       997

    accuracy                           0.74      2000
   macro avg       0.76      0.74      0.74      2000
weighted avg       0.76      0.74      0.74      2000

