In [130]:
#!pip install scikit-plot

In [131]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [132]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

In [133]:
df_train = pd.read_csv('train_log.csv')

In [134]:
df_train.head()

Unnamed: 0,originalTitle,rating,startYear,runtimeMinutes,awardWins,numVotes,totalImages,totalVideos,totalCredits,titleType,AwNmExWins,canHaveEpisodes,isAdult,numRegions,countryOfOrigin,genres,ratingMean,ReviewsTotal
0,Neogenic Nightmare Chapter 3: Hydro-Man,"(7, 8]",1995,40,0.0,6.659294,0.693147,0.0,21,tvEpisode,0.0,False,False,1,['US'],"Action,Adventure,Animation",7.5,0.693147
1,Looping,"(5, 6]",1982,28,0.0,2.484907,0.693147,0.0,1,videoGame,0.0,False,False,1,['XWG'],Action,5.5,0.0
2,Idealnaya para,"(5, 6]",1992,92,0.0,3.663562,0.693147,0.0,24,movie,0.0,False,False,1,['RU'],Comedy,5.5,0.693147
3,MasterChef Celebrity México,"(5, 6]",2021,53,0.0,2.772589,3.135494,0.0,56,tvSeries,0.0,True,False,1,['MX'],Reality-TV,5.5,0.0
4,Seutateueob,"(7, 8]",2020,80,0.693147,9.662625,5.852202,1.098612,47,tvSeries,0.693147,True,False,32,['KR'],"Comedy,Drama,Romance",7.5,6.124683


In [135]:
mapping = {
    'short': 'short',
    'tvShort': 'short',
    'movie': 'movie',
    'tvMovie': 'movie',
    'tvSpecial': 'special',
    'videogame': 'videogame',
    'video': 'video',
    'tvMiniseries': 'tvseries_related',
    'tvSeries': 'tvseries_related',
    'tvEpisode': 'tvseries_related'
}

df_train['grouped_titleType'] = df_train['titleType'].replace(mapping)

In [136]:
# X = df_train.values
# y = np.array(df_train['titleType'])

In [137]:
del df_train['originalTitle']
del df_train['rating']
del df_train['genres']
del df_train['countryOfOrigin']
del df_train['isAdult']
del df_train['canHaveEpisodes']
del df_train['titleType'] 

## Partitioning

In [138]:
df_train.head()

Unnamed: 0,startYear,runtimeMinutes,awardWins,numVotes,totalImages,totalVideos,totalCredits,AwNmExWins,numRegions,ratingMean,ReviewsTotal,grouped_titleType
0,1995,40,0.0,6.659294,0.693147,0.0,21,0.0,1,7.5,0.693147,tvseries_related
1,1982,28,0.0,2.484907,0.693147,0.0,1,0.0,1,5.5,0.0,videoGame
2,1992,92,0.0,3.663562,0.693147,0.0,24,0.0,1,5.5,0.693147,movie
3,2021,53,0.0,2.772589,3.135494,0.0,56,0.0,1,5.5,0.0,tvseries_related
4,2020,80,0.693147,9.662625,5.852202,1.098612,47,0.693147,32,7.5,6.124683,tvseries_related


In [139]:
from sklearn.model_selection import train_test_split

In [140]:
X = df_train.drop(columns=['grouped_titleType'])
y = df_train['grouped_titleType']

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
# with stratify
print(np.unique(y, return_counts=True)[1] / len(y))
print(np.unique(y_train, return_counts=True)[1] / len(y_train))
print(np.unique(y_test, return_counts=True)[1] / len(y_test))

[0.3855573  0.15510204 0.00929356 0.01312402 0.37513344 0.04803768
 0.01375196]
[0.3855573  0.15510204 0.0093145  0.01308216 0.37519623 0.04803768
 0.0137101 ]
[0.3855573  0.15510204 0.00926217 0.01318681 0.37503925 0.04803768
 0.01381476]


### Normalization

In [142]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [143]:
norm = MinMaxScaler()
norm.fit(X_train)

X_train_norm = norm.transform(X_train)
X_test_norm = norm.transform(X_test)

## Nayve Bayesian

In [144]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB

In [145]:
clf = GaussianNB()

In [146]:
%%time
clf.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 9.28 ms


In [147]:
y_pred = clf.predict(X_test)
y_pred

array(['tvseries_related', 'tvseries_related', 'short', ..., 'video',
       'movie', 'tvseries_related'], dtype='<U16')

In [148]:
print(classification_report(y_test, y_pred))

                  precision    recall  f1-score   support

           movie       0.88      0.58      0.70      2456
           short       0.70      0.85      0.76       988
         special       0.22      0.03      0.06        59
    tvMiniSeries       0.28      0.13      0.18        84
tvseries_related       0.80      0.52      0.63      2389
           video       0.11      0.69      0.19       306
       videoGame       0.71      0.28      0.41        88

        accuracy                           0.59      6370
       macro avg       0.53      0.44      0.42      6370
    weighted avg       0.77      0.59      0.65      6370



In [149]:
clf.predict_proba(X_test)

array([[1.55767840e-001, 7.10980655e-014, 2.35569084e-002, ...,
        4.59857468e-001, 3.54843031e-001, 0.00000000e+000],
       [3.27581678e-003, 2.07611668e-006, 1.61074165e-002, ...,
        5.57515155e-001, 4.20564409e-001, 8.53886936e-125],
       [3.36839280e-004, 9.88175432e-001, 2.52792348e-004, ...,
        1.09094235e-002, 2.63133195e-004, 2.20211167e-067],
       ...,
       [3.78243355e-001, 1.23260475e-025, 6.19081538e-002, ...,
        8.61397974e-003, 5.43549666e-001, 0.00000000e+000],
       [9.89569471e-001, 4.56326643e-055, 5.41855724e-009, ...,
        1.43535696e-003, 3.74509505e-014, 0.00000000e+000],
       [1.40336230e-003, 3.53830500e-002, 1.43731387e-002, ...,
        5.57806371e-001, 3.90223925e-001, 1.63651630e-030]])

In [150]:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
plot_roc(y_test, clf.predict_proba(X_test))
plt.show()
print(roc_auc_score(y_test, clf.predict_proba(X_test), multi_class="ovr", average="macro"))

NameError: name 'plot_roc' is not defined

### K-fold cross validation

In [151]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Supponiamo di avere X e y come feature e target
clf = GaussianNB()

# Esegui la cross-validation k-fold con k=5
scores = cross_val_score(clf, X, y, cv=100)

print("Cross-Validation Scores: ", scores)
print("Mean Accuracy: ", scores.mean())

Cross-Validation Scores:  [0.6375     0.65       0.60625    0.65       0.6375     0.575
 0.6375     0.55       0.60625    0.5875     0.5625     0.58125
 0.64375    0.625      0.64375    0.60625    0.625      0.61875
 0.5875     0.61875    0.625      0.65625    0.6375     0.6125
 0.56875    0.61006289 0.62893082 0.61006289 0.57232704 0.57861635
 0.57861635 0.53459119 0.57861635 0.57861635 0.66037736 0.64150943
 0.62893082 0.57232704 0.55345912 0.60377358 0.53459119 0.54716981
 0.62893082 0.56603774 0.59748428 0.57232704 0.57232704 0.64150943
 0.64150943 0.62264151 0.56603774 0.64150943 0.52201258 0.56603774
 0.57861635 0.6163522  0.61006289 0.60377358 0.65408805 0.62264151
 0.57861635 0.52201258 0.63522013 0.60377358 0.63522013 0.66666667
 0.6163522  0.5408805  0.55974843 0.63522013 0.55974843 0.60377358
 0.66666667 0.63522013 0.60377358 0.61006289 0.57861635 0.57861635
 0.61006289 0.60377358 0.60377358 0.65408805 0.6163522  0.57232704
 0.62893082 0.63522013 0.58490566 0.63522013 0.6226

### Evaluation

In [152]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Supponiamo di avere X e y come feature e target del dataset di addestramento
X_train = X
y_train = y

# Caricare il dataset di test
df_test = pd.read_csv('test_log.csv')

In [153]:
mapping = {
    'short': 'short',
    'tvShort': 'short',
    'movie': 'movie',
    'tvMovie': 'movie',
    'tvSpecial': 'special',
    'videogame': 'videogame',
    'video': 'video',
    'tvMiniseries': 'tvseries_related',
    'tvSeries': 'tvseries_related',
    'tvEpisode': 'tvseries_related'
}

df_test['grouped_titleType'] = df_test['titleType'].replace(mapping)

In [154]:
del df_test['originalTitle']
del df_test['rating']
del df_test['genres']
del df_test['countryOfOrigin']
del df_test['isAdult']
del df_test['canHaveEpisodes']
del df_test['titleType'] 

In [155]:
X_test = df_test.drop('grouped_titleType', axis=1)
y_test = df_test['grouped_titleType']

# Inizializzare e addestrare il classificatore Naive Bayes sull'intero dataset di addestramento
clf = GaussianNB()
clf.fit(X_train, y_train)

# Fare previsioni sui dati di test
y_pred = clf.predict(X_test)

# Valutare la performance del modello sui dati di test
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Report di classificazione
report = classification_report(y_test, y_pred)
print(report)

# Matrice di confusione
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

Accuracy: 0.5937324075811596
                  precision    recall  f1-score   support

           movie       0.86      0.57      0.68      2094
           short       0.69      0.85      0.77       770
         special       0.06      0.07      0.06        46
    tvMiniSeries       0.24      0.14      0.18        78
tvseries_related       0.81      0.56      0.66      2016
           video       0.11      0.68      0.19       242
       videoGame       0.00      0.00      0.00        83

        accuracy                           0.59      5329
       macro avg       0.40      0.41      0.36      5329
    weighted avg       0.75      0.59      0.64      5329

[[1194    7   33   12  185  663    0]
 [  26  658    0    0   56   27    3]
 [  10    1    3    4    1   27    0]
 [  15    3    3   11   12   34    0]
 [ 111  241    0    9 1134  519    2]
 [  23   38    3    1   13  164    0]
 [  14    0    5    8    0   56    0]]


#### Categorical

In [1]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report

# Caricare il dataset
df_train = pd.read_csv('train_log.csv')

# Mappare i tipi di titolo
mapping = {
    'short': 'short',
    'tvShort': 'short',
    'movie': 'movie',
    'tvMovie': 'movie',
    'tvSpecial': 'special',
    'videogame': 'videogame',
    'video': 'video',
    'tvMiniseries': 'tvseries_related',
    'tvSeries': 'tvseries_related',
    'tvEpisode': 'tvseries_related'
}

df_train['grouped_titleType'] = df_train['titleType'].replace(mapping)

# Trasformare la variabile awardWins in vero o falso se >0
df_train['awardWins_bol'] = df_train['awardWins'] > 0

# Convertire countryOfOrigin in stringhe
df_train['countryOfOrigin'] = df_train['countryOfOrigin'].apply(lambda x: str(x))

# Aggiungere una categoria "sconosciuta" ai dati di addestramento
for column in ['rating', 'countryOfOrigin', 'genres']:
    df_train[column] = df_train[column].astype(str) + '_known'

# Codificare le variabili categoriche
label_encoders = {}
categorical_columns = ['rating', 'countryOfOrigin', 'genres', 'canHaveEpisodes', 'isAdult', 'awardWins_bol']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df_train[column] = label_encoders[column].fit_transform(df_train[column].astype(str))

# Codificare la variabile target
label_encoders['grouped_titleType'] = LabelEncoder()
df_train['grouped_titleType'] = label_encoders['grouped_titleType'].fit_transform(df_train['grouped_titleType'])

# Definire le caratteristiche e la variabile target
X = df_train[categorical_columns]
y = df_train['grouped_titleType']

# Inizializzare il classificatore Naive Bayes
model = CategoricalNB()

# Eseguire il k-fold cross-validation
kf = KFold(n_splits=100, shuffle=True, random_state=1)
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Calcolare l'accuratezza media e stampare i risultati
print(f"Cross-Validation Scores: {scores}")
print(f"Mean Accuracy: {scores.mean()}")

# Addestrare il modello sull'intero dataset
model.fit(X, y)

# Fare previsioni
y_pred = model.predict(X)

# Creare il report di classificazione
report = classification_report(y, y_pred, target_names=label_encoders['grouped_titleType'].classes_)
print(report)

Traceback (most recent call last):
  File "c:\Users\Lavigi\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
        cached_call, estimator, *args, **routed_params.get(name).score
    )
  File "c:\Users\Lavigi\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
        estimator, response_method.__name__, X, pos_label=pos_label
    )
  File "c:\Users\Lavigi\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\metrics\_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *args, response_method=response_method, **kwargs
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Lavigi\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_response.py", line 211, in _get_respon

Cross-Validation Scores: [0.85       0.775      0.85625    0.78125    0.83125    0.80625
 0.83125    0.8125     0.80625    0.8125     0.775      0.83125
 0.8125     0.8        0.80625    0.81875    0.81875    0.83125
 0.8125     0.79375    0.8        0.84375    0.85       0.8125
 0.8125     0.7672956  0.79245283 0.81761006 0.79245283 0.78616352
 0.76100629 0.89308176 0.77358491 0.7672956  0.7672956  0.81761006
 0.78616352 0.85534591 0.82389937 0.77987421 0.8490566  0.7672956
 0.76100629 0.81761006 0.79874214 0.81761006 0.78616352 0.77358491
 0.81132075 0.78616352 0.79874214 0.8427673  0.8490566  0.81761006
 0.78616352 0.75471698 0.81132075 0.7672956  0.81761006 0.85534591
 0.77358491 0.79245283 0.82389937        nan 0.81132075 0.82389937
 0.78616352 0.77358491 0.85534591 0.81132075 0.8427673  0.75471698
 0.79874214 0.81761006 0.80503145 0.7672956  0.74842767 0.85534591
 0.81132075 0.78616352 0.79245283 0.78616352 0.81132075 0.79874214
 0.81132075 0.77358491 0.85534591 0.81132075 0.7861

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Caricare il dataset di addestramento
df_train = pd.read_csv('train_log.csv')

# Mappare i tipi di titolo
mapping = {
    'short': 'short',
    'tvShort': 'short',
    'movie': 'movie',
    'tvMovie': 'movie',
    'tvSpecial': 'special',
    'videogame': 'videogame',
    'video': 'video',
    'tvMiniseries': 'tvseries_related',
    'tvSeries': 'tvseries_related',
    'tvEpisode': 'tvseries_related'
}

df_train['grouped_titleType'] = df_train['titleType'].replace(mapping)

# Trasformare la variabile awardWins in vero o falso se >0
df_train['awardWins_bol'] = df_train['awardWins'] > 0

# Convertire countryOfOrigin in stringhe
df_train['countryOfOrigin'] = df_train['countryOfOrigin'].apply(lambda x: str(x))

# Codificare le variabili categoriche
label_encoders = {}
categorical_columns = ['rating', 'countryOfOrigin', 'genres', 'canHaveEpisodes', 'isAdult', 'awardWins_bol']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df_train[column] = label_encoders[column].fit_transform(df_train[column].astype(str))

# Codificare la variabile target
label_encoders['grouped_titleType'] = LabelEncoder()
df_train['grouped_titleType'] = label_encoders['grouped_titleType'].fit_transform(df_train['grouped_titleType'])

# Definire le caratteristiche e la variabile target
X = df_train[categorical_columns]
y = df_train['grouped_titleType']

# Dividere i dati in set di addestramento e test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inizializzare e addestrare il classificatore Naive Bayes
model = CategoricalNB()
model.fit(X_train, y_train)

# Fare previsioni sul set di test
y_pred = model.predict(X_test)

# Calcolare l'accuratezza
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Report di classificazione
report = classification_report(y_test, y_pred, target_names=label_encoders['grouped_titleType'].classes_)
print(report)

# Matrice di confusione
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Caricare il dataset di test separato
df_test = pd.read_csv('test_log.csv')

# Applicare le stesse trasformazioni al dataset di test
df_test['grouped_titleType'] = df_test['titleType'].replace(mapping)
df_test['awardWins_bol'] = df_test['awardWins'] > 0
df_test['countryOfOrigin'] = df_test['countryOfOrigin'].apply(lambda x: str(x))

# Gestire le categorie sconosciute nei dati di test
for column in ['rating', 'countryOfOrigin', 'genres']:
    df_test[column] = df_test[column].apply(lambda x: x if x in label_encoders[column].classes_ else label_encoders[column].classes_[0])

# Codificare le variabili categoriche nel dataset di test
for column in categorical_columns:
    df_test[column] = label_encoders[column].transform(df_test[column].astype(str))

df_test['grouped_titleType'] = label_encoders['grouped_titleType'].transform(df_test['grouped_titleType'])

# Definire le caratteristiche e la variabile target per il dataset di test
X_test_finale = df_test[categorical_columns]
y_test_finale = df_test['grouped_titleType']

# Fare previsioni sul dataset di test separato
y_pred_finale = model.predict(X_test_finale)

# Calcolare l'accuratezza sul dataset di test separato
accuracy_finale = accuracy_score(y_test_finale, y_pred_finale)
print(f"Final Test Accuracy: {accuracy_finale}")

# Report di classificazione sul dataset di test separato
report_finale = classification_report(y_test_finale, y_pred_finale, target_names=label_encoders['grouped_titleType'].classes_)
print(report_finale)

# Matrice di confusione sul dataset di test separato
conf_matrix_finale = confusion_matrix(y_test_finale, y_pred_finale)
print(conf_matrix_finale)

Accuracy: 0.8122448979591836
                  precision    recall  f1-score   support

           movie       0.83      0.84      0.84      1216
           short       0.88      0.90      0.89       477
         special       1.00      0.06      0.12        31
    tvMiniSeries       0.35      0.15      0.21        41
tvseries_related       0.79      0.88      0.83      1214
           video       0.69      0.38      0.49       159
       videoGame       0.33      0.02      0.04        47

        accuracy                           0.81      3185
       macro avg       0.70      0.46      0.49      3185
    weighted avg       0.80      0.81      0.80      3185

[[1023   11    0    0  157   25    0]
 [  19  431    0    0   27    0    0]
 [  11    1    2    0   17    0    0]
 [   0    0    0    6   35    0    0]
 [ 126   10    0   11 1063    3    1]
 [  32   39    0    0   26   61    1]
 [  20    0    0    0   26    0    1]]
Final Test Accuracy: 0.811033965096641
                  precis