In [2]:
## import tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay

from sklearn.model_selection import train_test_split

In [3]:
my_data = pd.read_csv("data/train.csv")

## Data exploration
my_data = my_data.drop("title_index", axis = 1)

In [4]:
%%capture
my_data["country"] = my_data["country"].fillna("Unknown")
my_data["language"] = my_data["language"].fillna("Unknown")
my_data["director"].fillna("Unknown", inplace = True)
my_data["writer"].fillna("Unknown", inplace = True)
my_data["production_company"].fillna("Unknown", inplace = True)
my_data["actors"].fillna("Unknown", inplace = True)
my_data["description"].fillna("Unknown", inplace = True)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
column_transformer = ColumnTransformer(
    transformers = [
        ('description_tranform', CountVectorizer(stop_words = 'english'), 'description')
    ],
    remainder = 'drop'
)
transformed_data = column_transformer.fit_transform(my_data)

In [6]:
count_vec = column_transformer.named_transformers_["description_tranform"]
feature_names = count_vec.get_feature_names_out()
prefixed_feature_names = [feature for feature in feature_names]

sparse_mat = pd.DataFrame.sparse.from_spmatrix(transformed_data, columns = prefixed_feature_names)
cols = []
for col in my_data.columns:
    cols.append(col)
sparse_mat.index = my_data.index

In [7]:
my_data["popularity"] = (my_data["popularity"] == "popular").astype(int)

In [8]:
X = sparse_mat
y = my_data["popularity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2002)

In [9]:
models = {"Logistic Regression": LogisticRegression(verbose=1, max_iter = 1000),
           "Multinomial NB": MultinomialNB(),
             "BernoulliNB": BernoulliNB(),
             "ComplementNB": ComplementNB()}
        
def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        if name == "GaussianNB":
            X_dense = X_train.toarray()
            model.fit(X_dense, y_train)
        else:
            model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

In [10]:
model_score = fit_and_score(models = models, 
                            X_train= X_train, X_test = X_test, 
                            y_train = y_train, y_test = y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s finished


In [11]:
model_score

{'Logistic Regression': 0.622171602126044,
 'Multinomial NB': 0.6433561123766135,
 'BernoulliNB': 0.6436598329536826,
 'ComplementNB': 0.6406226271829917}

# Hyperparameter tuning

In [12]:
train_scores = []
test_scores = []

#linear regression
penalty = ['l2', 'none']
C = [2, 5, 10, 20]
solver = ["lbfgs", "sag", "saga", "newton-cg"]
class_weight = ["balanced", None]
multi_class = ["auto", "ovr", "multinomial"]
max_iter = [1000, 2000]

logis_reg = LogisticRegression(penalty = 'l2', C = 1, solver = 'saga', class_weight = None, multi_class = "auto", max_iter = 1000)
# logis_reg.fit(X_train, y_train)
# test_scores.append(logis_reg.score(X_test, y_test))

In [13]:
# Hyperparameter tuning with RandomizedSearchCV
# Create a hyperparameter grid for LogisticRegression

log_reg_grid = {"solver": ["liblinear"]}


In [14]:
np.random.seed(42)

rs_log_reg = RandomizedSearchCV(LogisticRegression(), param_distributions = log_reg_grid, cv =5, n_iter = 20, verbose = True)

# Fit random hyperparameter search model for LogisticRegression
rs_log_reg.fit(X_train, y_train)




Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [15]:
multinomial_nb = {"alpha": np.logspace(-5, 5, 5) , "force_alpha": [False], "fit_prior": [False]}

In [16]:
np.random.seed(42)
rs_log_reg = RandomizedSearchCV(ComplementNB(), param_distributions = multinomial_nb, cv = 5, n_iter = 20, verbose = True)
rs_log_reg.fit(X_train, y_train)



Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [17]:
rs_log_reg.score(X_test, y_test)

0.6406226271829917

In [19]:
clf = models["BernoulliNB"]

In [23]:
log_prob = clf.feature_log_prob_
feature_names = X_train.columns

# You can then look at the highest and lowest log probabilities
# to understand which features are most indicative of each class
print("Top indicative features for class 0:")
sorted_features_0 = sorted(zip(feature_names, log_prob[0]), key=lambda x: x[1], reverse=True)
for feature, log_prob in sorted_features_0[:10]:  # Just top 10 features
    print(f"{feature}: {log_prob}")

print("\nTop indicative features for class 1:")
sorted_features_1 = sorted(zip(feature_names, log_prob[1]), key=lambda x: x[1], reverse=True)
for feature, log_prob in sorted_features_1[:10]:  # Just top 10 features
    print(f"{feature}: {log_prob}")

Top indicative features for class 0:
young: -2.1960341516184956
man: -2.4833338879922104
life: -2.4987425133450554
woman: -2.6779632924607615
love: -2.7281166144099007
new: -2.7961333999237965
family: -2.8108756816610008
old: -3.006295375598854
story: -3.023641864322708
friends: -3.0412945733790755

Top indicative features for class 1:


IndexError: invalid index to scalar variable.