# Modeling Notebook

Welcome to the Modeling Notebook. This notebook focuses on building and evaluating machine learning models based on the insights gained from EDA.

## Objectives:
- **Model Selection**: Trying out different models to see which one performs the best.
- **Model Tuning**: Optimizing the performance of the selected model.
- **Model Evaluation**: Assessing the performance of the model using appropriate metrics.

## Dataset:
In this notebook, we will be working with the cleaned dataset located at `./data/features/movies_dataset.parquet`, which is the result of the feature engineering process in the preceding EDA notebook.


In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the feature table into `df` and display each column's dtype.
# NOTE(review): the introduction cites `./data/features/movies_dataset.parquet`
# while this cell reads 'dataset.parquet' from the working directory -- confirm
# which location is current.
dataset_path = 'dataset.parquet'
df = pd.read_parquet(dataset_path)
df.dtypes

release_year                   object
release_month                  object
runtime                         int64
rated                          object
collection                      int64
is_english                      int64
title_vector                  float64
overview_vector               float64
tagline_vector                float64
plot_vector                   float64
overview_sentiment            float64
title_sentiment               float64
tagline_sentiment             float64
plot_sentiment                float64
era                          category
is_summer                       int64
is_autumn                       int64
action                          int64
adventure                       int64
animation                       int64
comedy                          int64
crime                           int64
drama                           int64
family                          int64
fantasy                         int64
history                         int64
horror      

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# Target columns; the training loops in this notebook fit one model per entry.
labels = [
    'numerical_ROI_category',
    'numerical_rating_category',
    'numerical_award_category',
]

# Candidate predictor columns (order unchanged, grouped here for readability).
features = [
    # release, runtime, rating and language
    'release_year', 'release_month', 'runtime', 'rated', 'collection',
    'is_english',
    # text-derived vectors and sentiment scores
    'title_vector', 'overview_vector', 'tagline_vector', 'plot_vector',
    'overview_sentiment', 'title_sentiment', 'tagline_sentiment',
    'plot_sentiment',
    # era / season flags
    'era', 'is_summer', 'is_autumn',
    # genre indicator columns
    'action', 'adventure', 'animation', 'comedy', 'crime', 'drama',
    'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance',
    'science fiction', 'thriller', 'war', 'western',
    # production counts
    'num_spoken_languages', 'num_genres', 'num_production_companies',
    'num_production_countries', 'is_foreign',
    # crew and cast popularity / size
    'director_popularity', 'writer_popularity', 'producer_popularity',
    'average_crew_popularity', 'number_crew_members',
    'average_cast_popularity', 'number_cast_members',
    'top_cast_popularity',
]




### LEARNING

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# NOTE(review): this cell repeats the labels/features definitions of the
# preceding code cell with identical values.

# Target columns used by the per-label training loops below.
labels = [
    'numerical_ROI_category',
    'numerical_rating_category',
    'numerical_award_category',
]

# The full predictor list is assembled from three consecutive runs of columns;
# concatenation keeps the original ordering.
_metadata_and_text = [
    'release_year', 'release_month', 'runtime', 'rated', 'collection',
    'is_english', 'title_vector',
    'overview_vector', 'tagline_vector', 'plot_vector',
    'overview_sentiment', 'title_sentiment', 'tagline_sentiment',
    'plot_sentiment', 'era', 'is_summer', 'is_autumn',
]
_genre_flags = [
    'action', 'adventure', 'animation', 'comedy', 'crime', 'drama',
    'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance',
    'science fiction', 'thriller', 'war', 'western',
]
_production_and_people = [
    'num_spoken_languages',
    'num_genres', 'num_production_companies', 'num_production_countries',
    'is_foreign', 'director_popularity', 'writer_popularity',
    'producer_popularity', 'average_crew_popularity', 'number_crew_members',
    'average_cast_popularity', 'number_cast_members',
    'top_cast_popularity',
]
features = _metadata_and_text + _genre_flags + _production_and_people




#### LogisticRegression

In [None]:
# Logistic-regression baseline: for each target, fit on the training matrix,
# report accuracy on the validation matrix, then report the mean of a 5-fold
# cross-validation run over the training matrix.
# X_train_t / X_val_t / y_train / y_val are defined outside this cell.
for label in labels:
    print(f"Training for {label}")

    target_train = y_train[label]

    lr = LogisticRegression(max_iter=1000, random_state=42)
    lr.fit(X_train_t, target_train)

    y_pred = lr.predict(X_val_t)
    val_accuracy = accuracy_score(y_val[label], y_pred)
    print(f"Logistic Regression Accuracy: {val_accuracy}")

    # cross_val_score refits clones of `lr` on each fold
    fold_scores = cross_val_score(lr, X_train_t, target_train, cv=5)
    lr_cv_score = fold_scores.mean()
    print(f"Logistic Regression Mean CV Score: {lr_cv_score}")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Grid search over regularisation strength and solver for a one-vs-rest
# logistic regression, run separately for every target in `labels`.
# The `estimator__` prefix addresses parameters of the LogisticRegression
# wrapped inside OneVsRestClassifier.
param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__solver': ['liblinear', 'newton-cg', 'lbfgs']
}

for label in labels:
    # Name the target first; otherwise the result blocks printed for the
    # different labels cannot be told apart.
    print(f"Training for {label}")

    # Seeded like the other LogisticRegression models in this notebook so
    # repeated runs give the same search result.
    estimator = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
    grid_search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')

    grid_search.fit(X_train_t, y_train[label])

    # Best parameter combination found by cross-validation
    best_params = grid_search.best_params_
    print(f'Best parameters: {best_params}')

    # Mean cross-validated accuracy of that combination
    best_score = grid_search.best_score_
    print(f'Best cross-validation score: {best_score}')

    # Accuracy of the refitted best estimator on the validation matrix
    val_score = grid_search.score(X_val_t, y_val[label])
    print(f'Validation score: {val_score}')


In [None]:
# Leave-one-feature-out ablation with logistic regression.
# For every target: fit a baseline on all features, refit once per entry in
# `features` with that column removed from the raw frames, and plot how far the
# validation accuracy moves.

# A single hyper-parameter set is shared by the baseline and the ablated fits,
# so an accuracy change reflects the removed feature rather than a different
# regularisation strength (the ablated fits previously used C=10 against a
# C=1 baseline).
lr_params = dict(solver='liblinear', C=1, max_iter=1000, random_state=42)

for label in labels:

    print(label)

    # Baseline on the full matrices (X_train_t / X_val_t come from outside this cell)
    model = LogisticRegression(**lr_params)
    model.fit(X_train_t, y_train[label])
    original_accuracy = accuracy_score(y_val[label], model.predict(X_val_t))
    print("originalaccuracy:", original_accuracy)

    differences = {}

    for feature in features:
        # Every raw column except the one under test
        cols_to_use = [col for col in X_train.columns if col != feature]

        train_dict = X_train[cols_to_use].to_dict(orient='records')
        val_dict = X_val[cols_to_use].to_dict(orient='records')

        # Re-encode and re-scale from scratch; both are fitted on the training
        # rows only and then applied to the validation rows.
        dv = DictVectorizer(sparse=False)
        X_train_sub = dv.fit_transform(train_dict)
        X_val_sub = dv.transform(val_dict)

        scaler = StandardScaler()
        X_train_sub = scaler.fit_transform(X_train_sub)
        X_val_sub = scaler.transform(X_val_sub)

        # Same settings as the baseline, minus one feature
        model_without = LogisticRegression(**lr_params)
        model_without.fit(X_train_sub, y_train[label])

        accuracy_without_feature = accuracy_score(y_val[label], model_without.predict(X_val_sub))
        # NOTE(review): abs() discards the sign, so a feature whose removal
        # *improves* accuracy ranks the same as one whose removal hurts, while
        # the column/plot label reads as a signed difference -- confirm intent.
        differences[feature] = abs(original_accuracy - accuracy_without_feature)

    # One row per feature, largest accuracy change first
    acc_df = pd.DataFrame(list(differences.items()), columns=['Feature', 'original_accuracy - accuracy_without_feature'])
    acc_df = acc_df.sort_values(by='original_accuracy - accuracy_without_feature', ascending=False)

    # Horizontal bar chart of the per-feature accuracy change for this target
    plt.figure(figsize=(10, 12))
    sns.barplot(x='original_accuracy - accuracy_without_feature', y='Feature', data=acc_df)
    plt.title('original_accuracy - accuracy_without_feature')
    plt.xlabel('accuracy diff')
    plt.ylabel('Feature')
    plt.show()


#### RandomForestClassifier

In [None]:
# Random-forest baseline per target: 10 trees with depth capped at 20.
# Prints validation accuracy, then the mean 5-fold CV score on the training matrix.
for label in labels:
    print(f"Training for {label}")

    target_train = y_train[label]

    rf = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=42)
    rf.fit(X_train_t, target_train)

    rf_pred = rf.predict(X_val_t)
    val_accuracy = accuracy_score(y_val[label], rf_pred)
    print(f"Random Forest Accuracy: {val_accuracy}")

    fold_scores = cross_val_score(rf, X_train_t, target_train, cv=5)
    rf_cv_score = fold_scores.mean()
    print(f"Random Forest Mean CV Score: {rf_cv_score}")

    # Blank separator between targets
    print("\n")

In [None]:
# Exhaustive search over forest size, depth and split/leaf minimums, repeated
# for every target. The grid is the same for each target, so it is built once.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

for label in labels:
    print(f"Training for {label}")

    # Fresh, seeded forest for this target's search
    rf = RandomForestClassifier(random_state=42)

    # 5-fold search on all available cores; verbose=2 logs each fit
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train_t, y_train[label])

    # Report the winning combination
    best_params = grid_search.best_params_
    print(best_params)

#### MultiOutputClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Joint model: one random forest per target column, wrapped in a
# MultiOutputClassifier and fitted on all targets in `y_train` at once.
# NOTE(review): this cell passes X_train (not X_train_t) to the model --
# confirm the frame is fully numeric at this point.
rf = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(estimator=rf, n_jobs=-1)
multi_target_forest.fit(X_train, y_train)

# 5-fold cross-validation on the training data (refits clones of the wrapper)
cv_scores = cross_val_score(multi_target_forest, X_train, y_train, cv=5)
mean_cv_score = cv_scores.mean()
print(f'Mean CV Score: {mean_cv_score}')

# Validation-set predictions for every target
y_pred = multi_target_forest.predict(X_val)

# NOTE(review): presumably these metric calls only accept y_val / y_pred when
# each target column is binary; with more than two classes per target
# scikit-learn is expected to reject the input -- verify.
overall_accuracy = accuracy_score(y_val, y_pred)
print('Accuracy Score:', overall_accuracy)

report = classification_report(y_val, y_pred)
print('Classification Report:\n', report)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ROI baseline on a reduced feature set: select columns, split 60/20/20,
# vectorise + scale, then fit and score a logistic regression.

# Target modelled in this cell, plus every target column that must be kept out
# of the feature matrices.
label = 'numerical_ROI_category'
target_cols = ['numerical_ROI_category', 'numerical_rating_category', 'numerical_award_category']

# Reduced predictor set followed by the three target columns
features_ROI = ['release_month', 'rated', 'collection',
                'overview_vector', 'tagline_vector', 'plot_vector',
                'overview_sentiment',
                'plot_sentiment', 'era',
                'action', 'adventure', 'animation', 'comedy', 'crime', 'drama',
                'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance',
                'science fiction', 'thriller', 'war', 'western',
                'num_production_companies',
                'producer_popularity', 'director_popularity', 'writer_popularity',
                'number_crew_members',
                'average_cast_popularity',
                'numerical_ROI_category', 'numerical_rating_category', 'numerical_award_category']

dataset_df = df[features_ROI].copy()

# 20% test, then 25% of the remaining 80% as validation -> 60/20/20 overall
df_full_train, df_test = train_test_split(dataset_df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Feature frames without any target column; built as new frames, not in place
X_train = df_train.drop(columns=target_cols).reset_index(drop=True)
X_val = df_val.drop(columns=target_cols).reset_index(drop=True)
X_test = df_test.drop(columns=target_cols).reset_index(drop=True)

# Targets get the same fresh index so X_* and y_* rows stay aligned by label
y_train = df_train[label].reset_index(drop=True)
y_val = df_val[label].reset_index(drop=True)
y_test = df_test[label].reset_index(drop=True)

# Encode the row dicts into a dense matrix; the vectoriser is fitted on the
# training rows only. Downstream cells use X_train_t / X_val_t.
dv = DictVectorizer(sparse=False)
X_train_t = dv.fit_transform(X_train.to_dict(orient='records'))
X_val_t = dv.transform(X_val.to_dict(orient='records'))

# Standardise using statistics from the training matrix only
scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train_t)
X_val_t = scaler.transform(X_val_t)

# Fit on all selected features and report validation accuracy
model = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model.fit(X_train_t, y_train)
original_accuracy = accuracy_score(y_val, model.predict(X_val_t))
print("accuracy:", original_accuracy)



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ROI baseline on a reduced feature set that includes 'release_year':
# select columns, split 60/20/20, vectorise + scale, then fit and score a
# logistic regression.

# Target modelled in this cell, plus every target column that must be kept out
# of the feature matrices.
label = 'numerical_ROI_category'
target_cols = ['numerical_ROI_category', 'numerical_rating_category', 'numerical_award_category']

# Reduced predictor set followed by the three target columns
features_ROI = ['release_month', 'release_year', 'rated', 'collection',
                'overview_vector', 'tagline_vector', 'plot_vector',
                'overview_sentiment',
                'plot_sentiment', 'era',
                'action', 'adventure', 'animation', 'comedy', 'crime', 'drama',
                'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance',
                'science fiction', 'thriller', 'war', 'western',
                'num_production_companies',
                'producer_popularity', 'director_popularity', 'writer_popularity',
                'number_crew_members',
                'average_cast_popularity',
                'numerical_ROI_category', 'numerical_rating_category', 'numerical_award_category']

dataset_df = df[features_ROI].copy()

# 20% test, then 25% of the remaining 80% as validation -> 60/20/20 overall
df_full_train, df_test = train_test_split(dataset_df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Feature frames without any target column; built as new frames, not in place
X_train = df_train.drop(columns=target_cols).reset_index(drop=True)
X_val = df_val.drop(columns=target_cols).reset_index(drop=True)
X_test = df_test.drop(columns=target_cols).reset_index(drop=True)

# Targets get the same fresh index so X_* and y_* rows stay aligned by label
y_train = df_train[label].reset_index(drop=True)
y_val = df_val[label].reset_index(drop=True)
y_test = df_test[label].reset_index(drop=True)

# Encode the row dicts into a dense matrix; the vectoriser is fitted on the
# training rows only. Downstream cells use X_train_t / X_val_t.
dv = DictVectorizer(sparse=False)
X_train_t = dv.fit_transform(X_train.to_dict(orient='records'))
X_val_t = dv.transform(X_val.to_dict(orient='records'))

# Standardise using statistics from the training matrix only
scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train_t)
X_val_t = scaler.transform(X_val_t)

# Fit on all selected features and report validation accuracy
model = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model.fit(X_train_t, y_train)
original_accuracy = accuracy_score(y_val, model.predict(X_val_t))
print("accuracy:", original_accuracy)



In [None]:
# Grid search over C and solver for a plain LogisticRegression, using the
# current X_train_t / y_train and scoring by accuracy with 5-fold CV.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)

grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_t, y_train)

# Winning combination and its mean cross-validated accuracy
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best parameters: {best_params}')
print(f'Best cross-validation score: {best_score}')

# Accuracy of the refitted best model on the validation matrix
val_score = grid_search.score(X_val_t, y_val)
print(f'Validation score: {val_score}')


In [None]:
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt
import numpy as np

# Fit a cross-validated Lasso on the current training matrix and inspect which
# encoded features receive a zero coefficient.
lasso = LassoCV(cv=5)
lasso.fit(X_train_t, y_train)
lasso_coef = lasso.coef_

# Column names of the encoded matrix, taken from the fitted DictVectorizer `dv`
feature_names = np.array(dv.get_feature_names_out())

# Horizontal bar per encoded feature, bar length = coefficient
fig, ax = plt.subplots(figsize=(12, 12))
ax.barh(feature_names, lasso_coef)
ax.set_title('Lasso coefficients')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature Name')
plt.show()

# List the encoded features whose coefficient is exactly zero
is_zero = lasso_coef == 0
zero_features = feature_names[is_zero]
print('Features with zero coefficient:', zero_features)
