# <center><b>Movies Success Prediction</b></center>

In [2]:
# import libraries
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.feature_selection import RFE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, roc_curve, auc
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
import joblib
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# load data
df = pd.read_pickle("Movies_After_EDA.pkl")
df

Unnamed: 0,rating,genre,year,votes,country,budget,gross,runtime,month,net_profit,target
0,Adults,Drama,1980,927000,United Kingdom,19000000,46998772,146,6,27998772,1
1,Adults,Adventure,1980,65000,United States,4500000,58853106,104,7,54353106,0
2,Adults Guidance,Action,1980,1200000,United States,18000000,538375067,124,6,520375067,1
3,Adults Guidance,Comedy,1980,221000,United States,3500000,83453539,88,7,79953539,1
4,Adults,Comedy,1980,108000,United States,6000000,39846344,98,7,33846344,1
...,...,...,...,...,...,...,...,...,...,...,...
7583,Adults,Crime,2020,2400,United States,20500000,3661,98,4,-20496339,0
7584,Adults Guidance,Animation,2020,1300,Other,20500000,240663149,110,10,220163149,1
7585,Adults Guidance,Comedy,2020,294,United States,20500000,413378,120,6,-20086622,0
7586,Adults,Drama,2020,35,United States,5000,20205757,78,2,20200757,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7588 entries, 0 to 7587
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   rating      7588 non-null   object
 1   genre       7588 non-null   object
 2   year        7588 non-null   int64 
 3   votes       7588 non-null   int32 
 4   country     7588 non-null   object
 5   budget      7588 non-null   int32 
 6   gross       7588 non-null   int32 
 7   runtime     7588 non-null   int32 
 8   month       7588 non-null   int32 
 9   net_profit  7588 non-null   int32 
 10  target      7588 non-null   int64 
dtypes: int32(6), int64(2), object(3)
memory usage: 474.4+ KB


In [5]:
# creating encoder for categorical features using column transformer
Encoder = ColumnTransformer(transformers = [("OHE" , OneHotEncoder(sparse = False , drop = "first" ) , ["rating" , "country"]) , ("BE" , BinaryEncoder() , ["genre"] )] , remainder = "passthrough")

In [6]:
# splitting dataframe into feature and target variables
x = df.drop(["target"],axis=1)
y = df['target']

In [7]:
# Creating a list of classification models for comparison
models = list()
models.append(("LR" , LogisticRegression()))
models.append(("CART" , DecisionTreeClassifier()))
models.append(("RF" , RandomForestClassifier()))
models.append(("KNN" , KNeighborsClassifier()))
models.append(("XG" , XGBClassifier()))

In [8]:
# comparing the performance of different machine learning models using cross-validation and pipeline without feature selection
for model in models:
    steps = []
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , RobustScaler()))
    steps.append(model)
    pipeline = Pipeline(steps = steps)
    scores = cross_validate(pipeline , x , y , cv = 5 , scoring = "accuracy" , return_train_score = True)
    print(model[0])
    print("Train_accuracy =" , scores["train_score"].mean())
    print("Test_accuracy =" , scores["test_score"].mean())
    print("*" * 50)   

LR
Train_accuracy = 0.7515154363643617
Test_accuracy = 0.7394543005359548
**************************************************
CART
Train_accuracy = 1.0
Test_accuracy = 0.6226910994673455
**************************************************
RF
Train_accuracy = 1.0
Test_accuracy = 0.6614323568724416
**************************************************
KNN
Train_accuracy = 0.8254480085598832
Test_accuracy = 0.7233814746009868
**************************************************
XG
Train_accuracy = 0.9689313686993856
Test_accuracy = 0.49354144465491234
**************************************************


In [9]:
# now i will apply feature selection
RFE_selector = RFE(LogisticRegression(), n_features_to_select = 9)

In [10]:
# comparing the performance of different machine learning models using cross-validation and pipeline with feature selection
for model in models:
    steps = []
    steps.append(("Encoder", Encoder))
    steps.append(("Scaler", RobustScaler()))
    steps.append(("FeatureSelection", RFE_selector))
    steps.append(model)
    pipeline = Pipeline(steps=steps)
    scores = cross_validate(pipeline, x, y, cv = 5, scoring = ["roc_auc","accuracy","f1"] , return_train_score = True)
    print(model[0])
    print("Train accuracy =", scores["train_accuracy"].mean())
    print("Test accuracy =", scores["test_accuracy"].mean())
    print("Test F1-score =", scores["test_f1"].mean())
    print ("AUC score = ", scores['test_roc_auc'].mean())
    print("*" * 50)

LR
Train accuracy = 0.7466392879210506
Test accuracy = 0.7374798398128196
Test F1-score = 0.6807621796152736
AUC score =  0.8268882569421496
**************************************************
CART
Train accuracy = 0.9960135052075969
Test accuracy = 0.6889798793298263
Test F1-score = 0.6553354968141772
AUC score =  0.6860077695933338
**************************************************
RF
Train accuracy = 0.9960135052075969
Test accuracy = 0.735633831073916
Test F1-score = 0.7031401748008472
AUC score =  0.8161093893937942
**************************************************
KNN
Train accuracy = 0.8160252498102493
Test accuracy = 0.7260121781860912
Test F1-score = 0.690782003418611
AUC score =  0.7894881649113745
**************************************************
XG
Train accuracy = 0.8768451685260932
Test accuracy = 0.7493363314148043
Test F1-score = 0.7147477636994486
AUC score =  0.8332573466383556
**************************************************


In [12]:
# defining hyper parameters for tuning
params = {
    'LR__C': [1, 10 , 100],
    'LR__penalty': ['l2'],
    'LR__solver': ['liblinear', 'lbfgs', 'saga'],
    'LR__class_weight': [None, 'balanced'],
    'LR__max_iter': [90, 100, 120]
}

In [13]:
# pipeline for classification using logistic regression, feature encoding using encoder created before, scaling using robust scaler
steps = []
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("FeatureSelection", RFE_selector))
steps.append(("LR" , LogisticRegression()))
pipeline = Pipeline(steps = steps)

In [14]:
# performing grid search cross-validation
grid_search = GridSearchCV(param_grid = params , estimator = pipeline , cv = 5 , scoring = "accuracy"  , return_train_score = True , n_jobs = -1)

In [15]:
grid_search.fit(x,y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Encoder',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('OHE',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         ['rating',
                                                                          'country']),
                                                                        ('BE',
                                                                         BinaryEncoder(),
                                                                         ['genre'])])),
                                       ('Scaler', RobustScaler()),
                                       ('FeatureSelection',
         

In [16]:
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean train score: ", grid_search.cv_results_["mean_train_score"].mean())
print("Mean test score: ", grid_search.cv_results_["mean_test_score"].mean())

Best hyperparameters:  {'LR__C': 1, 'LR__class_weight': 'balanced', 'LR__max_iter': 90, 'LR__penalty': 'l2', 'LR__solver': 'liblinear'}
Mean train score:  0.7466820085413302
Mean test score:  0.7388864329479389


In [17]:
final_model = grid_search.best_estimator_

In [18]:
joblib.dump(final_model , "best_model.pkl")
joblib.dump(x.columns , "input_features.pkl")

['input_features.pkl']

In [19]:
x.net_profit.describe()

count    7.588000e+03
mean     4.597674e+07
std      1.369451e+08
min     -1.580311e+08
25%     -1.257080e+07
50%      1.972180e+06
75%      4.277896e+07
max      1.947484e+09
Name: net_profit, dtype: float64

In [20]:
%%writefile streamlit_app.py    

# importing libraries
import streamlit as st
import pandas as pd 
import joblib
import sklearn
import xgboost
import category_encoders

# loading saved input features and model
Inputs = joblib.load("input_features.pkl")
Model = joblib.load("best_model.pkl")

# function to make predictions
def prediction(rating, genre, year, month, votes, country, budget, gross, runtime, net_profit):
    # creating a test dataframe with the required columns
    test_df = pd.DataFrame(columns = Inputs)
    # assigning input values to the respective columns
    test_df.at[0 , "rating"] = rating
    test_df.at[0 , "genre"] = ", ".join(genre)
    test_df.at[0 , "year"] = year
    test_df.at[0 , "month"] = month
    test_df.at[0 , "votes"] = votes
    test_df.at[0 , "country"] = country
    test_df.at[0 , "budget"] = budget
    test_df.at[0 , "gross"] = gross
    test_df.at[0 , "runtime"] = runtime
    test_df.at[0 , "net_profit"] = net_profit
    # making prediction using the loaded model
    result = Model.predict(test_df)[0]
    return result

# main function to create the Streamlit app
def main():
    # setting the app title
    st.title("Movie Success Predictor")
    # creating input widgets for user input
    rating = st.radio("Rating" , ['Adults', 'Adults Guidance', 'All Audiance', 'Not Rated'])
    genre = st.multiselect("Genre" , ['Drama', 'Adventure', 'Action', 'Comedy', 'Horror', 'Biography',
       'Crime', 'Other', 'Animation'])
    votes = st.slider("Votes" , min_value = 0 , max_value = 2400000 , value = 0 ,step = 100)
    year = st.slider("Year" , min_value = 1980 , max_value = 2020 , value = 0 ,step = 1)
    month = st.slider("Month" , min_value = 1 , max_value = 12 , value = 0 ,step = 1)
    country = st.selectbox("Country" ,['United Kingdom', 'United States', 'Other', 'Canada', 'France',
       'Germany'] )
    budget = st.slider( "Budget" , min_value = 3000 , max_value = 356000000 , value = 0 , step = 1000)
    gross = st.slider( "Gross" , min_value = 309 , max_value = 356000000 , value = 0 , step = 1000)
    runtime = st.slider( "Runtime" , min_value = 63 , max_value = 366 , value = 0 , step = 5)
    net_profit = st.slider( "Net Profit" , min_value = -158031100 , max_value = 1947484000  , value = -159031100 , step = 10000)
    # predict button
    if st.button("Predict"):
        # calling the prediction function with user inputs
        results = prediction(rating, genre, year, month,  votes, country, budget, gross, runtime, net_profit)
        label = ["Unsuccessful" , "Successful"]
        st.text(f"The Movie will be {label[results]}.")
        
if __name__ == '__main__':
    main()    
    

Overwriting streamlit_app.py


In [None]:
!streamlit run streamlit_app.py