In [1]:
# import libraries
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_selection import RFE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import joblib
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# load data
df = pd.read_pickle("Movies_After_EDA.pkl")
df

Unnamed: 0,rating,genre,year,votes,country,budget,gross,runtime,month,net_profit,target
0,Adults,Drama,1980,927000.0,United Kingdom,19000000.0,46998772.0,146.0,6.0,27998772.0,1
1,Adults,Adventure,1980,65000.0,United States,4500000.0,58853106.0,104.0,7.0,54353106.0,0
2,Adults Guidance,Action,1980,1200000.0,United States,18000000.0,538375067.0,124.0,6.0,520375067.0,1
3,Adults Guidance,Comedy,1980,221000.0,United States,3500000.0,83453539.0,88.0,7.0,79953539.0,1
4,Adults,Comedy,1980,108000.0,United States,6000000.0,39846344.0,98.0,7.0,33846344.0,1
...,...,...,...,...,...,...,...,...,...,...,...
7582,Adults,Crime,2020,2400.0,United States,20500000.0,3661.0,98.0,4.0,-20496339.0,0
7583,Adults Guidance,Animation,2020,1300.0,Other,20500000.0,240663149.0,110.0,10.0,220163149.0,1
7584,Adults Guidance,Comedy,2020,294.0,United States,20500000.0,413378.0,120.0,6.0,-20086622.0,0
7585,Adults,Drama,2020,35.0,United States,5000.0,20205757.0,78.0,2.0,20200757.0,1


In [3]:
df.isnull().sum()

rating        0
genre         0
year          0
votes         0
country       0
budget        0
gross         0
runtime       0
month         0
net_profit    0
target        0
dtype: int64

In [4]:
# creating encoder for categorical features using column transformer
Encoder = ColumnTransformer(transformers = [("OHE" , OneHotEncoder(sparse = False , drop = "first" ) , ["rating" , "country"]) , ("BE" , BinaryEncoder() , ["genre"] )] , remainder = "passthrough")

In [5]:
# splitting dataframe into feature and target variables
x = df.drop(["target"],axis=1)
y = df['target']

In [6]:
# Creating a list of classification models for comparison
models = list()
models.append(("LR" , LogisticRegression()))
models.append(("CART" , DecisionTreeClassifier()))
models.append(("RF" , RandomForestClassifier()))
models.append(("KNN" , KNeighborsClassifier()))
models.append(("XG" , XGBClassifier()))

In [7]:
# comparing the performance of different machine learning models using cross-validation and pipeline without feature selection
for model in models:
    steps = []
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , RobustScaler()))
    steps.append(model)
    pipeline = Pipeline(steps = steps)
    scores = cross_validate(pipeline , x , y , cv = 5 , scoring = "accuracy" , return_train_score = True)
    print(model[0])
    print("Train_accuracy =" , scores["train_score"].mean())
    print("Test_accuracy =" , scores["test_score"].mean())
    print("*" * 50)   

LR
Train_accuracy = 0.7513841454791045
Test_accuracy = 0.7399503909578142
**************************************************
CART
Train_accuracy = 1.0
Test_accuracy = 0.620131873896455
**************************************************
RF
Train_accuracy = 0.9999670510708402
Test_accuracy = 0.6648197894221225
**************************************************
KNN
Train_accuracy = 0.8254581157979229
Test_accuracy = 0.7229517380100624
**************************************************
XG
Train_accuracy = 0.9701793623738866
Test_accuracy = 0.5008450559882162
**************************************************


In [8]:
RFE_selector = RFE(LogisticRegression(), n_features_to_select = 9)

In [9]:
# comparing the performance of different machine learning models using cross-validation and pipeline with feature selection
for model in models:
    steps = []
    steps.append(("Encoder", Encoder))
    steps.append(("Scaler", RobustScaler()))
    steps.append(("FeatureSelection", RFE_selector))
    steps.append(model)
    pipeline = Pipeline(steps=steps)
    scores = cross_validate(pipeline, x, y, cv=5, scoring="accuracy", return_train_score=True)
    print(model[0])
    print("Train_accuracy =", scores["train_score"].mean())
    print("Test_accuracy =", scores["test_score"].mean())

LR
Train_accuracy = 0.746672155440333
Test_accuracy = 0.7374487473108894
CART
Train_accuracy = 0.9960129732676093
Test_accuracy = 0.685249126500452
RF
Train_accuracy = 0.9959800189093955
Test_accuracy = 0.7349418057795576
KNN
Train_accuracy = 0.8160671552272426
Test_accuracy = 0.7255856550660368
XG
Train_accuracy = 0.8759064497976727
Test_accuracy = 0.752207524211766


In [10]:
params = {
    'LR__C': [ 1, 10 , 100],
    'LR__penalty': ['l2'],
    'LR__solver': ['liblinear', 'lbfgs', 'saga'],
    'LR__class_weight': [None ,'balanced'],
    'LR__max_iter': [90, 100, 120]
}

In [11]:
# pipeline for classification using logistic regression, feature encoding using encoder created before, scaling using robust scaler
steps = []
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("FeatureSelection", RFE_selector))
steps.append(("LR" , LogisticRegression()))
pipeline = Pipeline(steps = steps)

In [12]:
# performing grid search cross-validation
grid_search = GridSearchCV(param_grid = params , estimator = pipeline , cv = 5 , scoring = "accuracy" , return_train_score = True , n_jobs = -1)

In [13]:
grid_search.fit(x,y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Encoder',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('OHE',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         ['rating',
                                                                          'country']),
                                                                        ('BE',
                                                                         BinaryEncoder(),
                                                                         ['genre'])])),
                                       ('Scaler', RobustScaler()),
                                       ('FeatureSelection',
         

In [14]:
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean train score: ", grid_search.cv_results_["mean_train_score"].mean())
print("Mean test score: ", grid_search.cv_results_["mean_test_score"].mean())

Best hyperparameters:  {'LR__C': 1, 'LR__class_weight': 'balanced', 'LR__max_iter': 90, 'LR__penalty': 'l2', 'LR__solver': 'liblinear'}
Mean train score:  0.7466519714237515
Mean test score:  0.7388625972931381


In [15]:
final_model = grid_search.best_estimator_

In [16]:
joblib.dump(final_model , "best_model.pkl")
joblib.dump(x.columns , "input_features.pkl")

['input_features.pkl']

In [17]:
x.month.describe()

count    7587.000000
mean        6.578490
std         3.439163
min         1.000000
25%         4.000000
50%         7.000000
75%        10.000000
max        12.000000
Name: month, dtype: float64

In [18]:
%%writefile streamlit_app.py    

# importing libraries
import streamlit as st
import pandas as pd 
import joblib
import sklearn
import xgboost
import category_encoders

# loading saved input features and model
Inputs = joblib.load("input_features.pkl")
Model = joblib.load("best_model.pkl")

# function to make predictions
def prediction(rating, genre, year, month, votes, country, budget, gross, runtime):
    # creating a test dataframe with the required columns
    test_df = pd.DataFrame(columns = Inputs)
    # assigning input values to the respective columns
    test_df.at[0 , "rating"] = rating
    test_df.at[0 , "genre"] = genre
    test_df.at[0,"year"] = year
    test_df.at[0 , "month"] = month
    test_df.at[0 , "votes"] = votes
    test_df.at[0 , "country"] = country
    test_df.at[0 , "budget"] = budget
    test_df.at[0 , "gross"] = gross
    test_df.at[0 , "runtime"] = runtime
    # displaying the test dataframe
    st.dataframe(test_df)
    # making prediction using the loaded model
    result = Model.predict(test_df)[0]
    return result

# main function to create the Streamlit app
def main():
    # setting the app title
    st.title("Movie Success Predictor")
    # creating input widgets for user input
    rating = st.radio("Rating" , ['Adults', 'Adults Guidance', 'All Audiance', 'Not Rated'])
    genre = st.multiselect("Genre" , ['Drama', 'Adventure', 'Action', 'Comedy', 'Horror', 'Biography',
       'Crime', 'Other', 'Animation'])
    votes = st.slider("Votes" , min_value = 0.0 , max_value = 2400000.0 , value = 0.0 ,step = 100.0)
    year = st.slider("Year" , min_value = 1980 , max_value = 2020 , value = 0 ,step = 1)
    month = st.slider("Month" , min_value = 1 , max_value = 12 , value = 0 ,step = 1)
    country = st.selectbox("Country" ,['United Kingdom', 'United States', 'Other', 'Canada', 'France',
       'Germany'] )
    budget = st.slider( "Budget" , min_value = 3000.0 , max_value = 356000000.0 , value = 0.0 , step = 1000.0)
    gross = st.slider( "Gross" , min_value = 309.0 , max_value = 356000000.0 , value = 0.0 , step = 1000.0)
    runtime = st.slider( "Runtime" , min_value = 63.0 , max_value = 366.0 , value = 0.0 , step = 5.0)
    # predict button
    if st.button("Predict"):
        # calling the prediction function with user inputs
        results = prediction(rating, genre, year, month,  votes, country, budget, gross, runtime)
        label = ["Unsuccessful" , "Successful"]
        st.text(f"The Movie will be {label[results]}.")
        
if __name__ == '__main__':
    main()    
    

Overwriting streamlit_app.py


In [None]:
!streamlit run streamlit_app.py