In [68]:
# import libraries
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import joblib
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("default")


In [69]:
# load data
df = pd.read_pickle("Movies_After_EDA.pkl")
df

Unnamed: 0,rating,genre,year,votes,country,budget,gross,runtime,Target
0,adults,Drama,1980,927000.0,United Kingdom,19000000.0,46998772.0,146.0,1
1,adults,Adventure,1980,65000.0,United States,4500000.0,58853106.0,104.0,0
2,adults guidance,Action,1980,1200000.0,United States,18000000.0,538375067.0,124.0,1
3,adults guidance,Comedy,1980,221000.0,United States,3500000.0,83453539.0,88.0,1
4,adults,Comedy,1980,108000.0,United States,6000000.0,39846344.0,98.0,1
...,...,...,...,...,...,...,...,...,...
7572,adults,Crime,2020,2400.0,United States,20500000.0,3661.0,98.0,0
7573,adults guidance,Animation,2020,1300.0,Other,20500000.0,240663149.0,110.0,1
7574,adults guidance,Comedy,2020,294.0,United States,20500000.0,413378.0,120.0,0
7575,adults,Drama,2020,35.0,United States,5000.0,20205757.0,78.0,1


In [70]:
# creating encoder for categorical features using column transformer
Encoder = ColumnTransformer(transformers = [("OHE" , OneHotEncoder(sparse = False , drop = "first" ) , ["rating" , "country"]) , ("BE" , BinaryEncoder() , ["genre"] )] , remainder = "passthrough")

In [71]:
# splitting dataframe into feature and target variables
x = df.drop(["Target"],axis=1)
y = df['Target']

In [72]:
x

Unnamed: 0,rating,genre,year,votes,country,budget,gross,runtime
0,adults,Drama,1980,927000.0,United Kingdom,19000000.0,46998772.0,146.0
1,adults,Adventure,1980,65000.0,United States,4500000.0,58853106.0,104.0
2,adults guidance,Action,1980,1200000.0,United States,18000000.0,538375067.0,124.0
3,adults guidance,Comedy,1980,221000.0,United States,3500000.0,83453539.0,88.0
4,adults,Comedy,1980,108000.0,United States,6000000.0,39846344.0,98.0
...,...,...,...,...,...,...,...,...
7572,adults,Crime,2020,2400.0,United States,20500000.0,3661.0,98.0
7573,adults guidance,Animation,2020,1300.0,Other,20500000.0,240663149.0,110.0
7574,adults guidance,Comedy,2020,294.0,United States,20500000.0,413378.0,120.0
7575,adults,Drama,2020,35.0,United States,5000.0,20205757.0,78.0


In [73]:
# Creating a list of classification models for comparison
models = list()
models.append(("LR" , LogisticRegression()))
models.append(("CART" , DecisionTreeClassifier()))
models.append(("RF" , RandomForestClassifier()))
models.append(("KNN" , KNeighborsClassifier()))
models.append(("XG" , XGBClassifier()))

In [74]:
# comparing the performance of different machine learning models using cross-validation and pipeline
for model in models:
    steps = []
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , RobustScaler()))
    steps.append(model)
    pipeline = Pipeline(steps = steps)
    scores = cross_validate(pipeline , x , y , cv = 5 , scoring = "accuracy" , return_train_score = True)
    print(model[0])
    print("Train_accuracy =" , scores["train_score"].mean())
    print("Test_accuracy =" , scores["test_score"].mean())
    print("*" * 50)   

LR
Train_accuracy = 0.751979879473456
Test_accuracy = 0.7388153643860427
**************************************************
CART
Train_accuracy = 1.0
Test_accuracy = 0.62398303682611
**************************************************
RF
Train_accuracy = 1.0
Test_accuracy = 0.6605521739509044
**************************************************


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN
Train_accuracy = 0.8310017461863989
Test_accuracy = 0.7224528679780906
**************************************************
XG
Train_accuracy = 0.9558206131645983
Test_accuracy = 0.4894972003796686
**************************************************


In [94]:
# defining hyperparameters to tune
# params = {
#     'XG__learning_rate': [0.5, 0.6, 0.7],
#     'XG__n_estimators': [160, 170, 180],
#     'XG__reg_alpha': [0.6, 0.7, 0.8]
# }
params = {
    'LR__C': [0.01, 1, 100],
    'LR__penalty': ['l2'],
    'LR__solver': ['liblinear', 'lbfgs', 'sag', 'saga'],
    'LR__class_weight': [None, 'balanced'],
    'LR__max_iter': [100, 200, 500, 1000, 2000]


In [95]:
# pipeline for classification using XGBclassifier, feature encoding using encoder created before, scaling using robust scaler
steps = []
steps.append(("Encoder" , Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("LR" , LogisticRegression()))
pipeline = Pipeline(steps = steps)

In [96]:
# performing grid search cross-validation
grid_search = GridSearchCV(param_grid = params , estimator = pipeline , cv = 5 , scoring = "accuracy" , return_train_score = True , n_jobs = -1)

In [97]:
grid_search.fit(x,y)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Encoder',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('OHE',
                                                                         OneHotEncoder(drop='first',
                                                                                       sparse=False),
                                                                         ['rating',
                                                                          'country']),
                                                                        ('BE',
                                                                         BinaryEncoder(),
                                                                         ['genre'])])),
                                       ('Scaler', RobustScaler()),
                                       ('LR', LogisticRegression())])

In [98]:
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean train score: ", grid_search.cv_results_["mean_train_score"].mean())
print("Mean test score: ", grid_search.cv_results_["mean_test_score"].mean())

Best hyperparameters:  {'LR__C': 100, 'LR__class_weight': None, 'LR__max_iter': 100, 'LR__penalty': 'l2', 'LR__solver': 'sag'}
Mean train score:  0.7508071885299413
Mean test score:  0.7361470061913844


In [99]:
final_model = grid_search.best_estimator_

In [100]:
joblib.dump(final_model , "best_model.pkl")
joblib.dump(x.columns , "input_features.pkl")

['input_features.pkl']

In [114]:
df['runtime'].unique().max()

366.0

In [115]:
%%writefile streamlit_app.py

# importing libraries
import streamlit as st
import pandas as pd 
import joblib
import sklearn
import xgboost
import category_encoders

# loading saved input features and model
Inputs = joblib.load("input_features.pkl")
Model = joblib.load("best_model.pkl")

# function to make predictions
def prediction(rating, genre, year, votes, country, budget, gross, runtime):
    # creating a test dataframe with the required columns
    test_df = pd.DataFrame(columns = Inputs)
    # assigning input values to the respective columns
    test_df.at[0 , "rating"] = rating
    test_df.at[0 , "genre"] = genre
    test_df.at[0,"year"] = year
    # test_df.at[0 , "score"] = score
    test_df.at[0 , "votes"] = votes
    test_df.at[0 , "country"] = country
    test_df.at[0 , "budget"] = budget
    test_df.at[0 , "gross"] = gross
    test_df.at[0 , "runtime"] = runtime
    displaying the test dataframe
    st.dataframe(test_df)
    # making prediction using the loaded model
    result = Model.predict(test_df)[0]
    return result

# main function to create the Streamlit app
def main():
    # setting the app title
    st.title("Movie Success Predictor")
    # creating input widgets for user input
    rating = st.selectbox("Rating" , ['adults', 'adults guidance', 'all audiance', 'not rated'])
    genre = st.selectbox("Genre" , ['Drama', 'Adventure', 'Action', 'Comedy', 'Horror', 'Biography',
       'Crime', 'Other', 'Animation'])
    votes = st.slider("Votes" , min_value= 0 , max_value=2400000.0 , value=0,step=100)
    year = st.slider("Year" , min_value= 1980 , max_value=2020 , value=0,step=1)
    country = st.selectbox("Country" ,['United Kingdom', 'United States', 'Other', 'Canada', 'France',
       'Germany'] )
    rest_type_counts = st.selectbox("Number of Restaurant Type " , [1,2])
    budget = st.slider( "Budget" , min_value = 3000.0 , max_value = 356000000.0 , value = 0 , step = 1000)
    gross = st.slider( "Gross" , min_value = 309.0 , max_value = 356000000.0 , value = 0 , step = 1000)
    runtime = st.slider( "Runtime" , min_value = 63.0 , max_value = 366.0 , value = 0 , step = 5)
    # predict button
    if st.button("Predict"):
        # calling the prediction function with user inputs
        results = prediction(rating, genre, year, votes, country, budget, gross, runtime)
        label = ["Unsuccessful" , "Successful"]
        st.text(f"The Movie will be {label[results]}.")
        
if __name__ == '__main__':
    main()    
    

Writing streamlit_app.py
