In [4]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np
import ast
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import nltk
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import heapq
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

plt.style.use('seaborn-v0_8')
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

df = pd.read_json('movies.json', orient='records')

In [9]:
class GenreTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.genre_map = {
            28: 'Action',
            12: 'Adventure',
            16: 'Animation',
            35: 'Comedy',
            80: 'Crime',
            99: 'Documentary',
            18: 'Drama',
            10751: 'Family',
            14: 'Fantasy',
            36: 'History ',
            27: 'Horror',
            10402: 'Music',
            9648: 'Mystery',
            10749: 'Romance',
            878: 'ScienceFiction',
            10770: 'TvMovie',
            53: 'Thriller',
            10752: 'War',
            37: 'Western'
        }

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        
        X_modified = X.copy(deep=True);
        genres = list(self.genre_map.values())
        X_modified[list(genres)] = 0
        
        for i in range(len(X_modified)):
            genres = ast.literal_eval(X_modified.iloc[i]['GenreIds'])
            for genre in genres:
                X_modified.loc[i, self.genre_map[genre]] = 1
        
        # dropping the GenreIds
        X_modified.drop('GenreIds', axis=1, inplace=True)
        X_modified.drop('Id', axis=1, inplace=True)

        # eliminating the column ProductionCompanies
        X_modified.drop('ProductionCompanies', axis=1, inplace=True)

        return X_modified

# ---------------------------------------------------------------

class LanguageTransformer(BaseEstimator, TransformerMixin):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_modified = X.copy(deep=True);
        language_counts = X_modified['OriginalLanguage'].value_counts()
        X_modified['OriginalLanguage'] = X_modified['OriginalLanguage'].map(language_counts)
        X_modified.drop('SpokenLanguages', axis=1, inplace=True)
        return X_modified

# ---------------------------------------------------------------

class CountryTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.country_region = {
            'United States of America': 'North America',
            'Italy': 'Europe',
            'Spain': 'Europe',
            'South Korea': 'Asia',
            'China': 'Asia',
            'India': 'Asia',
            'Canada': 'North America',
            'France': 'Europe',
            'Australia': 'Oceania',
            'United Kingdom': 'Europe',
            'Germany': 'Europe',
            'Brazil': 'South America',
            'Mexico': 'North America',
            'Japan': 'Asia',
            'Russia': 'Europe',
            'Sweden': 'Europe',
            'United Arab Emirates': 'Asia',
            'Nigeria': 'Africa',
            'Poland': 'Europe',
            'Serbia': 'Europe',
            'Ukraine': 'Europe',
            'Thailand': 'Asia',
            'Finland': 'Europe',
            'Norway': 'Europe',
            'Switzerland': 'Europe',
            'Bangladesh': 'Asia',
            'Austria': 'Europe',
            'Kazakhstan': 'Asia',
            'Belgium': 'Europe',
            'Hong Kong': 'Asia',
            'Cyprus': 'Europe',
            'Greece': 'Europe',
            'Denmark': 'Europe',
            'Ireland': 'Europe',
            'New Zealand': 'Oceania',
            'Chile': 'South America',
            'Philippines': 'Asia',
            'Singapore': 'Asia',
            'Taiwan': 'Asia',
            'Puerto Rico': 'North America',
            'Iceland': 'Europe',
            'Argentina': 'South America',
            'Czech Republic': 'Europe',
            'Colombia': 'South America',
            'Peru': 'South America',
            'Bulgaria': 'Europe',
            'Netherlands': 'Europe',
            'Hungary': 'Europe',
            'South Africa': 'Africa',
            'Latvia': 'Europe',
            'Dominican Republic': 'North America',
            'Uruguay': 'South America',
            'Venezuela': 'South America',
            'Malta': 'Europe',
            'Turkey': 'Asia',
            'Saudi Arabia': 'Asia',
            'Portugal': 'Europe',
            'Morocco': 'Africa',
            'Slovenia': 'Europe',
            'Israel': 'Asia',
            'Luxembourg': 'Europe',
            'Indonesia': 'Asia',
            'Panama': 'North America',
            'Bolivia': 'South America',
            'Romania': 'Europe',
            'Guadaloupe': 'North America',
            'Iran': 'Asia',
            'Costa Rica': 'North America',
            'Honduras': 'North America',
            'Albania': 'Europe',
            'Jordan': 'Asia',
            'Pakistan': 'Asia',
            'Lithuania': 'Europe',
            'Vietnam': 'Asia',
            'Malawi': 'Africa',
            'Soviet Union': 'Europe',
            'Estonia': 'Europe',
            'Botswana': 'Africa',
            'Paraguay': 'South America',
            'Yugoslavia': 'Europe',
            'Georgia': 'Asia',
            'Slovakia': 'Europe',
            'Malaysia': 'Asia',
            'Mauritius': 'Africa',
            'Guatemala': 'North America',
            'Macao': 'Asia',
            'Jamaica': 'North America',
            'Lebanon': 'Asia',
            'Qatar': 'Asia',
            'Zimbabwe': 'Africa',
            'Egypt': 'Africa',
            'Senegal': 'Africa',
            'Czechoslovakia': 'Europe',
            'East Germany': 'Europe',
            'Kenya': 'Africa',
            'Solomon Islands': 'Oceania',
            'Cambodia': 'Asia',
            'Iraq': 'Asia',
            'Tunisia': 'Africa',
            'Ecuador': 'South America',
            'Croatia': 'Europe',
            'Liechtenstein': 'Europe',
            'Namibia': 'Africa',
            'Ghana': 'Africa',
            'Bahamas': 'North America',
            'Aruba': 'North America',
            'Moldova': 'Europe'
        }

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_modified = X.copy(deep=True);
        X_modified['Country'] = X_modified['ProductionCountries'].apply(lambda x: ast.literal_eval(x)[0]['name'] if x and ast.literal_eval(x) else None)
        X_modified['Region'] = X_modified['Country'].map(self.country_region)

        X_modified['Region'].fillna(X_modified['Region'].mode().iloc[0], inplace=True)

        X_modified.drop(['ProductionCountries', 'Country'], axis=1, inplace=True)
        X_modified.dropna(subset=['Overview'], inplace=True)
        X_modified.dropna(subset=['ReleaseDate'], inplace=True)
        X_modified['ReleaseDate'] = pd.to_datetime(X_modified['ReleaseDate'])
        
        return X_modified

# ---------------------------------------------------------------

class OtherStepsTransformer(BaseEstimator, TransformerMixin):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_modified = X.copy(deep=True);
        # Feature Engineer columns from ReleaseDate
        X_modified['ReleaseYear'] = X_modified['ReleaseDate'].dt.year
        X_modified['ReleaseMonth'] = X_modified['ReleaseDate'].dt.month

        def map_to_decade(year):
            if np.isnan(year):  # Handling NaN values
                return np.nan
            return int(10 * (year // 10))

        # Apply the function to create a new column 'Decade'
        X_modified['Decade'] = X_modified['ReleaseYear'].apply(map_to_decade)

        X_modified.drop('ReleaseDate', axis=1, inplace=True)

        
        return X_modified
    
# ---------------------------------------------------------------

class ImputeBugetTransformer(BaseEstimator, TransformerMixin):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_modified = X.copy(deep=True);
        # Simple linear regression to impute the rows with 0 Budget using VoteCount

        df_with_zeros = X_modified[X_modified['Budget'] == 0]
        df_without_zeros = X_modified[X_modified['Budget'] > 0]

        X_train = df_without_zeros[['VoteCount']]
        y_train = df_without_zeros['Budget']

        X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

        model = LinearRegression()

        model.fit(X_train, y_train)

        predicted_budget = model.predict(df_with_zeros[['VoteCount']])

        X_modified.loc[df_with_zeros.index, 'Budget'] = predicted_budget

        return X_modified

# ---------------------------------------------------------------

class PopularityTransformer(BaseEstimator, TransformerMixin):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_modified = X.copy(deep=True);
        X_modified['Popularity_cat'] = np.digitize(X_modified['Popularity'], bins=[16, 30, 53])
        return X_modified

# ---------------------------------------------------------------

class TrimmedTransformer(BaseEstimator, TransformerMixin):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_modified = X.copy(deep=True);
        # method to return a copy of a data frame after trimming the numerical valiables

        def get_trimmed_data(data, numerical_columns):
            result = data.copy()
            for col in numerical_columns:
                upper_q = np.quantile(data[col], 0.75)
                lower_q = np.quantile(data[col], 0.25)
                IQR = upper_q - lower_q
                upper_whisker = upper_q + (1.5 * IQR)
                lower_whisker = lower_q - (1.5 * IQR)
                indexes = result[(result[col] > upper_whisker) | (result[col] < lower_whisker)].index
                result.drop(indexes, inplace=True)
            return result

        # Function to apply log tranformation to data, returns a copy with transformed columns
        def get_log_transformed_data(data, numerical_columns):
            result = data.copy()
            for col in numerical_columns:
                result[col] = result[col].map(lambda x: np.log(x) if x > 0 else 0)
            return result

        movies_data_trimmed = get_trimmed_data(X_modified, ['RunTime'])
        X_modified = get_log_transformed_data(movies_data_trimmed, ['Budget'])

        return X_modified

# ---------------------------------------------------------------

class SummarizeOverviewTransformer(BaseEstimator, TransformerMixin):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_modified = X.copy(deep=True);
        # Method to summarize the overview column
        def summarize_overview(overview_text):
            # Check if the overview_text is a valid string
            if isinstance(overview_text, str):
                # Preprocessing the data
                clean_text = re.sub(r'\W', ' ', overview_text.lower())
                clean_text = re.sub(r'\d', ' ', clean_text)
                clean_text = re.sub(r'\s+', ' ', clean_text)

                # Tokenize sentences
                sentences = nltk.sent_tokenize(clean_text)

                # Stopword list
                stop_words = nltk.corpus.stopwords.words('english')

                # Word counts
                word2count = {}
                for word in nltk.word_tokenize(clean_text):
                    if word not in stop_words:
                        if word not in word2count.keys():
                            word2count[word] = 1
                        else:
                            word2count[word] += 1

                # Converting counts to weights
                max_count = max(word2count.values())
                for key in word2count.keys():
                    word2count[key] = word2count[key] / max_count

                # Product sentence scores
                sent2score = {}
                for sentence in sentences:
                    for word in nltk.word_tokenize(sentence.lower()):
                        if word in word2count.keys():
                            if len(sentence.split(' ')) < 25:
                                if sentence not in sent2score.keys():
                                    sent2score[sentence] = word2count[word]
                                else:
                                    sent2score[sentence] += word2count[word]

                # Find the top sentence to use as a summary
                if sent2score:
                    summary_sentence = heapq.nlargest(1, sent2score, key=sent2score.get)[0]
                    return summary_sentence
                else:
                    return clean_text  # Use the entire cleaned text as the summary

            else:
                return overview_text  # If not a string, return the original value

        # Apply the summarization function to rows with null 'TagLine'
        null_tagline_rows = X_modified[X_modified['TagLine'].isnull()]

        # Apply the summarize_overview function to 'Overview'
        summaries = X_modified.loc[null_tagline_rows.index, 'Overview'].apply(summarize_overview)

        # Replace null values in 'TagLine' with the summarized values
        X_modified.loc[null_tagline_rows.index, 'TagLine'] = summaries

        # Removing OriginalTitle as it is same as Title but in original language
        X_modified.drop(['OriginalTitle'], axis=1, inplace=True)

        return X_modified

# ---------------------------------------------------------------

class RemoveColumnsTransformer(BaseEstimator, TransformerMixin):

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_modified = X.copy(deep=True);
        X_modified.drop(['Overview', 'Title', 'TagLine'], axis=1, inplace=True)
        return X_modified

# ---------------------------------------------------------------

class ScaleTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.standardscalerBudget = MinMaxScaler()
        self.standardscalerRunTime = MinMaxScaler()
        self.standardscalerRevenue = MinMaxScaler()
        self.onehotencoder = OneHotEncoder()

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_modified = X.copy(deep=True)
        
        encoded = self.onehotencoder.fit_transform(X_modified[['Region']]).toarray()
        encoded_df = pd.DataFrame(encoded, columns=self.onehotencoder.get_feature_names_out(['Region']), index=X_modified.index)
        X_modified = pd.concat([X_modified, encoded_df], axis=1)
        X_modified.drop(['Region'], axis=1, inplace=True)


        X_modified[['Budget']] = self.standardscalerBudget.fit_transform(X_modified[['Budget']])
        X_modified[['RunTime']] = self.standardscalerRunTime.fit_transform(X_modified[['RunTime']])
        X_modified[['Revenue']] = self.standardscalerRevenue.fit_transform(X_modified[['Revenue']])
        return X_modified

# --------------------------------------------------------------

column_transformer = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[ ('scaler', MinMaxScaler()) ]), ['Budget', 'RunTime', 'Revenue']),
    ('cat', Pipeline(steps=[ ('one-hot', OneHotEncoder()) ]), ['Region'])
])

pipeline = Pipeline([
    ('genre_transformer', GenreTransformer()),
    ('language_column', LanguageTransformer()),
    ('country_transformer', CountryTransformer()),
    ('OtherStepsTransformer', OtherStepsTransformer()),
    ('ImputeBugetTransformer', ImputeBugetTransformer()),
    ('PopularityTransformer', PopularityTransformer()),
    ('TrimmedTransformer', TrimmedTransformer()),
    ('SummarizeOverviewTransformer', SummarizeOverviewTransformer()),
    ('RemoveColumnsTransformer', RemoveColumnsTransformer()),
    ('column_transformer', ScaleTransformer())
])

df = pd.read_json('movies.json', orient='records')
transformed_df = pipeline.fit_transform(df)

y = transformed_df['Popularity_cat']
X = transformed_df.drop('Popularity_cat', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [11]:
# Import libraries
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, make_scorer, average_precision_score, classification_report

In [12]:

svc_clf = SVC(probability=True, random_state=42, decision_function_shape='ovo')
svc_clf.fit(X_train, y_train)

In [13]:
y_test_pred = svc_clf.predict(X_test)
y_train_pred = svc_clf.predict(X_train)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

print('accuracy', accuracy);
print('precision', precision);
print('recall', recall);
print('f1', f1);

accuracy 0.49270664505672607
precision 0.3499118752655963
recall 0.49270664505672607
f1 0.3651669425550771


In [19]:
df_new = pd.read_json('movies.json', orient='records')
# transformed_df = pipeline.fit_transform(df)

# y = transformed_df['Popularity_cat']
# X = transformed_df.drop('Popularity_cat', axis=1)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df_new.head(1)

Unnamed: 0,GenreIds,Id,OriginalLanguage,OriginalTitle,Overview,Popularity,ReleaseDate,Title,VoteAverage,VoteCount,Budget,ProductionCompanies,ProductionCountries,SpokenLanguages,TagLine,RunTime,Revenue
0,"[28, 12, 53]",299054,en,Expend4bles,Armed with every weapon they can get their han...,3741.062,2023-09-15,Expend4bles,6.4,364,100000000,"[{'id': 1020, 'logo_path': '/kuUIHNwMec4dwOLgh...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'english_name': 'English', 'iso_639_1': 'en'...",They'll die when they're dead.,103,30000000


In [33]:
new_df = pd.DataFrame({
    'GenreIds': ['[28, 12, 53]'],
    'Id': [0],
    'OriginalLanguage': ['en'],
    'OriginalTitle': ['Expend4bles'], 
    'Overview': ['Armed with every weapon they can get their han'],
    'Popularity': ['3741.062'], 
    'ReleaseDate': ['2023-09-15'], 
    'Title': ['Expend4bles'], 
    'VoteAverage': [6.4],
    'VoteCount': [364],
    'Budget': [100000000],
    'ProductionCompanies': ['[]'],
    'ProductionCountries': ['[]'],
    'SpokenLanguages': ['[english]'],
    'TagLine': ["They'll die when they're dead."],
    'RunTime': [103],
    'Revenue': [30000000],
    'Region': ['North America'],
})
new_transformed = pipeline.transform(new_df)
y_new_data = svc_clf.predict(new_transformed.drop('Revenue', axis=1))

IndexError: single positional indexer is out-of-bounds