In [None]:
from sklearn.neighbors import NearestNeighbors
import random
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import skfuzzy as fuzz
from scipy.sparse import hstack

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Load movie data (only the columns used by this notebook)
movie_data = pd.read_csv(
    "movies/updatedmoviedata.csv",
    usecols=['Series_Title', 'Certificate', 'Genre', 'rating', 'Overview', 'Director', 'Cast', 'votes'],
    low_memory=False,
    encoding='ISO-8859-1',
)

# Fill NaN values with empty strings so the text columns can be concatenated
# below without producing NaN
movie_data[['Genre', 'Overview', 'Director', 'Cast']] = movie_data[['Genre', 'Overview', 'Director', 'Cast']].fillna('').astype(str)

# Ensure 'votes' is numeric (unparseable values become NaN) and fill NaNs with the median
movie_data['votes'] = pd.to_numeric(movie_data['votes'], errors='coerce')
movie_data.fillna({'votes': movie_data['votes'].median()}, inplace=True)
# From here on 'votes' holds log(1 + votes); everything derived from it
# (Popularity_Score, scaled_votes) is therefore on the log scale.
movie_data['votes'] = np.log1p(movie_data['votes'])

# Convert 'rating' to numeric and fill NaNs with median
movie_data['rating'] = pd.to_numeric(movie_data['rating'], errors='coerce')
movie_data.fillna({'rating': movie_data['rating'].median()}, inplace=True)

# Popularity score: rating weighted by the movie's share of the maximum (log) vote count
movie_data['Popularity_Score'] = (movie_data['votes'] / movie_data['votes'].max()) * movie_data['rating']
print(movie_data['Popularity_Score'])

# Combine relevant features for TF-IDF processing
movie_data['combined_features'] = movie_data['Genre'] + ' ' + movie_data['Overview'] + ' ' + movie_data['Director'] + ' ' + movie_data['Cast']

# Normalize 'votes' column.
# NOTE: the name `scaler` is rebound later in this file, so this fitted
# instance is only reachable up to that point.
scaler = MinMaxScaler()
movie_data['scaled_votes'] = scaler.fit_transform(movie_data[['votes']])

# Polynomial interaction features on the numeric columns are currently disabled:
# poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
# numeric_features = poly.fit_transform(movie_data[['scaled_votes', 'rating', 'Popularity_Score']])

# TF-IDF Vectorizer for KNN: fitted once, on the full vocabulary of the
# combined text. `tfidf` is reused by get_similar_movies to vectorize queries,
# so it must stay the same vectorizer that produced `feature_vectors`.
tfidf = TfidfVectorizer(stop_words='english')
feature_vectors = tfidf.fit_transform(movie_data['combined_features'])

# KNN Model for Similar Movies (brute-force search with cosine distance)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(feature_vectors)

def get_similar_movies(genre, cast, director, plot, top_n=10):
    """Return the catalogue movies most similar to a described movie.

    The four text arguments are joined into a single query string, vectorized
    with the module-level ``tfidf`` vectorizer and looked up in the
    module-level ``knn_model`` (cosine distance over TF-IDF vectors).

    Args:
        genre: Genre text of the query movie.
        cast: Cast text of the query movie.
        director: Director text of the query movie.
        plot: Plot/overview text of the query movie.
        top_n: Maximum number of neighbours to return. Capped at the number
            of rows in ``movie_data`` because ``kneighbors`` rejects a
            request for more neighbours than fitted samples.

    Returns:
        A new DataFrame (independent of ``movie_data``) with the columns
        Series_Title, Genre, Director, Cast, Overview, rating, votes and
        Popularity_Score, in the order the neighbours are returned by
        ``kneighbors``.
    """
    input_text = genre + ' ' + cast + ' ' + director + ' ' + plot
    input_text_vector = tfidf.transform([input_text])

    # knn_model was fitted on one vector per movie_data row
    n_neighbors = min(top_n, len(movie_data))
    _, indices = knn_model.kneighbors(input_text_vector, n_neighbors=n_neighbors)

    columns = ['Series_Title', 'Genre', 'Director', 'Cast', 'Overview', 'rating', 'votes', 'Popularity_Score']
    # Explicit copy so callers can modify the result without touching
    # movie_data or triggering pandas' SettingWithCopyWarning.
    return movie_data.iloc[indices[0]][columns].copy()
def convert_ratings_to_categories(ratings):
    """Translate numeric ratings into success labels.

    Labels by threshold: below 5 is 'Flop', from 5 up to (not including) 7 is
    'Average', from 7 up to (not including) 9 is 'Hit', and everything else
    is 'Superhit'.

    Args:
        ratings: Iterable of numeric ratings.

    Returns:
        A NumPy array holding one label per input rating, in input order.
    """
    def _label_for(score):
        # Guard clauses walk the thresholds from lowest to highest.
        if score < 5:
            return 'Flop'
        if score < 7:
            return 'Average'
        if score < 9:
            return 'Hit'
        return 'Superhit'

    return np.array([_label_for(score) for score in ratings])
# Genre-Based Audience Classification
# Maps each audience group to the genres counted in its favour.
# Insertion order matters: classify_target_audience resolves ties in favour of
# the group listed first.
audience_genre_map = {
    "Juniors": {"Animation", "Fantasy", "Adventure"},
    "Teenagers": {"Drama", "Romance", "Action", "Sci-Fi", "Horror"},
    "Mid-age": {"Thriller", "Horror", "Family", "Romance", "Biography", "Crime", "War", "Mystery", "Comedy"},
    "Seniors": {"Family", "Comedy", "Biography"}
}

def classify_target_audience(genres):
    """Pick the audience group whose genre set overlaps most with `genres`.

    Args:
        genres: Comma-separated genre names, e.g. "Action, Thriller".
            Whitespace around each name is ignored, so "Action,Thriller"
            and "Action , Thriller" are treated the same. Matching is
            case-sensitive.

    Returns:
        A key of ``audience_genre_map``. When several groups tie (including
        the case where nothing matches, e.g. an empty string), the group that
        appears first in ``audience_genre_map`` is returned.
    """
    # Split on the bare comma and strip, rather than on ", ", so inconsistent
    # spacing in the data does not make every genre miss.
    genre_names = {name.strip() for name in genres.split(",")}
    audience_scores = {group: len(genre_names & genre_set) for group, genre_set in audience_genre_map.items()}
    return max(audience_scores, key=audience_scores.get)  # Highest matching category

# Rule-based audience group for every movie, derived from its genre string
movie_data["Target_Audience"] = movie_data["Genre"].apply(classify_target_audience)

# Encode audience categories
audience_labels = {"Juniors": 0, "Teenagers": 1, "Mid-age": 2, "Seniors": 3}
movie_data['Audience_Label'] = movie_data['Target_Audience'].map(audience_labels)

# Apply Fuzzy C-Means Clustering
features = movie_data[['Popularity_Score', 'rating', 'Audience_Label']]
# Rebinds the module-level `scaler`; it is now fitted on the three clustering features.
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)

# The feature matrix is passed transposed (one column per movie); the second
# return value `u` is the membership matrix indexed as [cluster, movie].
cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
    normalized_features.T, c=4, m=2, error=0.005, maxiter=1000, init=None
)

# Assign Fuzzy Clusters: each movie gets the cluster with the highest membership
cluster_labels = np.argmax(u, axis=0)
# NOTE(review): cluster index i is named after the i-th key of audience_labels
# purely by position; nothing here ties a cluster to the audience code of the
# same number - confirm this naming is intended.
audience_names = list(audience_labels)  # built once instead of once per row
movie_data['Fuzzy_Audience'] = [audience_names[i] for i in cluster_labels]

    
def predict_movie_success(genre, cast, director, overview):
    """Predict the rating and target audience of a described movie.

    Fetches the 75 most similar catalogue movies via ``get_similar_movies``,
    trains an XGBoost regressor on them (multi-hot genre/cast/director plus
    TF-IDF of the overview -> min-max scaled rating), and applies it to the
    described movie. Classification metrics for a 20% hold-out split of the
    similar movies are printed, after bucketing ratings with
    ``convert_ratings_to_categories``.

    Args:
        genre: Genres separated by ", ".
        cast: Cast members separated by ", ".
        director: Directors separated by ", ".
        overview: Plot description.

    Returns:
        Tuple ``(predicted_rating, predicted_audience)``: the predicted rating
        mapped back to the original rating scale, and the audience group
        returned by ``classify_target_audience(genre)``.
    """
    # Work on an independent copy: the text columns below are replaced by
    # lists, which must not leak into (or warn about) the shared movie_data.
    similar_movies = get_similar_movies(genre, cast, director, overview, top_n=75).copy()
    for column in ('Genre', 'Cast', 'Director'):
        similar_movies[column] = similar_movies[column].str.split(', ')

    mlb_genre, mlb_cast, mlb_director = MultiLabelBinarizer(), MultiLabelBinarizer(), MultiLabelBinarizer()
    encoded_genre = mlb_genre.fit_transform(similar_movies['Genre'])
    encoded_cast = mlb_cast.fit_transform(similar_movies['Cast'])
    encoded_director = mlb_director.fit_transform(similar_movies['Director'])

    encoded_categorical = np.hstack([encoded_genre, encoded_cast, encoded_director])
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_overview = tfidf_vectorizer.fit_transform(similar_movies['Overview']).toarray()

    X = np.hstack([encoded_categorical, tfidf_overview])
    y = similar_movies['rating'].values.reshape(-1, 1)
    # Use a scaler local to this call: refitting the module-level `scaler`
    # here would silently overwrite the one fitted on the clustering features.
    rating_scaler = MinMaxScaler()
    y_normalized = rating_scaler.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y_normalized, test_size=0.2, random_state=0)
    xg_reg = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse', verbosity=1)
    xg_reg.fit(X_train, y_train)
    y_test_pred = xg_reg.predict(X_test)

    # Encode the query movie with the encoders fitted on the similar movies,
    # in the same column order as X.
    input_genre, input_cast, input_director = [genre.split(', ')], [cast.split(', ')], [director.split(', ')]
    input_categorical_vector = np.hstack([
        mlb_genre.transform(input_genre), mlb_cast.transform(input_cast), mlb_director.transform(input_director)
    ])
    input_features = np.hstack([input_categorical_vector, tfidf_vectorizer.transform([overview]).toarray()])

    predicted_rating = xg_reg.predict(input_features)
    predicted_rating = rating_scaler.inverse_transform(predicted_rating.reshape(-1, 1))[0][0]
    predicted_audience = classify_target_audience(genre)

    # Evaluate on the hold-out split as a classification problem over rating buckets
    y_test_categories = convert_ratings_to_categories(rating_scaler.inverse_transform(y_test))
    y_test_pred_categories = convert_ratings_to_categories(rating_scaler.inverse_transform(y_test_pred.reshape(-1, 1)))
    accuracy = accuracy_score(y_test_categories, y_test_pred_categories)
    precision = precision_score(y_test_categories, y_test_pred_categories, average='weighted', zero_division=0)
    recall = recall_score(y_test_categories, y_test_pred_categories, average='weighted', zero_division=0)
    f1 = f1_score(y_test_categories, y_test_pred_categories, average='weighted', zero_division=0)

    # Print the metrics
    print(f'Accuracy: {round(accuracy,2)}')
    print(f'Precision: {round(precision,2)}')
    print(f'Recall: {round(recall,2)}')
    print(f'F1-Score: {round(f1,2)}')
    return predicted_rating, predicted_audience
# Example usage: describe a movie and print the model's prediction for it
genre = "Action, Thriller"
director = "Haneef Adeni"
cast = "Unni Mukundan, Siddique, Jagadish"
# Adjacent literals are concatenated by the parser into one overview string
overview = (
    "Victor, a blind man, witnesses the murder of his close friend Wasim. "
    "Despite his blindness, Victor identifies the killer by recognising the "
    "distinct scent of the killer's perfume and vehicle. "
    "Victor's identification leads the cops to suspect that the killer is "
    "Russell, who conspired with Wasim's elder brother Tariq, to carry out "
    "the murder. "
)

predicted_rating, predicted_audience = predict_movie_success(
    genre, cast, director, overview
)

print(f"Predicted Rating: {predicted_rating}")
print(f"Predicted Target Audience: {predicted_audience}")