In [1]:
import numpy as np
import pandas as pd
import ast
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [2]:
def genre(txt):
    """Return the ``"name"`` of every entry in a stringified list of dicts.

    Parameters
    ----------
    txt : str
        Python-literal text such as ``"[{'id': 18, 'name': 'Drama'}]"``.
        A missing cell (``None``, float NaN, or a blank string such as the
        ``""`` produced by ``fillna("")``) is treated as "no entries".

    Returns
    -------
    list
        The ``"name"`` values in their original order; ``[]`` for a missing
        cell.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank ``txt`` is not a valid Python literal.
    KeyError
        If a parsed entry has no ``"name"`` key.
    """
    # NaN is the only float that is not equal to itself.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return []
    # literal_eval("") raises SyntaxError, so blank cells are short-circuited.
    if isinstance(txt, str) and not txt.strip():
        return []
    return [entry["name"] for entry in ast.literal_eval(txt)]

In [3]:
def cast(txt):
    """Collect the ``"name"`` value of each mapping in ``txt``.

    No string parsing happens here: ``txt`` is iterated directly, so it must
    already be an iterable of dict-like entries.

    Returns a list of the ``"name"`` values in iteration order.
    """
    return [member["name"] for member in txt]

In [4]:
def keywords(txt):
    """Return the keyword names stored in a stringified ``{"keywords": [...]}`` dict.

    Parameters
    ----------
    txt : str
        Python-literal text of a dict whose ``"keywords"`` entry is a list of
        dicts, each with a ``"name"`` key. A missing cell (``None``, float
        NaN, or a blank string such as the ``""`` produced by
        ``fillna("")``) is treated as "no keywords".

    Returns
    -------
    list
        The ``"name"`` values in their original order; ``[]`` for a missing
        cell.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank ``txt`` is not a valid Python literal.
    KeyError
        If the parsed dict has no ``"keywords"`` key or an entry has no
        ``"name"`` key.
    """
    # NaN is the only float that is not equal to itself.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return []
    # literal_eval("") raises SyntaxError, so blank cells are short-circuited.
    if isinstance(txt, str) and not txt.strip():
        return []
    parsed = ast.literal_eval(txt)
    return [entry["name"] for entry in parsed["keywords"]]

In [35]:
# Load the interim movie table; the cells below turn it into story.csv.
df = pd.read_csv("../data/interim/new_movies_full.csv")

In [36]:
def seprate(col_name, n=20, func=genre):
    """One-hot encode the ``n`` most frequent labels of a list-valued column.

    Works in place on the notebook-level ``df``:

    1. ``df[col_name]`` is replaced by ``func`` applied to every cell; each
       result is then iterated as a collection of labels.
    2. Label occurrences are counted over all rows and the ``n`` most
       frequent labels are kept (equal counts keep first-seen order).
    3. For every kept label a 0/1 column named after the label is assigned
       to ``df``. An existing column with the same name is overwritten.
    4. ``col_name`` itself is dropped, so calling this twice for the same
       column fails on the second call.

    Parameters
    ----------
    col_name : str
        Column of ``df`` to expand.
    n : int, default 20
        Maximum number of labels turned into indicator columns.
    func : callable, default ``genre``
        Converts one raw cell into an iterable of labels.

    Returns
    -------
    None
    """
    df[col_name] = df[col_name].apply(func)

    counts = {}
    for labels in df[col_name].values:
        for label in labels:
            if label in counts:
                counts[label] += 1
            else:
                counts[label] = 1

    # Highest count first; sorted() is stable, so labels with equal counts
    # stay in the order they were first seen.
    ranked = sorted(counts, key=lambda label: -counts[label])
    top_labels = ranked[:n]

    for label in tqdm(top_labels):
        df[label] = df[col_name].apply(lambda labels: int(label in labels))

    df.drop(columns=[col_name], inplace=True)

# Creating dataset for story-based recommendation

In [37]:
# Replace every missing value in every column with "" so the text cleaning
# below, which iterates over each cell character by character, never
# receives a NaN.
df.fillna("", inplace=True)

In [38]:
import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk

# Characters that preprocessText strips from the text.
punc = string.punctuation
# Make sure the NLTK stopword corpus is available locally.
nltk.download("stopwords")
# English stopwords that preprocessText filters out.
stpWrd = stopwords.words("english")


def preprocessText(txt):
    """Strip punctuation, drop English stopwords and Porter-stem what is left.

    Parameters
    ----------
    txt : str
        Raw text (an empty string yields an empty string).

    Returns
    -------
    str
        The stems of the surviving words joined by single spaces. Words that
        are not purely alphanumeric after punctuation removal are discarded.
    """
    # Drop every character listed in ``punc``.
    stripped = "".join(ch for ch in txt if ch not in punc)

    stemmer = PorterStemmer()
    stems = []
    for word in stripped.split():
        # Compare in lower case: the stopword list is lowercase, so the
        # previous case-sensitive check let capitalised stopwords such as
        # "The" or "And" through to the stemmer.
        if word.lower() not in stpWrd and word.isalnum():
            stems.append(stemmer.stem(word))

    return " ".join(stems)


# Clean the three free-text columns in place with preprocessText.
for text_col in ("original_title", "tagline", "overview"):
    df[text_col] = df[text_col].apply(preprocessText)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\panka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# Expand "genres" into 0/1 columns for at most 20 of its most frequent
# values (the default n), parsed with the default genre().
seprate("genres")

100%|██████████| 19/19 [00:00<00:00, 163.45it/s]


In [40]:
# Expand "keywords" into 0/1 columns for its 100 most frequent values,
# parsed with keywords().
seprate("keywords", 100, keywords)

100%|██████████| 100/100 [00:00<00:00, 142.89it/s]


In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# One vectorizer per text field so each field gets its own vocabulary,
# capped via max_features (100 for title and tagline, 300 for overview).
tf1 = TfidfVectorizer(max_features=100)
tf2 = TfidfVectorizer(max_features=100)
tf3 = TfidfVectorizer(max_features=300)

# Fit each vectorizer on its cleaned column and keep the TF-IDF matrix.
X_title = tf1.fit_transform(df["original_title"])
X_tagline = tf2.fit_transform(df["tagline"])
X_overview = tf3.fit_transform(df["overview"])

In [42]:
# Append the three TF-IDF matrices to df as dense columns. Every vocabulary
# term is suffixed with its source field so the same word coming from
# different fields produces distinct column names.
tfidf_frames = [
    pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out() + suffix)
    for matrix, vectorizer, suffix in (
        (X_title, tf1, "_title"),
        (X_tagline, tf2, "_tagline"),
        (X_overview, tf3, "_over"),
    )
]
# Reset the index so the rows line up positionally with the new frames.
df = pd.concat([df.reset_index(drop=True), *tfidf_frames], axis=1)

In [43]:
# The raw text columns are now represented by their TF-IDF columns;
# release_date is dropped along with them.
df.drop(columns=["original_title", "overview", "release_date", "tagline"], inplace=True)

In [44]:
# Persist the story feature table. to_csv defaults to index=True, so the row
# index is written as an unnamed first column.
# NOTE(review): pass index=False if downstream readers do not expect it.
df.to_csv("../data/interim/story.csv")

# Creating dataset for cast- and crew-based recommendation

In [23]:
# Start again from the untouched interim table; this rebinds df and discards
# the story features built above.
df = pd.read_csv("../data/interim/new_movies_full.csv")

In [24]:
# 0/1 columns for the 50 most frequent production companies.
seprate("production_companies", 50)

100%|██████████| 50/50 [00:00<00:00, 99.20it/s] 


In [25]:
# 0/1 columns for the 50 most frequent production countries.
seprate("production_countries", 50)

100%|██████████| 50/50 [00:00<00:00, 116.09it/s]


In [26]:
# 0/1 columns for the 200 most frequent cast names (genre is passed
# explicitly but is also the default parser).
seprate("cast", 200, genre)

100%|██████████| 200/200 [00:05<00:00, 39.19it/s]


In [27]:
# 0/1 columns for the 200 most frequent crew names. A name that already
# received a column from an earlier seprate() call (e.g. "cast") is
# overwritten, because seprate assigns df[name] directly.
seprate("crew", 200, genre)

100%|██████████| 200/200 [00:10<00:00, 18.28it/s]


In [None]:
# Persist the cast/crew feature table. to_csv defaults to index=True, so the
# row index is written as an unnamed first column.
# NOTE(review): pass index=False if downstream readers do not expect it.
df.to_csv("../data/interim/cast.csv")

# Creating dataset for scale-based recommendation

In [46]:
# Reload the untouched interim table for the scale dataset.
df = pd.read_csv("../data/interim/new_movies_full.csv")

In [47]:
# df is written out as loaded: no scale-specific processing is applied in
# this section. to_csv defaults to index=True, so the row index is added as
# an unnamed first column.
df.to_csv("../data/interim/scale.csv")

# Creating the dataset used by the frontend for Recommend

In [64]:
# Reload the untouched interim table for the frontend dataset.
df = pd.read_csv("../data/interim/new_movies_full.csv")

In [65]:
# Parse release_date once, then derive year, weekday name and month name.
release_dates = pd.to_datetime(df["release_date"])
df["release_date"] = release_dates
df["release_year"] = release_dates.dt.year
df["release_day"] = release_dates.dt.day_name()
df["release_month"] = release_dates.dt.month_name()

In [66]:
# 0/1 columns for at most 20 of the most frequent genres (default n).
seprate("genres")

100%|██████████| 19/19 [00:00<00:00, 174.73it/s]


In [67]:
# 0/1 columns for the 50 most frequent keywords, parsed with keywords().
seprate("keywords", 50, keywords)

100%|██████████| 50/50 [00:00<00:00, 136.37it/s]


In [68]:
# 0/1 columns for the 50 most frequent production companies.
seprate("production_companies", 50)

100%|██████████| 50/50 [00:00<00:00, 178.68it/s]


In [69]:
# 0/1 columns for the 50 most frequent production countries.
seprate("production_countries", 50)

100%|██████████| 50/50 [00:00<00:00, 176.85it/s]


In [70]:
# 0/1 columns for the 20 most frequent spoken languages.
seprate("spoken_languages", 20)

100%|██████████| 20/20 [00:00<00:00, 213.19it/s]


In [71]:
# 0/1 columns for the 50 most frequent cast names.
seprate("cast", 50, genre)

100%|██████████| 50/50 [00:00<00:00, 61.60it/s]


In [72]:
# 0/1 columns for the 50 most frequent crew names. A name that already
# received a column from an earlier seprate() call is overwritten, because
# seprate assigns df[name] directly.
seprate("crew", 50, genre)

100%|██████████| 50/50 [00:01<00:00, 31.22it/s]


In [None]:
# Persist the frontend table. to_csv defaults to index=True, so the row index
# is written as an unnamed first column.
# NOTE(review): pass index=False if the frontend does not expect it.
df.to_csv("../data/processed/Frontend.csv")

# Dataset to be used in backend

In [59]:
# Reload the untouched interim table for the backend dataset.
df = pd.read_csv("../data/interim/new_movies_full.csv")

In [60]:
# Parse release_date once and derive the release year from it.
release_dates = pd.to_datetime(df["release_date"])
df["release_date"] = release_dates
df["release_year"] = release_dates.dt.year

In [61]:
# Keep only the three columns written to backend.csv.
df = df[["release_year", "title", "poster_path"]]

In [62]:
# Persist the backend table. to_csv defaults to index=True, so the row index
# is written as an unnamed first column.
# NOTE(review): pass index=False if the backend does not expect it.
df.to_csv("../data/processed/backend.csv")